diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 69db59fb..b5bfe0f8 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -107,6 +107,10 @@ jobs: # Thus, the matrix.python is not required here, only for setting up builds python-version: "3.11" + - name: Install OpenMP runtime on macOS + if: startsWith(matrix.os, 'macos') + run: brew install libomp + - name: Build and Test Wheels env: # since cibuildwheel builds for all python versions >= current version, we need to @@ -190,4 +194,4 @@ jobs: token: ${{ secrets.CODECOV_TOKEN }} fail_ci_if_error: false files: ./build/coverage/coverage.xml - verbose: true \ No newline at end of file + verbose: true diff --git a/.github/workflows/pre-release.yml b/.github/workflows/pre-release.yml index cdca0227..9581e3c5 100644 --- a/.github/workflows/pre-release.yml +++ b/.github/workflows/pre-release.yml @@ -96,6 +96,10 @@ jobs: with: python-version: "3.11" + - name: Install OpenMP runtime on macOS + if: startsWith(matrix.os, 'macos') + run: brew install libomp + - name: Build and Test Wheels env: PRE_RELEASE_BUILD: "true" @@ -176,4 +180,4 @@ jobs: uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 142c34d8..166cee61 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -98,6 +98,10 @@ jobs: with: python-version: "3.11" + - name: Install OpenMP runtime on macOS + if: startsWith(matrix.os, 'macos') + run: brew install libomp + - name: Build and Test Wheels env: CIBW_BUILD: "cp${{ matrix.python }}-${{ matrix.platform_id }}" @@ -167,4 +171,4 @@ jobs: uses: pypa/gh-action-pypi-publish@release/v1 with: user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/test-release.yml b/.github/workflows/test-release.yml index bd2b9475..b7f79107 100644 --- a/.github/workflows/test-release.yml +++ b/.github/workflows/test-release.yml @@ -96,6 +96,10 @@ jobs: with: python-version: "3.11" + - name: Install OpenMP runtime on macOS + if: startsWith(matrix.os, 'macos') + run: brew install libomp + - name: Build and Test Wheels env: CIBW_BUILD: "cp${{ matrix.python }}-${{ matrix.platform_id }}" @@ -166,4 +170,4 @@ jobs: with: repository-url: https://test.pypi.org/legacy/ user: __token__ - password: ${{ secrets.TEST_PYPI_API_TOKEN }} \ No newline at end of file + password: ${{ secrets.TEST_PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore index 09e6a753..c9961e83 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ __pycache__/ # Cython build files src/pyqasm/accelerate/linalg.c +src/pyqasm/accelerate/sv_sim.c # Distribution / packaging .Python diff --git a/benchmarks/bench_evolved.json b/benchmarks/bench_evolved.json new file mode 100644 index 00000000..941d4b03 --- /dev/null +++ b/benchmarks/bench_evolved.json @@ -0,0 +1,183 @@ +{ + "benchmark": "statevector preprocessing no-cache comparison", + "baseline": "PyQASM sv branch before evolved preprocessing commit", + "candidate": "PyQASM sv branch with evolved no-cache preprocessing", + "n_repeats": 5, + "pyqasm_num_threads": 1, + "notes": [ + "No QASM string memoization is included in the candidate.", + "Both series use pre-unrolled QasmModule inputs, matching the branch benchmark convention." + ], + "series": { + "random": [ + { + "num_qubits": 4, + "depth": 20, + "pyqasm_sv_ms": 0.06092991679906845, + "evolved_no_cache_ms": 0.06546999793499708, + "speedup": 0.9306540204807044, + "correct": true + }, + { + "num_qubits": 6, + "depth": 40, + "pyqasm_sv_ms": 0.1580890966579318, + "evolved_no_cache_ms": 0.1151899341493845, + "speedup": 1.372421104546456, + "correct": true + }, + { + "num_qubits": 8, + "depth": 60, + "pyqasm_sv_ms": 0.16204908024519682, + "evolved_no_cache_ms": 0.14194007962942123, + "speedup": 1.1416724625509327, + "correct": true + }, + { + "num_qubits": 10, + "depth": 80, + "pyqasm_sv_ms": 0.24963996838778257, + "evolved_no_cache_ms": 0.21547905635088682, + "speedup": 1.1585347208002805, + "correct": true + }, + { + "num_qubits": 12, + "depth": 100, + "pyqasm_sv_ms": 0.458328053355217, + "evolved_no_cache_ms": 0.41990901809185743, + "speedup": 1.0914937131808757, + "correct": true + }, + { + "num_qubits": 14, + "depth": 150, + "pyqasm_sv_ms": 1.5235739992931485, + "evolved_no_cache_ms": 1.3638049131259322, + "speedup": 1.117149516495739, + "correct": true + }, + { + "num_qubits": 16, + "depth": 200, + "pyqasm_sv_ms": 7.1852849796414375, + "evolved_no_cache_ms": 5.937180016189814, + "speedup": 1.210218480835721, + "correct": true + }, + { + "num_qubits": 18, + "depth": 250, + "pyqasm_sv_ms": 35.15114798210561, + "evolved_no_cache_ms": 28.548280941322446, + "speedup": 1.231287728124666, + "correct": true + }, + { + "num_qubits": 20, + "depth": 300, + "pyqasm_sv_ms": 162.53185598179698, + "evolved_no_cache_ms": 126.72401999589056, + "speedup": 1.282565499319447, + "correct": true + }, + { + "num_qubits": 22, + "depth": 400, + "pyqasm_sv_ms": 1289.0826460206881, + "evolved_no_cache_ms": 1231.460615992546, + "speedup": 1.0467916141854843, + "correct": true + } + ], + "qft": [ + { + "num_qubits": 4, + "depth": "QFT", + "pyqasm_sv_ms": 0.20883907563984394, + "evolved_no_cache_ms": 0.16430008690804243, + "speedup": 1.2710831720784765, + "correct": true + }, + { + "num_qubits": 6, + "depth": "QFT", + "pyqasm_sv_ms": 0.4983479157090187, + "evolved_no_cache_ms": 0.315018929541111, + "speedup": 1.5819618091997316, + "correct": true + }, + { + "num_qubits": 8, + "depth": "QFT", + "pyqasm_sv_ms": 0.9089269442483783, + "evolved_no_cache_ms": 0.5902479169890285, + "speedup": 1.5399070764789728, + "correct": true + }, + { + "num_qubits": 10, + "depth": "QFT", + "pyqasm_sv_ms": 1.5004950109869242, + "evolved_no_cache_ms": 1.0477059986442327, + "speedup": 1.4321718238977499, + "correct": true + }, + { + "num_qubits": 12, + "depth": "QFT", + "pyqasm_sv_ms": 2.5123610394075513, + "evolved_no_cache_ms": 1.782613922841847, + "speedup": 1.409369133279482, + "correct": true + }, + { + "num_qubits": 14, + "depth": "QFT", + "pyqasm_sv_ms": 5.448751035146415, + "evolved_no_cache_ms": 4.14173596072942, + "speedup": 1.3155718005226993, + "correct": true + }, + { + "num_qubits": 16, + "depth": "QFT", + "pyqasm_sv_ms": 18.029507948085666, + "evolved_no_cache_ms": 15.524596092291176, + "speedup": 1.1613511772482323, + "correct": true + }, + { + "num_qubits": 18, + "depth": "QFT", + "pyqasm_sv_ms": 77.3717820411548, + "evolved_no_cache_ms": 68.92851099837571, + "speedup": 1.1224931587885028, + "correct": true + }, + { + "num_qubits": 20, + "depth": "QFT", + "pyqasm_sv_ms": 382.6392919290811, + "evolved_no_cache_ms": 349.38782698009163, + "speedup": 1.0951706452866319, + "correct": true + } + ] + }, + "summary": { + "random": { + "geomean_speedup": 1.1521318658620598, + "min_speedup": 0.9306540204807044, + "max_speedup": 1.372421104546456, + "all_correct": true + }, + "qft": { + "geomean_speedup": 1.3147632402156637, + "min_speedup": 1.0951706452866319, + "max_speedup": 1.5819618091997316, + "all_correct": true + } + } +} diff --git a/benchmarks/bench_qft.png b/benchmarks/bench_qft.png new file mode 100644 index 00000000..78d2eb64 Binary files /dev/null and b/benchmarks/bench_qft.png differ diff --git a/benchmarks/bench_qft_evolved.png b/benchmarks/bench_qft_evolved.png new file mode 100644 index 00000000..2dab0d29 Binary files /dev/null and b/benchmarks/bench_qft_evolved.png differ diff --git a/benchmarks/bench_random.png b/benchmarks/bench_random.png new file mode 100644 index 00000000..a783fa11 Binary files /dev/null and b/benchmarks/bench_random.png differ diff --git a/benchmarks/bench_random_evolved.png b/benchmarks/bench_random_evolved.png new file mode 100644 index 00000000..27228cc4 Binary files /dev/null and b/benchmarks/bench_random_evolved.png differ diff --git a/benchmarks/bench_simulator.py b/benchmarks/bench_simulator.py new file mode 100644 index 00000000..6ab4b3c3 --- /dev/null +++ b/benchmarks/bench_simulator.py @@ -0,0 +1,406 @@ +#!/usr/bin/env python3 +# Copyright 2025 qBraid +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Benchmark: PyQASM statevector simulator vs Qiskit Aer, Cirq, +and PennyLane Lightning. + +Each simulator builds circuits programmatically from a shared gate list so the +comparison is apples-to-apples. Timing covers only simulation (circuit +construction is excluded). + +Usage: + python benchmarks/bench_simulator.py +""" + +import math +import random +import time + +import numpy as np +from tabulate import tabulate + +# --------------------------------------------------------------------------- +# Shared circuit representation +# --------------------------------------------------------------------------- + +# Gate spec: (name, qubits_tuple, params_tuple) +# name: "h", "x", "y", "z", "s", "t", "rx", "ry", "rz", "cx", "cy", "cz", +# "swap", "crz" +# qubits_tuple: (target,) for 1q gates, (control, target) for 2q gates +# params_tuple: () or (angle,) + +SINGLE_GATE_NAMES = ["h", "x", "y", "z", "s", "t", "rx", "ry", "rz"] +TWO_GATE_NAMES = ["cx", "cy", "cz", "swap"] + + +def generate_random_gates(num_qubits: int, depth: int, seed: int = 42): + """Return (num_qubits, gate_list) for a random circuit.""" + rng = random.Random(seed) + gates = [] + for _ in range(depth): + if num_qubits >= 2 and rng.random() < 0.4: + name = rng.choice(TWO_GATE_NAMES) + q0, q1 = rng.sample(range(num_qubits), 2) + gates.append((name, (q0, q1), ())) + else: + name = rng.choice(SINGLE_GATE_NAMES) + q = rng.randint(0, num_qubits - 1) + if name == "rx": + gates.append((name, (q,), (1.0,))) + elif name == "ry": + gates.append((name, (q,), (0.5,))) + elif name == "rz": + gates.append((name, (q,), (0.3,))) + else: + gates.append((name, (q,), ())) + return num_qubits, gates + + +def generate_qft_gates(num_qubits: int): + """Return (num_qubits, gate_list) for a QFT circuit.""" + gates = [] + for i in range(num_qubits): + gates.append(("h", (i,), ())) + for j in range(i + 1, num_qubits): + k = j - i + angle = math.pi / (2**k) + gates.append(("crz", (j, i), (angle,))) + for i in range(num_qubits // 2): + gates.append(("swap", (i, num_qubits - 1 - i), ())) + return num_qubits, gates + + +# --------------------------------------------------------------------------- +# Gate-list → QASM 3 string (for PyQASM / Qiskit) +# --------------------------------------------------------------------------- + +def gates_to_qasm(num_qubits, gates): + lines = [ + "OPENQASM 3;", + 'include "stdgates.inc";', + f"qubit[{num_qubits}] q;", + ] + for name, qubits, params in gates: + if len(qubits) == 1: + q = qubits[0] + if params: + lines.append(f"{name}({params[0]}) q[{q}];") + else: + lines.append(f"{name} q[{q}];") + else: + q0, q1 = qubits + if params: + lines.append(f"{name}({params[0]}) q[{q0}], q[{q1}];") + else: + lines.append(f"{name} q[{q0}], q[{q1}];") + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Simulator wrappers +# --------------------------------------------------------------------------- + +N_REPEATS = 5 + + +def _median_time(fn, n_repeats=N_REPEATS): + """Run fn() n_repeats times, return (median_seconds, last_result).""" + times = [] + result = None + for _ in range(n_repeats): + start = time.perf_counter() + result = fn() + times.append(time.perf_counter() - start) + return float(np.median(times)), result + + +# -- PyQASM ------------------------------------------------------------------ + +def prepare_pyqasm(num_qubits, gates): + from pyqasm import loads as pyqasm_loads + from pyqasm.simulator.statevector import Simulator + + qasm = gates_to_qasm(num_qubits, gates) + module = pyqasm_loads(qasm) + module.unroll() + module.remove_idle_qubits() + sim = Simulator(seed=0) + return module, sim + + +def bench_pyqasm(module, sim): + def run(): + return sim.run(module, shots=0).final_statevector + return _median_time(run) + + +# -- Qiskit Aer --------------------------------------------------------------- + +def prepare_qiskit(num_qubits, gates): + from qiskit import transpile + from qiskit.qasm3 import loads as qiskit_loads + from qiskit_aer import AerSimulator + + qasm = gates_to_qasm(num_qubits, gates) + backend = AerSimulator(method="statevector") + circuit = qiskit_loads(qasm) + circuit.save_statevector() + compiled = transpile(circuit, backend, optimization_level=0) + return compiled, backend + + +def bench_qiskit(compiled, backend): + def run(): + job = backend.run(compiled) + result = job.result() + return np.asarray(result.get_statevector(compiled)) + return _median_time(run) + + +# -- Cirq ---------------------------------------------------------------------- + +def _build_cirq_circuit(num_qubits, gates): + import cirq + + qubits = cirq.LineQubit.range(num_qubits) + ops = [] + for name, qubit_indices, params in gates: + if name == "h": + ops.append(cirq.H(qubits[qubit_indices[0]])) + elif name == "x": + ops.append(cirq.X(qubits[qubit_indices[0]])) + elif name == "y": + ops.append(cirq.Y(qubits[qubit_indices[0]])) + elif name == "z": + ops.append(cirq.Z(qubits[qubit_indices[0]])) + elif name == "s": + ops.append(cirq.S(qubits[qubit_indices[0]])) + elif name == "t": + ops.append(cirq.T(qubits[qubit_indices[0]])) + elif name == "rx": + ops.append(cirq.rx(params[0])(qubits[qubit_indices[0]])) + elif name == "ry": + ops.append(cirq.ry(params[0])(qubits[qubit_indices[0]])) + elif name == "rz": + ops.append(cirq.rz(params[0])(qubits[qubit_indices[0]])) + elif name == "cx": + ops.append(cirq.CNOT(qubits[qubit_indices[0]], qubits[qubit_indices[1]])) + elif name == "cy": + ops.append( + cirq.ControlledGate(cirq.Y).on( + qubits[qubit_indices[0]], qubits[qubit_indices[1]] + ) + ) + elif name == "cz": + ops.append(cirq.CZ(qubits[qubit_indices[0]], qubits[qubit_indices[1]])) + elif name == "swap": + ops.append(cirq.SWAP(qubits[qubit_indices[0]], qubits[qubit_indices[1]])) + elif name == "crz": + ops.append( + cirq.ControlledGate(cirq.rz(params[0])).on( + qubits[qubit_indices[0]], qubits[qubit_indices[1]] + ) + ) + return cirq.Circuit(ops), qubits + + +def prepare_cirq(num_qubits, gates): + circuit, qubits = _build_cirq_circuit(num_qubits, gates) + import cirq + sim = cirq.Simulator(dtype=np.complex128) + return circuit, sim, qubits + + +def bench_cirq(circuit, sim, qubits): + def run(): + result = sim.simulate(circuit, qubit_order=qubits) + return result.final_state_vector + return _median_time(run) + + +# -- PennyLane Lightning ------------------------------------------------------- + +def _build_pennylane_fn(num_qubits, gates): + import pennylane as qml + + dev = qml.device("lightning.qubit", wires=num_qubits) + + gate_list = list(gates) # capture for closure + + @qml.qnode(dev) + def circuit(): + for name, qubit_indices, params in gate_list: + if name == "h": + qml.Hadamard(qubit_indices[0]) + elif name == "x": + qml.PauliX(qubit_indices[0]) + elif name == "y": + qml.PauliY(qubit_indices[0]) + elif name == "z": + qml.PauliZ(qubit_indices[0]) + elif name == "s": + qml.S(qubit_indices[0]) + elif name == "t": + qml.T(qubit_indices[0]) + elif name == "rx": + qml.RX(params[0], qubit_indices[0]) + elif name == "ry": + qml.RY(params[0], qubit_indices[0]) + elif name == "rz": + qml.RZ(params[0], qubit_indices[0]) + elif name == "cx": + qml.CNOT([qubit_indices[0], qubit_indices[1]]) + elif name == "cy": + qml.CY([qubit_indices[0], qubit_indices[1]]) + elif name == "cz": + qml.CZ([qubit_indices[0], qubit_indices[1]]) + elif name == "swap": + qml.SWAP([qubit_indices[0], qubit_indices[1]]) + elif name == "crz": + qml.CRZ(params[0], [qubit_indices[0], qubit_indices[1]]) + return qml.state() + + return circuit + + +def prepare_pennylane(num_qubits, gates): + circuit_fn = _build_pennylane_fn(num_qubits, gates) + return (circuit_fn,) + + +def bench_pennylane(circuit_fn): + def run(): + return np.asarray(circuit_fn()) + return _median_time(run) + + +# --------------------------------------------------------------------------- +# Qubit ordering helpers +# --------------------------------------------------------------------------- + +def _reverse_endian(sv, num_qubits): + """Convert big-endian statevector to little-endian (or vice versa).""" + return sv.reshape([2] * num_qubits).transpose(range(num_qubits - 1, -1, -1)).ravel() + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def run_benchmarks(circuit_name, configs, generator_fn): + """Run all simulators on a set of circuit configs.""" + headers = [ + "Qubits", "Depth", + "PyQASM (ms)", "Qiskit (ms)", "Cirq (ms)", "Lightning (ms)", + "Correct", + ] + rows = [] + + print(f"\n{'='*90}") + print(f" {circuit_name} circuits (median of {N_REPEATS} runs)") + print(f"{'='*90}") + + for config in configs: + if len(config) == 2: + num_qubits, depth = config + nq, gates = generator_fn(num_qubits, depth) + else: + num_qubits = config[0] + depth = None + nq, gates = generator_fn(num_qubits) + + depth_str = str(depth) if depth is not None else "QFT" + + # --- Prepare all simulators --- + pyqasm_args = prepare_pyqasm(nq, gates) + qiskit_args = prepare_qiskit(nq, gates) + cirq_args = prepare_cirq(nq, gates) + pl_args = prepare_pennylane(nq, gates) + + # --- Benchmark --- + t_pyqasm, sv_pyqasm = bench_pyqasm(*pyqasm_args) + t_qiskit, sv_qiskit = bench_qiskit(*qiskit_args) + t_cirq, sv_cirq = bench_cirq(*cirq_args) + t_pl, sv_pl = bench_pennylane(*pl_args) + + # --- Correctness: compare all against PyQASM (little-endian) --- + # Qiskit Aer is little-endian like PyQASM + # Cirq, PennyLane are big-endian → convert + sv_cirq_le = _reverse_endian(sv_cirq, nq) + sv_pl_le = _reverse_endian(sv_pl, nq) + + checks = { + "qiskit": np.allclose(sv_pyqasm, sv_qiskit, atol=1e-10), + "cirq": np.allclose(sv_pyqasm, sv_cirq_le, atol=1e-10), + "lightning": np.allclose(sv_pyqasm, sv_pl_le, atol=1e-10), + } + all_pass = all(checks.values()) + status = "PASS" if all_pass else "FAIL " + ",".join( + k for k, v in checks.items() if not v + ) + + rows.append([ + nq, depth_str, + f"{t_pyqasm*1000:.2f}", + f"{t_qiskit*1000:.2f}", + f"{t_cirq*1000:.2f}", + f"{t_pl*1000:.2f}", + status, + ]) + + print( + f" n={nq:2d} depth={depth_str:>4s} " + f"pyqasm={t_pyqasm*1000:8.2f} " + f"qiskit={t_qiskit*1000:8.2f} " + f"cirq={t_cirq*1000:8.2f} " + f"lightning={t_pl*1000:8.2f} ms " + f"{status}" + ) + + print() + print(tabulate(rows, headers=headers, tablefmt="github")) + return rows + + +def main(): + random_configs = [ + (4, 20), + (6, 40), + (8, 60), + (10, 80), + (12, 100), + (14, 150), + (16, 200), + (18, 250), + (20, 300), + (22, 400), + ] + + qft_configs = [ + (4,), + (6,), + (8,), + (10,), + (12,), + (14,), + (16,), + ] + + run_benchmarks("Random", random_configs, generate_random_gates) + run_benchmarks("QFT", qft_configs, generate_qft_gates) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/plot_benchmarks.py b/benchmarks/plot_benchmarks.py new file mode 100644 index 00000000..8f83a299 --- /dev/null +++ b/benchmarks/plot_benchmarks.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +# Copyright 2025 qBraid +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generate benchmark plots comparing PyQASM against other simulators. + +Usage: + python benchmarks/plot_benchmarks.py +""" + +import os +import sys + +import matplotlib.pyplot as plt +import numpy as np + +# Ensure we can import from the benchmarks directory +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from bench_simulator import ( + N_REPEATS, + bench_cirq, + bench_pennylane, + bench_pyqasm, + bench_qiskit, + generate_qft_gates, + generate_random_gates, + prepare_cirq, + prepare_pennylane, + prepare_pyqasm, + prepare_qiskit, +) + +RANDOM_CONFIGS = [ + (4, 20), + (6, 40), + (8, 60), + (10, 80), + (12, 100), + (14, 150), + (16, 200), + (18, 250), + (20, 300), + (22, 400), +] + +QFT_CONFIGS = [4, 6, 8, 10, 12, 14, 16, 18, 20] + +SIMULATORS = { + "PyQASM": (prepare_pyqasm, bench_pyqasm), + "Qiskit Aer": (prepare_qiskit, bench_qiskit), + "Cirq": (prepare_cirq, bench_cirq), + "PennyLane Lightning": (prepare_pennylane, bench_pennylane), +} + +COLORS = { + "PyQASM": "#2563eb", + "Qiskit Aer": "#16a34a", + "Cirq": "#eab308", + "PennyLane Lightning": "#dc2626", +} + +MARKERS = { + "PyQASM": "o", + "Qiskit Aer": "s", + "Cirq": "D", + "PennyLane Lightning": "^", +} + + +def collect_times(circuit_type, configs): + """Run benchmarks and return {simulator_name: ([qubits], [times_ms])}.""" + results = {name: ([], []) for name in SIMULATORS} + + for config in configs: + if circuit_type == "random": + nq, depth = config + num_qubits, gates = generate_random_gates(nq, depth) + label = f"n={nq}, d={depth}" + else: + nq = config + num_qubits, gates = generate_qft_gates(nq) + label = f"QFT n={nq}" + + print(f" {label} ...", end="", flush=True) + for name, (prepare_fn, bench_fn) in SIMULATORS.items(): + args = prepare_fn(num_qubits, gates) + t, _ = bench_fn(*args) + results[name][0].append(num_qubits) + results[name][1].append(t * 1000) # ms + print(f" {name}={t*1000:.1f}ms", end="", flush=True) + print() + + return results + + +def plot_comparison(results, title, filename): + """Create a log-scale line plot comparing simulators.""" + fig, ax = plt.subplots(figsize=(10, 6)) + + for name in SIMULATORS: + qubits, times = results[name] + ax.plot( + qubits, times, + marker=MARKERS[name], + color=COLORS[name], + linewidth=2, + markersize=7, + label=name, + ) + + ax.set_yscale("log") + ax.set_xlabel("Number of Qubits", fontsize=13) + ax.set_ylabel("Simulation Time (ms)", fontsize=13) + ax.set_title(title, fontsize=15) + ax.legend(fontsize=11) + ax.grid(True, which="both", ls="--", alpha=0.4) + ax.set_xticks(results["PyQASM"][0]) + fig.tight_layout() + out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename) + fig.savefig(out_path, dpi=150) + print(f" Saved: {out_path}") + plt.close(fig) + + +def main(): + print(f"Running benchmarks (median of {N_REPEATS} runs each)\n") + + print("Random circuits:") + random_results = collect_times("random", RANDOM_CONFIGS) + plot_comparison( + random_results, + "Statevector Simulation Time — Random Circuits", + "bench_random.png", + ) + + print("\nQFT circuits:") + qft_results = collect_times("qft", QFT_CONFIGS) + plot_comparison( + qft_results, + "Statevector Simulation Time — QFT Circuits", + "bench_qft.png", + ) + + print("\nDone.") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/profile_simulator.py b/benchmarks/profile_simulator.py new file mode 100644 index 00000000..2bbb1a91 --- /dev/null +++ b/benchmarks/profile_simulator.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +# Copyright 2025 qBraid +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Profiling script for PyQASM statevector simulator. + +Three modes: + 1. cProfile function-level profiling + 2. Section timing (preprocess vs simulation vs post-processing) + 3. Per-kernel timing (accumulate time per opcode type) + +Usage: + python benchmarks/profile_simulator.py [--mode cprofile|section|kernel] [--qubits N] [--depth D] +""" + +import argparse +import cProfile +import pstats +import random +import time +from io import StringIO + +import numpy as np + +from pyqasm import loads as pyqasm_loads +from pyqasm.simulator.statevector import ( + Simulator, + _OP_CONTROLLED, + _OP_CTRL_DIAGONAL, + _OP_DIAGONAL, + _OP_SINGLE, + _OP_TWO_QUBIT, + _fuse_gates, + _preprocess, +) +from pyqasm.accelerate.sv_sim import ( + apply_circuit, + apply_controlled_diagonal_gate, + apply_controlled_gate, + apply_diagonal_gate, + apply_single_qubit_gate, + apply_two_qubit_gate, +) + + +SINGLE_QUBIT_GATES = ["h", "x", "y", "z", "s", "t", "rx(1.0)", "ry(0.5)", "rz(0.3)"] +TWO_QUBIT_GATES = ["cx", "cy", "cz", "swap"] + +OP_NAMES = { + _OP_SINGLE: "single", + _OP_CONTROLLED: "controlled", + _OP_DIAGONAL: "diagonal", + _OP_CTRL_DIAGONAL: "ctrl_diag", + _OP_TWO_QUBIT: "two_qubit", +} + + +def generate_random_qasm(num_qubits: int, depth: int, seed: int = 42) -> str: + rng = random.Random(seed) + lines = [ + "OPENQASM 3;", + 'include "stdgates.inc";', + f"qubit[{num_qubits}] q;", + ] + for _ in range(depth): + if num_qubits >= 2 and rng.random() < 0.4: + gate = rng.choice(TWO_QUBIT_GATES) + q0, q1 = rng.sample(range(num_qubits), 2) + lines.append(f"{gate} q[{q0}], q[{q1}];") + else: + gate = rng.choice(SINGLE_QUBIT_GATES) + q = rng.randint(0, num_qubits - 1) + lines.append(f"{gate} q[{q}];") + return "\n".join(lines) + + +def mode_cprofile(num_qubits, depth): + """Function-level profiling with cProfile.""" + qasm = generate_random_qasm(num_qubits, depth) + sim = Simulator(seed=0) + module = pyqasm_loads(qasm) + + print(f"\n=== cProfile: {num_qubits} qubits, depth {depth} ===\n") + pr = cProfile.Profile() + pr.enable() + sim.run(module, shots=0) + pr.disable() + + s = StringIO() + ps = pstats.Stats(pr, stream=s).sort_stats("tottime") + ps.print_stats(30) + print(s.getvalue()) + + +def mode_section(num_qubits, depth): + """Section timing: preprocess vs simulation vs post-processing.""" + qasm = generate_random_qasm(num_qubits, depth) + module = pyqasm_loads(qasm) + module.unroll() + module.remove_idle_qubits() + nq = module.num_qubits + + print(f"\n=== Section timing: {num_qubits} qubits, depth {depth} ===\n") + + # Preprocess + t0 = time.perf_counter_ns() + n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = \ + _preprocess(module, nq) + t1 = time.perf_counter_ns() + + # Fusion + if n > 0: + n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = \ + _fuse_gates(n, opcodes, targets, controls, gate_params, diag_phases, + tq_offsets, tq_gates) + t2 = time.perf_counter_ns() + + # Simulation + sv = np.zeros(2**nq, dtype=np.complex128) + sv[0] = 1.0 + if n > 0: + apply_circuit(sv, nq, opcodes, targets, controls, gate_params, diag_phases, + tq_offsets, tq_gates, n) + t3 = time.perf_counter_ns() + + # Post-processing + probabilities = np.abs(sv) ** 2 + t4 = time.perf_counter_ns() + + pre_us = (t1 - t0) / 1000 + fuse_us = (t2 - t1) / 1000 + sim_us = (t3 - t2) / 1000 + post_us = (t4 - t3) / 1000 + total_us = (t4 - t0) / 1000 + + print(f" Instructions: {n}") + print(f" Preprocess: {pre_us:10.1f} us ({pre_us/total_us*100:5.1f}%)") + print(f" Gate fusion: {fuse_us:10.1f} us ({fuse_us/total_us*100:5.1f}%)") + print(f" Simulation: {sim_us:10.1f} us ({sim_us/total_us*100:5.1f}%)") + print(f" Post-process: {post_us:10.1f} us ({post_us/total_us*100:5.1f}%)") + print(f" Total: {total_us:10.1f} us") + + +def mode_kernel(num_qubits, depth): + """Per-kernel timing: accumulate time per opcode type.""" + qasm = generate_random_qasm(num_qubits, depth) + module = pyqasm_loads(qasm) + module.unroll() + module.remove_idle_qubits() + nq = module.num_qubits + + n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = \ + _preprocess(module, nq) + if n > 0: + n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = \ + _fuse_gates(n, opcodes, targets, controls, gate_params, diag_phases, + tq_offsets, tq_gates) + + sv = np.zeros(2**nq, dtype=np.complex128) + sv[0] = 1.0 + + kernel_times = {} + kernel_counts = {} + + print(f"\n=== Per-kernel timing: {num_qubits} qubits, depth {depth} ===\n") + + for i in range(n): + op = int(opcodes[i]) + tgt = int(targets[i]) + ctrl = int(controls[i]) + gp_off = i * 4 + dp_off = i * 2 + + t0 = time.perf_counter_ns() + if op == _OP_SINGLE: + flat = gate_params[gp_off:gp_off + 4].copy() + apply_single_qubit_gate(sv, nq, tgt, flat) + elif op == _OP_CONTROLLED: + flat = gate_params[gp_off:gp_off + 4].copy() + apply_controlled_gate(sv, nq, ctrl, tgt, flat) + elif op == _OP_DIAGONAL: + apply_diagonal_gate(sv, nq, tgt, diag_phases[dp_off], diag_phases[dp_off + 1]) + elif op == _OP_CTRL_DIAGONAL: + apply_controlled_diagonal_gate(sv, nq, ctrl, tgt, diag_phases[dp_off]) + elif op == _OP_TWO_QUBIT: + tq_off = int(tq_offsets[i]) + flat = tq_gates[tq_off:tq_off + 16].copy() + apply_two_qubit_gate(sv, nq, ctrl, tgt, flat) + t1 = time.perf_counter_ns() + + name = OP_NAMES.get(op, f"op_{op}") + kernel_times[name] = kernel_times.get(name, 0) + (t1 - t0) + kernel_counts[name] = kernel_counts.get(name, 0) + 1 + + total_ns = sum(kernel_times.values()) + for name in sorted(kernel_times.keys()): + t_us = kernel_times[name] / 1000 + cnt = kernel_counts[name] + pct = kernel_times[name] / total_ns * 100 if total_ns > 0 else 0 + print(f" {name:15s}: {t_us:10.1f} us ({pct:5.1f}%) count={cnt}") + print(f" {'total':15s}: {total_ns/1000:10.1f} us") + + +def main(): + parser = argparse.ArgumentParser(description="Profile PyQASM simulator") + parser.add_argument("--mode", choices=["cprofile", "section", "kernel"], default="section") + parser.add_argument("--qubits", type=int, default=16) + parser.add_argument("--depth", type=int, default=200) + args = parser.parse_args() + + if args.mode == "cprofile": + mode_cprofile(args.qubits, args.depth) + elif args.mode == "section": + mode_section(args.qubits, args.depth) + elif args.mode == "kernel": + mode_kernel(args.qubits, args.depth) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index a70e0c38..e9a3303b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,9 +42,20 @@ Homepage = "https://docs.qbraid.com/v2/pyqasm" [project.optional-dependencies] cli = ["typer>=0.12.1", "rich>=10.11.0", "typing-extensions"] -test = ["pytest", "pytest-cov", "pytest-mpl", "pillow<12.3.0", "matplotlib", "tabulate"] +test = [ + "pytest", + "pytest-cov", + "pytest-mpl", + "pillow<12.3.0", + "matplotlib", + "tabulate", + "qiskit", + "qiskit-aer", + "qiskit-qasm3-import", +] lint = ["black", "isort>=6.0.0", "pylint", "mypy", "qbraid-cli>=0.10.2"] docs = ["sphinx>=7.3.7,<8.3.0", "sphinx-autodoc-typehints>=1.24,<3.2", "sphinx-rtd-theme>=2.0.0,<4.0.0", "docutils<0.23", "sphinx-copybutton"] +simulation = ["numba>=0.59"] visualization = ["pillow<12.3.0", "matplotlib", "tabulate"] pulse = ["openpulse>=1.0.1"] @@ -53,9 +64,6 @@ pyqasm = ["py.typed", "*.pyx"] [tool.setuptools] packages = {find = {where = ["src"]}} -ext-modules = [ - {name = "pyqasm.accelerate.linalg", sources = ["src/pyqasm/accelerate/linalg.pyx"], include-dirs = ["numpy"]} -] [project.scripts] pyqasm = "pyqasm.cli.main:app" @@ -75,6 +83,9 @@ line_length = 100 [tool.pytest.ini_options] addopts = "-ra" testpaths = ["tests"] +markers = [ + "benchmark: performance regression tests (may be slow)", +] [tool.coverage.run] source = ["pyqasm"] diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..f8621a39 --- /dev/null +++ b/setup.py @@ -0,0 +1,84 @@ +"""Build configuration for Cython extensions with optional OpenMP support.""" + +import os +import platform +import subprocess +import sys +import tempfile + +import numpy as np +from Cython.Build import cythonize +from setuptools import Extension, setup + +BASE_COMPILE_ARGS = ["-O3", "-ffast-math", "-march=native"] +BASE_LINK_ARGS = [] + + +def _check_openmp(): + """Detect OpenMP availability and return (compile_args, link_args).""" + test_code = b"#include \nint main() { return omp_get_max_threads(); }\n" + + candidates = [] + if sys.platform == "darwin": + # macOS: try libomp from Homebrew + for prefix in ["/opt/homebrew/opt/libomp", "/usr/local/opt/libomp"]: + if os.path.isdir(prefix): + candidates.append(( + ["-Xpreprocessor", "-fopenmp", f"-I{prefix}/include"], + [f"-L{prefix}/lib", "-lomp"], + )) + # Also try Xcode / system clang with -fopenmp + candidates.append((["-fopenmp"], ["-fopenmp"])) + else: + # Linux / other: standard -fopenmp + candidates.append((["-fopenmp"], ["-fopenmp"])) + + for cflags, ldflags in candidates: + try: + with tempfile.NamedTemporaryFile(suffix=".c", delete=False) as f: + f.write(test_code) + f.flush() + cmd = ["cc"] + cflags + ldflags + [f.name, "-o", f.name + ".out"] + subprocess.check_call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + os.unlink(f.name + ".out") + os.unlink(f.name) + return cflags, ldflags + except (subprocess.CalledProcessError, FileNotFoundError, OSError): + try: + os.unlink(f.name) + except OSError: + pass + continue + + return [], [] + + +omp_cflags, omp_ldflags = _check_openmp() +if omp_cflags: + print(f"OpenMP detected: compile={omp_cflags}, link={omp_ldflags}") +else: + print("OpenMP not available — building single-threaded kernels") + +sv_sim_compile_args = BASE_COMPILE_ARGS + omp_cflags +sv_sim_link_args = BASE_LINK_ARGS + omp_ldflags + +extensions = [ + Extension( + "pyqasm.accelerate.linalg", + sources=["src/pyqasm/accelerate/linalg.pyx"], + include_dirs=[np.get_include()], + extra_compile_args=BASE_COMPILE_ARGS, + extra_link_args=BASE_LINK_ARGS, + ), + Extension( + "pyqasm.accelerate.sv_sim", + sources=["src/pyqasm/accelerate/sv_sim.pyx"], + include_dirs=[np.get_include()], + extra_compile_args=sv_sim_compile_args, + extra_link_args=sv_sim_link_args, + ), +] + +setup( + ext_modules=cythonize(extensions, language_level=3), +) diff --git a/src/pyqasm/accelerate/sv_sim.pyx b/src/pyqasm/accelerate/sv_sim.pyx new file mode 100644 index 00000000..2ce338ad --- /dev/null +++ b/src/pyqasm/accelerate/sv_sim.pyx @@ -0,0 +1,407 @@ +# Copyright 2025 qBraid +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# cython: language_level=3 +# cython: infer_types=True + +import os + +import cython +import numpy as np + +from cython.parallel cimport prange +from openmp cimport omp_get_max_threads + +# Threshold: parallelize when statevector has >= this many pairs +# 2^17 = 131072 pairs corresponds to ~18 qubits (4 MB statevector) +# Below this, OpenMP thread overhead dominates over parallel gains. +DEF PARALLEL_THRESHOLD = 131072 + +# Module-level flag: parallelism is opt-in via PYQASM_NUM_THREADS env var. +# Default is serial (1 thread) to avoid conflicts with other OpenMP runtimes +# (e.g. Qiskit Aer) in the same process. Set PYQASM_NUM_THREADS>1 to enable. +cdef int _max_threads = int(os.environ.get("PYQASM_NUM_THREADS", "1")) +if _max_threads > 1: + from openmp cimport omp_set_num_threads + omp_set_num_threads(_max_threads) +else: + from openmp cimport omp_set_num_threads + omp_set_num_threads(1) + +cdef bint _use_threads() noexcept nogil: + """Check if multithreading is enabled.""" + return omp_get_max_threads() > 1 + + +# --- cdef kernel functions --- +# Each kernel manages its own GIL release: prange paths use prange's +# built-in nogil, serial paths use explicit `with nogil:` blocks. + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cdivision(True) +cdef void _apply_single_qubit_gate( + double complex* sv, + Py_ssize_t num_qubits, + Py_ssize_t target, + double complex g00, + double complex g01, + double complex g10, + double complex g11, +) noexcept: + cdef Py_ssize_t half = 1 << (num_qubits - 1) + cdef Py_ssize_t step = 1 << target + cdef Py_ssize_t s, i0, i1 + cdef double complex a0, a1 + + if half >= PARALLEL_THRESHOLD and _use_threads(): + for s in prange(half, schedule='static', nogil=True): + i0 = ((s >> target) << (target + 1)) | (s & (step - 1)) + i1 = i0 | step + a0 = sv[i0] + a1 = sv[i1] + sv[i0] = g00 * a0 + g01 * a1 + sv[i1] = g10 * a0 + g11 * a1 + else: + with nogil: + for s in range(half): + i0 = ((s >> target) << (target + 1)) | (s & (step - 1)) + i1 = i0 | step + a0 = sv[i0] + a1 = sv[i1] + sv[i0] = g00 * a0 + g01 * a1 + sv[i1] = g10 * a0 + g11 * a1 + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cdivision(True) +cdef void _apply_controlled_gate( + double complex* sv, + Py_ssize_t num_qubits, + Py_ssize_t control, + Py_ssize_t target, + double complex g00, + double complex g01, + double complex g10, + double complex g11, +) noexcept: + cdef Py_ssize_t quarter = 1 << (num_qubits - 2) + cdef Py_ssize_t lo, hi + cdef Py_ssize_t lo_mask, hi_limit + cdef Py_ssize_t s, base, i0, i1 + cdef double complex a0, a1 + cdef Py_ssize_t ctrl_bit = 1 << control + cdef Py_ssize_t tgt_bit = 1 << target + + if control < target: + lo = control + hi = target + else: + lo = target + hi = control + + lo_mask = (1 << lo) - 1 + hi_limit = (1 << hi) - 1 + + if quarter >= PARALLEL_THRESHOLD and _use_threads(): + for s in prange(quarter, schedule='static', nogil=True): + base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1)) + base = base | ctrl_bit + i0 = base + i1 = base | tgt_bit + a0 = sv[i0] + a1 = sv[i1] + sv[i0] = g00 * a0 + g01 * a1 + sv[i1] = g10 * a0 + g11 * a1 + else: + with nogil: + for s in range(quarter): + base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1)) + base = base | ctrl_bit + i0 = base + i1 = base | tgt_bit + a0 = sv[i0] + a1 = sv[i1] + sv[i0] = g00 * a0 + g01 * a1 + sv[i1] = g10 * a0 + g11 * a1 + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cdivision(True) +cdef void _apply_diagonal_gate( + double complex* sv, + Py_ssize_t num_qubits, + Py_ssize_t target, + double complex phase0, + double complex phase1, +) noexcept: + cdef Py_ssize_t half = 1 << (num_qubits - 1) + cdef Py_ssize_t step = 1 << target + cdef Py_ssize_t s, i0, i1 + + if half >= PARALLEL_THRESHOLD and _use_threads(): + for s in prange(half, schedule='static', nogil=True): + i0 = ((s >> target) << (target + 1)) | (s & (step - 1)) + i1 = i0 | step + sv[i0] = phase0 * sv[i0] + sv[i1] = phase1 * sv[i1] + else: + with nogil: + for s in range(half): + i0 = ((s >> target) << (target + 1)) | (s & (step - 1)) + i1 = i0 | step + sv[i0] = phase0 * sv[i0] + sv[i1] = phase1 * sv[i1] + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cdivision(True) +cdef void _apply_controlled_diagonal_gate( + double complex* sv, + Py_ssize_t num_qubits, + Py_ssize_t control, + Py_ssize_t target, + double complex phase, +) noexcept: + cdef Py_ssize_t quarter = 1 << (num_qubits - 2) + cdef Py_ssize_t lo, hi + cdef Py_ssize_t lo_mask, hi_limit + cdef Py_ssize_t s, base, idx + cdef Py_ssize_t ctrl_bit = 1 << control + cdef Py_ssize_t tgt_bit = 1 << target + + if control < target: + lo = control + hi = target + else: + lo = target + hi = control + + lo_mask = (1 << lo) - 1 + hi_limit = (1 << hi) - 1 + + if quarter >= PARALLEL_THRESHOLD and _use_threads(): + for s in prange(quarter, schedule='static', nogil=True): + base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1)) + idx = base | ctrl_bit | tgt_bit + sv[idx] = phase * sv[idx] + else: + with nogil: + for s in range(quarter): + base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1)) + idx = base | ctrl_bit | tgt_bit + sv[idx] = phase * sv[idx] + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cdivision(True) +cdef void _apply_two_qubit_gate( + double complex* sv, + Py_ssize_t num_qubits, + Py_ssize_t qubit0, + Py_ssize_t qubit1, + double complex* gate, +) noexcept: + cdef Py_ssize_t quarter = 1 << (num_qubits - 2) + cdef Py_ssize_t lo, hi + cdef Py_ssize_t lo_mask, hi_limit + cdef Py_ssize_t s, base, i00, i01, i10, i11 + cdef double complex a00, a01, a10, a11 + cdef Py_ssize_t q0_bit = 1 << qubit0 + cdef Py_ssize_t q1_bit = 1 << qubit1 + cdef double complex g0 = gate[0], g1 = gate[1], g2 = gate[2], g3 = gate[3] + cdef double complex g4 = gate[4], g5 = gate[5], g6 = gate[6], g7 = gate[7] + cdef double complex g8 = gate[8], g9 = gate[9], g10 = gate[10], g11_ = gate[11] + cdef double complex g12 = gate[12], g13 = gate[13], g14 = gate[14], g15 = gate[15] + + if qubit0 < qubit1: + lo = qubit0 + hi = qubit1 + else: + lo = qubit1 + hi = qubit0 + + lo_mask = (1 << lo) - 1 + hi_limit = (1 << hi) - 1 + + if quarter >= PARALLEL_THRESHOLD and _use_threads(): + for s in prange(quarter, schedule='static', nogil=True): + base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1)) + i00 = base + i01 = base | q0_bit + i10 = base | q1_bit + i11 = base | q0_bit | q1_bit + a00 = sv[i00] + a01 = sv[i01] + a10 = sv[i10] + a11 = sv[i11] + sv[i00] = g0 * a00 + g1 * a01 + g2 * a10 + g3 * a11 + sv[i01] = g4 * a00 + g5 * a01 + g6 * a10 + g7 * a11 + sv[i10] = g8 * a00 + g9 * a01 + g10 * a10 + g11_ * a11 + sv[i11] = g12 * a00 + g13 * a01 + g14 * a10 + g15 * a11 + else: + with nogil: + for s in range(quarter): + base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1)) + i00 = base + i01 = base | q0_bit + i10 = base | q1_bit + i11 = base | q0_bit | q1_bit + a00 = sv[i00] + a01 = sv[i01] + a10 = sv[i10] + a11 = sv[i11] + sv[i00] = g0 * a00 + g1 * a01 + g2 * a10 + g3 * a11 + sv[i01] = g4 * a00 + g5 * a01 + g6 * a10 + g7 * a11 + sv[i10] = g8 * a00 + g9 * a01 + g10 * a10 + g11_ * a11 + sv[i11] = g12 * a00 + g13 * a01 + g14 * a10 + g15 * a11 + + +# --- cpdef wrappers for Python API --- + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef void apply_single_qubit_gate( + double complex[::1] sv, + Py_ssize_t num_qubits, + Py_ssize_t target, + double complex[::1] gate, +) noexcept: + cdef double complex* sv_ptr = &sv[0] + _apply_single_qubit_gate(sv_ptr, num_qubits, target, gate[0], gate[1], gate[2], gate[3]) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef void apply_controlled_gate( + double complex[::1] sv, + Py_ssize_t num_qubits, + Py_ssize_t control, + Py_ssize_t target, + double complex[::1] gate, +) noexcept: + cdef double complex* sv_ptr = &sv[0] + _apply_controlled_gate(sv_ptr, num_qubits, control, target, gate[0], gate[1], gate[2], gate[3]) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef void apply_diagonal_gate( + double complex[::1] sv, + Py_ssize_t num_qubits, + Py_ssize_t target, + double complex phase0, + double complex phase1, +) noexcept: + cdef double complex* sv_ptr = &sv[0] + _apply_diagonal_gate(sv_ptr, num_qubits, target, phase0, phase1) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef void apply_controlled_diagonal_gate( + double complex[::1] sv, + Py_ssize_t num_qubits, + Py_ssize_t control, + Py_ssize_t target, + double complex phase, +) noexcept: + cdef double complex* sv_ptr = &sv[0] + _apply_controlled_diagonal_gate(sv_ptr, num_qubits, control, target, phase) + + +@cython.boundscheck(False) +@cython.wraparound(False) +cpdef void apply_two_qubit_gate( + double complex[::1] sv, + Py_ssize_t num_qubits, + Py_ssize_t qubit0, + Py_ssize_t qubit1, + double complex[::1] gate, +) noexcept: + cdef double complex* sv_ptr = &sv[0] + cdef double complex* gate_ptr = &gate[0] + _apply_two_qubit_gate(sv_ptr, num_qubits, qubit0, qubit1, gate_ptr) + + +# --- Batch dispatch: single Python->C crossing for entire circuit --- + +DEF OP_SINGLE = 0 +DEF OP_CONTROLLED = 1 +DEF OP_DIAGONAL = 2 +DEF OP_CTRL_DIAGONAL = 3 +DEF OP_TWO_QUBIT = 4 + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cdivision(True) +cpdef void apply_circuit( + double complex[::1] sv, + Py_ssize_t num_qubits, + int[::1] opcodes, + int[::1] targets, + int[::1] controls, + double complex[::1] gate_params, + double complex[::1] diag_phases, + int[::1] two_qubit_offsets, + double complex[::1] two_qubit_gates, + Py_ssize_t n_instructions, +) noexcept: + """Execute an entire circuit. Each gate releases the GIL internally.""" + cdef Py_ssize_t i, gp_offset, dp_offset, tq_offset + cdef int op, tgt, ctrl + cdef double complex* sv_ptr = &sv[0] + cdef double complex* gp_ptr = &gate_params[0] + cdef double complex* dp_ptr = &diag_phases[0] + cdef double complex* tq_ptr = &two_qubit_gates[0] + + for i in range(n_instructions): + op = opcodes[i] + tgt = targets[i] + ctrl = controls[i] + gp_offset = i * 4 + dp_offset = i * 2 + + if op == OP_SINGLE: + _apply_single_qubit_gate( + sv_ptr, num_qubits, tgt, + gp_ptr[gp_offset], gp_ptr[gp_offset + 1], + gp_ptr[gp_offset + 2], gp_ptr[gp_offset + 3], + ) + elif op == OP_CONTROLLED: + _apply_controlled_gate( + sv_ptr, num_qubits, ctrl, tgt, + gp_ptr[gp_offset], gp_ptr[gp_offset + 1], + gp_ptr[gp_offset + 2], gp_ptr[gp_offset + 3], + ) + elif op == OP_DIAGONAL: + _apply_diagonal_gate( + sv_ptr, num_qubits, tgt, + dp_ptr[dp_offset], dp_ptr[dp_offset + 1], + ) + elif op == OP_CTRL_DIAGONAL: + _apply_controlled_diagonal_gate( + sv_ptr, num_qubits, ctrl, tgt, + dp_ptr[dp_offset], + ) + elif op == OP_TWO_QUBIT: + tq_offset = two_qubit_offsets[i] + _apply_two_qubit_gate( + sv_ptr, num_qubits, ctrl, tgt, + &tq_ptr[tq_offset], + ) diff --git a/src/pyqasm/modules/base.py b/src/pyqasm/modules/base.py index 3b21330a..b5317ba2 100644 --- a/src/pyqasm/modules/base.py +++ b/src/pyqasm/modules/base.py @@ -607,6 +607,10 @@ def unroll(self, **kwargs): if not kwargs: kwargs = {} + # Cache: skip if already unrolled (no kwargs means default unroll) + if not kwargs and len(self._unrolled_ast.statements) > 0: + return + try: self.num_qubits, self.num_clbits = 0, 0 if ext_gates := kwargs.get("external_gates"): diff --git a/src/pyqasm/simulator/__init__.py b/src/pyqasm/simulator/__init__.py new file mode 100644 index 00000000..a1d56354 --- /dev/null +++ b/src/pyqasm/simulator/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2026 qBraid +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Module containing the PyQASM statevector simulator. + +.. currentmodule:: pyqasm.simulator + +Classes +--------- + +.. autosummary:: + :toctree: ../stubs/ + + Simulator + SimulatorResult + +""" + +from .statevector import Simulator, SimulatorResult + +__all__ = ["Simulator", "SimulatorResult"] diff --git a/src/pyqasm/simulator/statevector.py b/src/pyqasm/simulator/statevector.py new file mode 100644 index 00000000..e9ea74d9 --- /dev/null +++ b/src/pyqasm/simulator/statevector.py @@ -0,0 +1,589 @@ +# Copyright 2025 qBraid +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Statevector simulator for PyQASM. + +""" + +# pylint: disable=no-name-in-module,too-many-return-statements +# pylint: disable=too-many-locals,too-many-branches,too-many-statements +# pylint: disable=arguments-out-of-order,unused-argument + +from collections import Counter +from dataclasses import dataclass + +import numpy as np +from openqasm3.ast import ( + BinaryExpression, + BinaryOperator, + FloatLiteral, + Identifier, + IntegerLiteral, + QuantumGate, + UnaryExpression, + UnaryOperator, +) + +from pyqasm import loads +from pyqasm.accelerate.sv_sim import apply_circuit # type: ignore[import-not-found] +from pyqasm.modules.base import QasmModule + +try: + import numba as nb # type: ignore[import-not-found] +except ImportError: # pragma: no cover - performance-only optional dependency + + class _NumbaCompat: + """Fallback decorator shim when numba is not installed.""" + + @staticmethod + def njit(*args, **kwargs): # noqa: ARG004 + if args and callable(args[0]): + return args[0] + + def decorator(fn): + return fn + + return decorator + + nb = _NumbaCompat() + + +_COMPLEX_ZERO = 0j +_PI = np.pi +_E = np.e +_T_PHASE = np.exp(1j * _PI / 4) +_TDG_PHASE = np.exp(-1j * _PI / 4) + + +@nb.njit(cache=True, nogil=True, fastmath=True) +def _rz_phases(theta: float) -> tuple[complex, complex]: + """Parameterized Rz gate as diagonal phases.""" + half_theta = theta / 2 + return np.exp(-1j * half_theta), np.exp(1j * half_theta) + + +@nb.njit(cache=True, nogil=True, fastmath=True) +def ry(theta: float) -> np.ndarray: + """Parameterized Ry gate.""" + c, s = np.cos(theta / 2), np.sin(theta / 2) + mat = np.empty((2, 2), dtype=np.complex128) + mat[0, 0] = c + mat[0, 1] = -s + mat[1, 0] = s + mat[1, 1] = c + return mat + + +@nb.njit(cache=True, nogil=True, fastmath=True) +def rx(theta: float) -> np.ndarray: + """Parameterized Rx gate.""" + c, s = np.cos(theta / 2), np.sin(theta / 2) + mat = np.empty((2, 2), dtype=np.complex128) + mat[0, 0] = c + mat[0, 1] = -1j * s + mat[1, 0] = -1j * s + mat[1, 1] = c + return mat + + +def rz(theta: float) -> np.ndarray: + """Parameterized Rz gate.""" + phase0, phase1 = _rz_phases(theta) + return _diag_to_matrix(phase0, phase1) + + +@nb.njit(cache=True, nogil=True, fastmath=True) +def u3(theta: float, phi: float, lam: float) -> np.ndarray: + """Parameterized U3 gate (generic single-qubit rotation).""" + c, s = np.cos(theta / 2), np.sin(theta / 2) + exp_phi = np.exp(1j * phi) + exp_lam = np.exp(1j * lam) + exp_philam = np.exp(1j * (phi + lam)) + mat = np.empty((2, 2), dtype=np.complex128) + mat[0, 0] = c + mat[0, 1] = -exp_lam * s + mat[1, 0] = exp_phi * s + mat[1, 1] = exp_philam * c + return mat + + +@nb.njit(cache=True, nogil=True, fastmath=True) +def _diag_to_matrix(phase0: complex, phase1: complex) -> np.ndarray: + """Convert diagonal phases to a 2x2 matrix.""" + mat = np.zeros((2, 2), dtype=np.complex128) + mat[0, 0] = phase0 + mat[1, 1] = phase1 + return mat + + +@nb.njit(cache=True, nogil=True, fastmath=True) +def _is_diagonal_matrix(mat: np.ndarray, tol: float = 1e-10) -> bool: + """Return whether a 2x2 matrix is diagonal within tolerance.""" + return (np.abs(mat[0, 1]) < tol) and (np.abs(mat[1, 0]) < tol) + + +@nb.njit(cache=True, nogil=True, fastmath=True) +def _diagonal_phases_from_matrix(mat: np.ndarray) -> tuple[complex, complex]: + """Extract diagonal phases from a diagonal 2x2 matrix.""" + return mat[0, 0], mat[1, 1] + + +PARAMETERIZED_GATES = { + "rz": rz, + "ry": ry, + "rx": rx, + "u3": u3, +} + +_CONST_X = np.array([[0, 1], [1, 0]], dtype=np.complex128) +_CONST_Y = np.array([[0, -1j], [1j, 0]], dtype=np.complex128) +_CONST_Z = np.array([[1, 0], [0, -1]], dtype=np.complex128) +_CONST_H = np.array([[1, 1], [1, -1]], dtype=np.complex128) / np.sqrt(2) +_CONST_ID = np.eye(2, dtype=np.complex128) +_CONST_S = np.array([[1, 0], [0, 1j]], dtype=np.complex128) +_CONST_T = np.array([[1, 0], [0, _T_PHASE]], dtype=np.complex128) +_CONST_SDG = np.array([[1, 0], [0, -1j]], dtype=np.complex128) +_CONST_TDG = np.array([[1, 0], [0, _TDG_PHASE]], dtype=np.complex128) +_CONST_SWAP = np.array( + [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]], dtype=np.complex128 +) + +NON_PARAMETERIZED_GATES: dict[str, np.ndarray] = { + "x": _CONST_X, + "y": _CONST_Y, + "z": _CONST_Z, + "h": _CONST_H, + "id": _CONST_ID, + "s": _CONST_S, + "t": _CONST_T, + "sdg": _CONST_SDG, + "tdg": _CONST_TDG, + "swap": _CONST_SWAP, +} + +# Fast-path single-qubit gate properties. Diagonal gates are kept as phase +# pairs while fusing so they can flush directly to the diagonal Cython kernel +# instead of materializing a 2x2 matrix. +NON_PARAMETERIZED_GATE_PROPS: dict[str, tuple[bool, complex, complex, np.ndarray | None]] = { + "x": (False, _COMPLEX_ZERO, _COMPLEX_ZERO, _CONST_X), + "y": (False, _COMPLEX_ZERO, _COMPLEX_ZERO, _CONST_Y), + "z": (True, 1.0 + _COMPLEX_ZERO, -1.0 + _COMPLEX_ZERO, None), + "h": (False, _COMPLEX_ZERO, _COMPLEX_ZERO, _CONST_H), + "id": (True, 1.0 + _COMPLEX_ZERO, 1.0 + _COMPLEX_ZERO, None), + "s": (True, 1.0 + _COMPLEX_ZERO, 1j, None), + "t": (True, 1.0 + _COMPLEX_ZERO, _T_PHASE, None), + "sdg": (True, 1.0 + _COMPLEX_ZERO, -1j, None), + "tdg": (True, 1.0 + _COMPLEX_ZERO, _TDG_PHASE, None), +} + +CONTROLLED_GATE_SUB_UNITARIES: dict[str, np.ndarray] = { + "cx": _CONST_X, + "cy": _CONST_Y, + "cz": _CONST_Z, +} + +# Pre-flatten non-parameterized gates for Cython (contiguous 1D arrays) +GATE_CACHE: dict[str, np.ndarray] = {} +for _name, _mat in NON_PARAMETERIZED_GATES.items(): + GATE_CACHE[_name] = np.ascontiguousarray(_mat.ravel()) +for _name, _mat in CONTROLLED_GATE_SUB_UNITARIES.items(): + GATE_CACHE[_name] = np.ascontiguousarray(_mat.ravel()) + +# Diagonal gate phases: gate_name -> (phase0, phase1) for diag(phase0, phase1) +DIAGONAL_PHASES: dict[str, tuple[complex, complex]] = { + "z": (1.0, -1.0), + "s": (1.0, 1j), + "t": (1.0, _T_PHASE), + "sdg": (1.0, -1j), + "tdg": (1.0, _TDG_PHASE), + "id": (1.0, 1.0), +} + +# Controlled diagonal phases: gate_name -> phase (applied when control=1 AND target=1) +CONTROLLED_DIAGONAL_PHASES: dict[str, complex] = { + "cz": -1.0, +} + +# Integer opcodes for preprocessed instructions +_OP_SINGLE = 0 +_OP_CONTROLLED = 1 +_OP_DIAGONAL = 2 +_OP_CTRL_DIAGONAL = 3 +_OP_TWO_QUBIT = 4 + + +def _try_eval_expression(expr) -> float | None: + """Try to evaluate an AST expression to a float value.""" + if isinstance(expr, (IntegerLiteral, FloatLiteral)): + return float(expr.value) + if isinstance(expr, Identifier): + if expr.name == "pi": + return _PI + if expr.name == "tau": + return 2 * _PI + if expr.name == "euler": + return _E + return None + if isinstance(expr, BinaryExpression): + lhs = _try_eval_expression(expr.lhs) + rhs = _try_eval_expression(expr.rhs) + if lhs is None or rhs is None: + return None + if expr.op is BinaryOperator["+"]: + return lhs + rhs + if expr.op is BinaryOperator["-"]: + return lhs - rhs + if expr.op is BinaryOperator["*"]: + return lhs * rhs + if expr.op is BinaryOperator["/"]: + return lhs / rhs + if expr.op is BinaryOperator["**"]: + return lhs**rhs + return None + if isinstance(expr, UnaryExpression): + operand = _try_eval_expression(expr.expression) + if operand is None: + return None + if expr.op is UnaryOperator["-"]: + return -operand + return None + return None + + +def _extract_params(statement): + """Extract float parameters from a QuantumGate's arguments.""" + params = [] + for arg in statement.arguments: + val = _try_eval_expression(arg) + if val is not None: + params.append(val) + return params + + +def _preprocess(program, num_qubits): + """Walk the AST once, producing packed numpy arrays with inline gate fusion. + + This keeps the existing no-cache semantics: every call preprocesses its + input program. The speedup comes from avoiding Python list growth, + preserving diagonal gates as phase pairs while fusing, and using optional + numba-compiled helpers for small matrix/phase constructors. + """ + statements = ( + program._unrolled_ast.statements + if len(program._unrolled_ast.statements) > 0 + else program.original_program.statements + ) + + # Conservative bound: pending one-qubit flushes plus SWAP expansion can + # emit more instructions than source statements. The bound is deliberately + # simple and over-allocates a little to avoid dynamic list growth. + max_instructions = max(1, len(statements) * 4 + num_qubits) + opcodes = np.empty(max_instructions, dtype=np.int32) + targets = np.empty(max_instructions, dtype=np.int32) + controls = np.empty(max_instructions, dtype=np.int32) + gate_params = np.empty(max_instructions * 4, dtype=np.complex128) + diag_phases = np.empty(max_instructions * 2, dtype=np.complex128) + two_qubit_offsets = np.empty(max_instructions, dtype=np.int32) + two_qubit_gates = np.empty(max_instructions * 16, dtype=np.complex128) + + zero4 = np.zeros(4, dtype=np.complex128) + zero2 = np.zeros(2, dtype=np.complex128) + instruction_idx = 0 + tq_offset = 0 + + # target -> (is_diagonal, phase0, phase1, matrix) + pending: dict[int, tuple[bool, complex, complex, np.ndarray | None]] = {} + + def _write_single(target: int, mat: np.ndarray): + nonlocal instruction_idx + opcodes[instruction_idx] = _OP_SINGLE + targets[instruction_idx] = target + controls[instruction_idx] = -1 + gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = mat.ravel() + diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = zero2 + two_qubit_offsets[instruction_idx] = -1 + instruction_idx += 1 + + def _write_diagonal(target: int, phase0: complex, phase1: complex): + nonlocal instruction_idx + opcodes[instruction_idx] = _OP_DIAGONAL + targets[instruction_idx] = target + controls[instruction_idx] = -1 + gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = zero4 + diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = (phase0, phase1) + two_qubit_offsets[instruction_idx] = -1 + instruction_idx += 1 + + def _write_controlled(control: int, target: int, flat: np.ndarray): + nonlocal instruction_idx + opcodes[instruction_idx] = _OP_CONTROLLED + targets[instruction_idx] = target + controls[instruction_idx] = control + gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = flat + diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = zero2 + two_qubit_offsets[instruction_idx] = -1 + instruction_idx += 1 + + def _write_controlled_diagonal(control: int, target: int, phase: complex): + nonlocal instruction_idx + opcodes[instruction_idx] = _OP_CTRL_DIAGONAL + targets[instruction_idx] = target + controls[instruction_idx] = control + gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = zero4 + diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = (phase, 0j) + two_qubit_offsets[instruction_idx] = -1 + instruction_idx += 1 + + def _write_two_qubit(control: int, target: int, flat: np.ndarray): + nonlocal instruction_idx, tq_offset + opcodes[instruction_idx] = _OP_TWO_QUBIT + targets[instruction_idx] = target + controls[instruction_idx] = control + gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = zero4 + diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = zero2 + two_qubit_offsets[instruction_idx] = tq_offset + two_qubit_gates[tq_offset : tq_offset + 16] = flat + tq_offset += 16 + instruction_idx += 1 + + def _flush_pending(target: int): + if target not in pending: + return + is_diagonal, phase0, phase1, mat = pending.pop(target) + + if is_diagonal and (np.abs(phase0 - 1.0) < 1e-10) and (np.abs(phase1 - 1.0) < 1e-10): + return + if not is_diagonal and mat is not None and _is_diagonal_matrix(mat): + is_diagonal = True + phase0, phase1 = _diagonal_phases_from_matrix(mat) + mat = None + + if is_diagonal: + _write_diagonal(target, phase0, phase1) + else: + if mat is None: + mat = _diag_to_matrix(phase0, phase1) + _write_single(target, mat) + + def _flush_all(): + for target in list(pending.keys()): + _flush_pending(target) + + def _accumulate( + target: int, + new_is_diagonal: bool, + new_phase0: complex, + new_phase1: complex, + new_mat: np.ndarray | None, + ): + if target not in pending: + pending[target] = (new_is_diagonal, new_phase0, new_phase1, new_mat) + return + + current_is_diagonal, current_phase0, current_phase1, current_mat = pending[target] + if current_is_diagonal and new_is_diagonal: + pending[target] = ( + True, + new_phase0 * current_phase0, + new_phase1 * current_phase1, + None, + ) + return + + current_matrix = ( + _diag_to_matrix(current_phase0, current_phase1) if current_is_diagonal else current_mat + ) + new_matrix = _diag_to_matrix(new_phase0, new_phase1) if new_is_diagonal else new_mat + if current_matrix is None or new_matrix is None: + raise ValueError("Invalid pending gate fusion state.") + fused = new_matrix @ current_matrix + if _is_diagonal_matrix(fused): + phase0, phase1 = _diagonal_phases_from_matrix(fused) + pending[target] = (True, phase0, phase1, None) + else: + pending[target] = (False, _COMPLEX_ZERO, _COMPLEX_ZERO, fused) + + for statement in statements: + if not isinstance(statement, QuantumGate): + continue + + gate_name = statement.name.name + params = _extract_params(statement) + + if len(statement.qubits) == 1: + target = statement.qubits[0].indices[0][0].value + + if gate_name in NON_PARAMETERIZED_GATE_PROPS: + is_diagonal, phase0, phase1, mat = NON_PARAMETERIZED_GATE_PROPS[gate_name] + _accumulate(target, is_diagonal, phase0, phase1, mat) + elif gate_name == "rz" and len(params) == 1: + phase0, phase1 = _rz_phases(params[0]) + _accumulate(target, True, phase0, phase1, None) + elif gate_name in PARAMETERIZED_GATES: + gate_fn = PARAMETERIZED_GATES[gate_name] + required_params = 3 if gate_name == "u3" else 1 + if len(params) != required_params: + raise ValueError(f"Gate {gate_name} requires {required_params} parameter(s).") + mat = gate_fn(*params) + _accumulate(target, False, _COMPLEX_ZERO, _COMPLEX_ZERO, mat) + else: + raise ValueError( + f"Gate '{gate_name}' not supported by simulator. " + f"Call module.unroll() first." + ) + + elif len(statement.qubits) == 2: + target = statement.qubits[1].indices[0][0].value + control = statement.qubits[0].indices[0][0].value + + # Flush pending gates only when they do not commute with the + # incoming two-qubit gate. Diagonal one-qubit gates commute with + # diagonal two-qubit gates (CZ/CRZ), so they can remain fused until + # a later non-diagonal interaction or the final flush. + for qubit in (control, target): + if qubit not in pending: + continue + is_diagonal, _, _, _ = pending[qubit] + if (not is_diagonal) or gate_name not in {"cz", "crz"}: + _flush_pending(qubit) + + if gate_name == "swap": + # Decompose SWAP into three controlled-X instructions. This + # stays on the optimized controlled-gate kernel and avoids the + # generic 4x4 two-qubit path for this common gate. + flat = GATE_CACHE["cx"] + _write_controlled(control, target, flat) + _write_controlled(target, control, flat) + _write_controlled(control, target, flat) + elif gate_name in CONTROLLED_DIAGONAL_PHASES: + phase = CONTROLLED_DIAGONAL_PHASES[gate_name] + _write_controlled_diagonal(control, target, phase) + elif gate_name == "crz" and len(params) == 1: + theta = params[0] + _, phase_at_11 = _rz_phases(theta) + _write_controlled_diagonal(control, target, phase_at_11) + elif gate_name in CONTROLLED_GATE_SUB_UNITARIES: + flat = GATE_CACHE[gate_name] + _write_controlled(control, target, flat) + else: + raise ValueError( + f"Gate '{gate_name}' not supported by simulator. " + f"Call module.unroll() first." + ) + + # Flush remaining pending single-qubit gates + _flush_all() + + n = instruction_idx + if n == 0: + return n, None, None, None, None, None, None, None + + if tq_offset > 0: + two_qubit_gates_trimmed = np.ascontiguousarray(two_qubit_gates[:tq_offset]) + else: + two_qubit_gates_trimmed = np.zeros(1, dtype=np.complex128) + + return ( + n, + opcodes[:n], + targets[:n], + controls[:n], + np.ascontiguousarray(gate_params[: n * 4]), + np.ascontiguousarray(diag_phases[: n * 2]), + two_qubit_offsets[:n], + two_qubit_gates_trimmed, + ) + + +@dataclass(frozen=True) +class SimulatorResult: + """Class to store the result of a statevector simulation.""" + + probabilities: np.ndarray + measurement_counts: Counter[str] + final_statevector: np.ndarray + + +class Simulator: + """ + Statevector simulator + + """ + + def __init__(self, seed: None | int = None): + """ + Initialize the statevector simulator. + + Parameters: + seed (None | int): A seed to initialize the `np.random.BitGenerator`. + If None, then fresh, unpredictable entropy will be pulled from + the OS using `np.random.SeedSequence`. + """ + self._rng = np.random.default_rng(seed) + + def run(self, program: QasmModule | str, shots: int = 1) -> SimulatorResult: + """Run the statevector simulator. + + For QasmModule input, the module is used as-is. Call module.unroll() + and module.remove_idle_qubits() beforehand if needed. + + For string input, loads/unrolls/removes idle qubits automatically. + + Args: + program (QasmModule | str): The program to simulate. + shots (int): The number of shots to simulate. Defaults to 1. + + Returns: + SimulatorResult: The result of the simulation. + + Raises: + ValueError: If shots is less than 0, or if the program contains + an unsupported gate or invalid syntax. + """ + if isinstance(program, str): + program = loads(program) + program.unroll() + program.remove_idle_qubits() + + num_qubits = program.num_qubits + sv = np.zeros(2**num_qubits, dtype=np.complex128) + sv[0] = 1.0 + + n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = _preprocess( + program, num_qubits + ) + + if n > 0: + apply_circuit( + sv, + num_qubits, + opcodes, + targets, + controls, + gate_params, + diag_phases, + tq_offsets, + tq_gates, + n, + ) + + if shots < 0: + raise ValueError("Shots must be greater than or equal to 0.") + + probabilities = np.abs(sv) ** 2 + samples = self._rng.choice(len(probabilities), size=shots, p=probabilities) + counts = Counter(format(s, f"0{num_qubits}b")[::-1] for s in samples) + + return SimulatorResult(probabilities, counts, sv) diff --git a/tests/test_perf_regression.py b/tests/test_perf_regression.py new file mode 100644 index 00000000..ab56caf7 --- /dev/null +++ b/tests/test_perf_regression.py @@ -0,0 +1,186 @@ +# Copyright 2025 qBraid +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=redefined-outer-name + +""" +Performance regression tests for the PyQASM statevector simulator. + +These tests enforce time budgets to catch performance regressions. Each test +runs the simulator multiple times and checks that the median execution time +stays within a generous upper bound (3x the baseline measured on Apple Silicon). + +Run with: pytest tests/test_perf_regression.py -v +Skip in CI: pytest -m "not benchmark" + +Time budgets are intentionally loose (3x baseline) to avoid flakiness on +different hardware. The goal is catching order-of-magnitude regressions, +not enforcing tight timings. + +Baselines (Apple Silicon M-series, March 2026): + Random 16q/200d: ~24 ms + Random 20q/300d: ~330 ms + Random 22q/400d: ~1625 ms + QFT 10q: ~27 ms + QFT 14q: ~58 ms +""" + +import random +import time + +import numpy as np +import pytest + +from pyqasm import loads as pyqasm_loads +from pyqasm.simulator.statevector import Simulator + +SINGLE_QUBIT_GATES = ["h", "x", "y", "z", "s", "t", "rx(1.0)", "ry(0.5)", "rz(0.3)"] +TWO_QUBIT_GATES = ["cx", "cy", "cz", "swap"] + +N_REPEATS = 5 +BUDGET_MULTIPLIER = 3 # allow 3x baseline for portability across hardware + + +def generate_random_qasm(num_qubits: int, depth: int, seed: int = 42) -> str: + rng = random.Random(seed) + lines = [ + "OPENQASM 3;", + 'include "stdgates.inc";', + f"qubit[{num_qubits}] q;", + ] + for _ in range(depth): + if num_qubits >= 2 and rng.random() < 0.4: + gate = rng.choice(TWO_QUBIT_GATES) + q0, q1 = rng.sample(range(num_qubits), 2) + lines.append(f"{gate} q[{q0}], q[{q1}];") + else: + gate = rng.choice(SINGLE_QUBIT_GATES) + q = rng.randint(0, num_qubits - 1) + lines.append(f"{gate} q[{q}];") + return "\n".join(lines) + + +def generate_qft_qasm(num_qubits: int) -> str: + lines = [ + "OPENQASM 3;", + 'include "stdgates.inc";', + f"qubit[{num_qubits}] q;", + ] + for i in range(num_qubits): + lines.append(f"h q[{i}];") + for j in range(i + 1, num_qubits): + k = j - i + angle = f"pi/{2**k}" + lines.append(f"crz({angle}) q[{j}], q[{i}];") + for i in range(num_qubits // 2): + lines.append(f"swap q[{i}], q[{num_qubits - 1 - i}];") + return "\n".join(lines) + + +def median_sim_time(qasm: str, sim: Simulator, n_repeats: int = N_REPEATS) -> float: + """Return the median wall-clock time (seconds) of sim.run() over n_repeats.""" + module = pyqasm_loads(qasm) + module.unroll() + module.remove_idle_qubits() + times = [] + for _ in range(n_repeats): + start = time.perf_counter() + sim.run(module, shots=0) + times.append(time.perf_counter() - start) + return float(np.median(times)) + + +@pytest.fixture +def sim(): + return Simulator(seed=0) + + +# --------------------------------------------------------------------------- +# Random circuit performance tests +# --------------------------------------------------------------------------- + + +@pytest.mark.benchmark +@pytest.mark.parametrize( + "num_qubits, depth, baseline_ms", + [ + (16, 200, 24), + (20, 300, 330), + ], + ids=["random-16q", "random-20q"], +) +def test_random_circuit_perf(sim, num_qubits, depth, baseline_ms): + """Median simulation time must stay within budget for random circuits.""" + qasm = generate_random_qasm(num_qubits, depth) + elapsed = median_sim_time(qasm, sim) + budget_ms = baseline_ms * BUDGET_MULTIPLIER + elapsed_ms = elapsed * 1000 + assert elapsed_ms < budget_ms, ( + f"Random {num_qubits}q/{depth}d: {elapsed_ms:.1f} ms exceeded " + f"budget {budget_ms:.0f} ms (baseline {baseline_ms} ms x{BUDGET_MULTIPLIER})" + ) + + +# --------------------------------------------------------------------------- +# QFT circuit performance tests +# --------------------------------------------------------------------------- + + +@pytest.mark.benchmark +@pytest.mark.parametrize( + "num_qubits, baseline_ms", + [ + (10, 27), + (14, 58), + ], + ids=["qft-10q", "qft-14q"], +) +def test_qft_circuit_perf(sim, num_qubits, baseline_ms): + """Median simulation time must stay within budget for QFT circuits.""" + qasm = generate_qft_qasm(num_qubits) + elapsed = median_sim_time(qasm, sim) + budget_ms = baseline_ms * BUDGET_MULTIPLIER + elapsed_ms = elapsed * 1000 + assert elapsed_ms < budget_ms, ( + f"QFT {num_qubits}q: {elapsed_ms:.1f} ms exceeded " + f"budget {budget_ms:.0f} ms (baseline {baseline_ms} ms x{BUDGET_MULTIPLIER})" + ) + + +# --------------------------------------------------------------------------- +# Scaling sanity check: ensure O(2^n) per gate, not worse +# --------------------------------------------------------------------------- + + +@pytest.mark.benchmark +def test_scaling_not_superexponential(sim): + """Verify that doubling qubits roughly doubles per-gate time (not worse). + + Compares 14q vs 16q random circuits with the same gate count. The ratio + of per-gate times should be near 4x (since state size quadruples with +2 + qubits). We allow up to 8x to account for cache effects and noise. + """ + depth = 150 + qasm_14 = generate_random_qasm(14, depth) + qasm_16 = generate_random_qasm(16, depth) + + t14 = median_sim_time(qasm_14, sim) + t16 = median_sim_time(qasm_16, sim) + + ratio = t16 / t14 + # +2 qubits => 4x state size => expect ~4x time. Allow up to 8x. + assert ratio < 8.0, ( + f"Scaling ratio 16q/14q = {ratio:.1f}x, expected <8x " + f"(14q={t14*1000:.1f}ms, 16q={t16*1000:.1f}ms)" + ) diff --git a/tests/test_sv_sim.py b/tests/test_sv_sim.py new file mode 100644 index 00000000..d9b92263 --- /dev/null +++ b/tests/test_sv_sim.py @@ -0,0 +1,350 @@ +# Copyright 2025 qBraid +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=redefined-outer-name + +""" +Module containing tests for the PyQASM statevector simulator. + +""" + +import numpy as np +import pytest +from qiskit import transpile +from qiskit.qasm3 import loads +from qiskit_aer import AerSimulator + +from pyqasm.simulator.statevector import Simulator + + +@pytest.fixture +def aer_simulator(): + """Fixture to create and return a Qiskit AerSimulator.""" + return AerSimulator(method="statevector") + + +@pytest.fixture +def pyqasm_simulator(): + """Fixture to create and return a pyqasm simulator.""" + return Simulator(seed=22) + + +@pytest.mark.parametrize( + "qasm, description", + [ + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + h q[0]; + """, + "Hadamard gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + rx(pi) q[0]; + """, + "RX(pi) gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + ry(pi) q[0]; + """, + "RY(pi) gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + rz(pi) q[0]; + """, + "RZ(pi) gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + s q[0]; + """, + "S gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + t q[0]; + """, + "T gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + sdg q[0]; + """, + "Sdg gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + tdg q[0]; + """, + "Tdg gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + id q[0]; + """, + "Identity gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + u3(1.0, 0.5, 0.3) q[0]; + """, + "U3 gate (global phase)", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[1] q; + h q[0]; + rx(pi) q[0]; + ry(pi) q[0]; + rz(pi) q[0]; + s q[0]; + t q[0]; + sdg q[0]; + tdg q[0]; + id q[0]; + """, + "Combined gates: H, RX(pi), RY(pi), RZ(pi), S, T, Sdg, Tdg, ID", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + cx q[0], q[1]; + """, + "CX gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + x q[0]; + x q[1]; + cy q[0], q[1]; + """, + "CY on |11>", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + x q[0]; + x q[1]; + cz q[0], q[1]; + """, + "CZ on |11>", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + x q[1]; + swap q[0], q[1]; + """, + "SWAP on |01>", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[3] q; + x q[0]; + id q[1]; + id q[2]; + """, + "X on qubit 0 of 3-qubit system", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + cy q[0], q[1]; + """, + "CY gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + cz q[0], q[1]; + """, + "CZ gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + swap q[0], q[1]; + """, + "Swap gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + h q[0]; + crz(pi/4) q[0], q[1]; + """, + "CRZ gate", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + h q[0]; + h q[1]; + """, + "Two H Gates", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + h q[0]; + cx q[0], q[1]; + """, + "Bell state", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[3] q; + h q[0]; + cx q[0], q[1]; + cx q[1], q[2]; + """, + "GHZ state", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + h q[0]; + swap q[0], q[1]; + cx q[0], q[1]; + """, + "Bell state with swap", + ), + ( + """ + OPENQASM 3; + include "stdgates.inc"; + qubit[2] q; + h q[0]; + swap q[1], q[0]; + cx q[0], q[1]; + """, + "Bell state with swap rev qubits", + ), + ], +) +def test_simulator_sv_results(qasm, description, aer_simulator, pyqasm_simulator): + """Test PyQASM simulator by comparing SV results against the Qiskit Aer simulator.""" + circuit = loads(qasm) + circuit.save_statevector() + compiled_circuit = transpile(circuit, aer_simulator, optimization_level=0) + result = aer_simulator.run(compiled_circuit).result() + sv_qiskit = result.get_statevector(compiled_circuit) + sv_expected = np.asarray(sv_qiskit) + + result = pyqasm_simulator.run(qasm, shots=1000) + + sv_actual = result.final_statevector + if "global phase" in description: + # Compare up to global phase: find first nonzero element and normalize + idx = np.argmax(np.abs(sv_expected) > 1e-10) + phase = sv_actual[idx] / sv_expected[idx] + sv_actual = sv_actual / phase + + assert np.allclose(sv_actual, sv_expected), ( + f"Test failed for: {description}\n" + f"PyQASM simulator statevector: {result.final_statevector}\n" + f"Expected statevector: {sv_expected}" + ) + + +def test_simulator_large_circuit(aer_simulator, pyqasm_simulator): + """Test PyQASM simulator on an 8-qubit circuit.""" + qasm = """ + OPENQASM 3; + include "stdgates.inc"; + qubit[8] q; + h q[0]; + cx q[0], q[1]; + cx q[1], q[2]; + cx q[2], q[3]; + h q[4]; + cx q[4], q[5]; + cx q[5], q[6]; + cx q[6], q[7]; + swap q[3], q[4]; + """ + circuit = loads(qasm) + circuit.save_statevector() + compiled_circuit = transpile(circuit, aer_simulator, optimization_level=0) + result_qiskit = aer_simulator.run(compiled_circuit).result() + sv_expected = np.asarray(result_qiskit.get_statevector(compiled_circuit)) + + result = pyqasm_simulator.run(qasm, shots=0) + + assert np.allclose(result.final_statevector, sv_expected), ( + f"Test failed for 8-qubit circuit\n" + f"PyQASM: {result.final_statevector}\n" + f"Expected: {sv_expected}" + )