diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 69db59fb..b5bfe0f8 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -107,6 +107,10 @@ jobs:
           # Thus, the matrix.python is not required here, only for setting up builds
           python-version: "3.11"
 
+      - name: Install OpenMP runtime on macOS
+        if: startsWith(matrix.os, 'macos')
+        run: brew install libomp
+
       - name: Build and Test Wheels
         env: 
           # since cibuildwheel builds for all python versions >= current version, we need to 
@@ -190,4 +194,4 @@ jobs:
           token: ${{ secrets.CODECOV_TOKEN }}
           fail_ci_if_error: false
           files: ./build/coverage/coverage.xml
-          verbose: true
\ No newline at end of file
+          verbose: true
diff --git a/.github/workflows/pre-release.yml b/.github/workflows/pre-release.yml
index cdca0227..9581e3c5 100644
--- a/.github/workflows/pre-release.yml
+++ b/.github/workflows/pre-release.yml
@@ -96,6 +96,10 @@ jobs:
         with:
           python-version: "3.11"
 
+      - name: Install OpenMP runtime on macOS
+        if: startsWith(matrix.os, 'macos')
+        run: brew install libomp
+
       - name: Build and Test Wheels
         env: 
           PRE_RELEASE_BUILD: "true"
@@ -176,4 +180,4 @@ jobs:
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           user: __token__
-          password: ${{ secrets.PYPI_API_TOKEN }}
\ No newline at end of file
+          password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 142c34d8..166cee61 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -98,6 +98,10 @@ jobs:
         with:
           python-version: "3.11"
 
+      - name: Install OpenMP runtime on macOS
+        if: startsWith(matrix.os, 'macos')
+        run: brew install libomp
+
       - name: Build and Test Wheels
         env: 
           CIBW_BUILD: "cp${{ matrix.python }}-${{ matrix.platform_id }}"
@@ -167,4 +171,4 @@ jobs:
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           user: __token__
-          password: ${{ secrets.PYPI_API_TOKEN }}
\ No newline at end of file
+          password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/test-release.yml b/.github/workflows/test-release.yml
index bd2b9475..b7f79107 100644
--- a/.github/workflows/test-release.yml
+++ b/.github/workflows/test-release.yml
@@ -96,6 +96,10 @@ jobs:
         with:
           python-version: "3.11"
 
+      - name: Install OpenMP runtime on macOS
+        if: startsWith(matrix.os, 'macos')
+        run: brew install libomp
+
       - name: Build and Test Wheels
         env: 
           CIBW_BUILD: "cp${{ matrix.python }}-${{ matrix.platform_id }}"
@@ -166,4 +170,4 @@ jobs:
         with:
           repository-url: https://test.pypi.org/legacy/
           user: __token__
-          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
\ No newline at end of file
+          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 09e6a753..c9961e83 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ __pycache__/
 
 # Cython build files
 src/pyqasm/accelerate/linalg.c
+src/pyqasm/accelerate/sv_sim.c
 
 # Distribution / packaging
 .Python
diff --git a/benchmarks/bench_evolved.json b/benchmarks/bench_evolved.json
new file mode 100644
index 00000000..941d4b03
--- /dev/null
+++ b/benchmarks/bench_evolved.json
@@ -0,0 +1,183 @@
+{
+  "benchmark": "statevector preprocessing no-cache comparison",
+  "baseline": "PyQASM sv branch before evolved preprocessing commit",
+  "candidate": "PyQASM sv branch with evolved no-cache preprocessing",
+  "n_repeats": 5,
+  "pyqasm_num_threads": 1,
+  "notes": [
+    "No QASM string memoization is included in the candidate.",
+    "Both series use pre-unrolled QasmModule inputs, matching the branch benchmark convention."
+  ],
+  "series": {
+    "random": [
+      {
+        "num_qubits": 4,
+        "depth": 20,
+        "pyqasm_sv_ms": 0.06092991679906845,
+        "evolved_no_cache_ms": 0.06546999793499708,
+        "speedup": 0.9306540204807044,
+        "correct": true
+      },
+      {
+        "num_qubits": 6,
+        "depth": 40,
+        "pyqasm_sv_ms": 0.1580890966579318,
+        "evolved_no_cache_ms": 0.1151899341493845,
+        "speedup": 1.372421104546456,
+        "correct": true
+      },
+      {
+        "num_qubits": 8,
+        "depth": 60,
+        "pyqasm_sv_ms": 0.16204908024519682,
+        "evolved_no_cache_ms": 0.14194007962942123,
+        "speedup": 1.1416724625509327,
+        "correct": true
+      },
+      {
+        "num_qubits": 10,
+        "depth": 80,
+        "pyqasm_sv_ms": 0.24963996838778257,
+        "evolved_no_cache_ms": 0.21547905635088682,
+        "speedup": 1.1585347208002805,
+        "correct": true
+      },
+      {
+        "num_qubits": 12,
+        "depth": 100,
+        "pyqasm_sv_ms": 0.458328053355217,
+        "evolved_no_cache_ms": 0.41990901809185743,
+        "speedup": 1.0914937131808757,
+        "correct": true
+      },
+      {
+        "num_qubits": 14,
+        "depth": 150,
+        "pyqasm_sv_ms": 1.5235739992931485,
+        "evolved_no_cache_ms": 1.3638049131259322,
+        "speedup": 1.117149516495739,
+        "correct": true
+      },
+      {
+        "num_qubits": 16,
+        "depth": 200,
+        "pyqasm_sv_ms": 7.1852849796414375,
+        "evolved_no_cache_ms": 5.937180016189814,
+        "speedup": 1.210218480835721,
+        "correct": true
+      },
+      {
+        "num_qubits": 18,
+        "depth": 250,
+        "pyqasm_sv_ms": 35.15114798210561,
+        "evolved_no_cache_ms": 28.548280941322446,
+        "speedup": 1.231287728124666,
+        "correct": true
+      },
+      {
+        "num_qubits": 20,
+        "depth": 300,
+        "pyqasm_sv_ms": 162.53185598179698,
+        "evolved_no_cache_ms": 126.72401999589056,
+        "speedup": 1.282565499319447,
+        "correct": true
+      },
+      {
+        "num_qubits": 22,
+        "depth": 400,
+        "pyqasm_sv_ms": 1289.0826460206881,
+        "evolved_no_cache_ms": 1231.460615992546,
+        "speedup": 1.0467916141854843,
+        "correct": true
+      }
+    ],
+    "qft": [
+      {
+        "num_qubits": 4,
+        "depth": "QFT",
+        "pyqasm_sv_ms": 0.20883907563984394,
+        "evolved_no_cache_ms": 0.16430008690804243,
+        "speedup": 1.2710831720784765,
+        "correct": true
+      },
+      {
+        "num_qubits": 6,
+        "depth": "QFT",
+        "pyqasm_sv_ms": 0.4983479157090187,
+        "evolved_no_cache_ms": 0.315018929541111,
+        "speedup": 1.5819618091997316,
+        "correct": true
+      },
+      {
+        "num_qubits": 8,
+        "depth": "QFT",
+        "pyqasm_sv_ms": 0.9089269442483783,
+        "evolved_no_cache_ms": 0.5902479169890285,
+        "speedup": 1.5399070764789728,
+        "correct": true
+      },
+      {
+        "num_qubits": 10,
+        "depth": "QFT",
+        "pyqasm_sv_ms": 1.5004950109869242,
+        "evolved_no_cache_ms": 1.0477059986442327,
+        "speedup": 1.4321718238977499,
+        "correct": true
+      },
+      {
+        "num_qubits": 12,
+        "depth": "QFT",
+        "pyqasm_sv_ms": 2.5123610394075513,
+        "evolved_no_cache_ms": 1.782613922841847,
+        "speedup": 1.409369133279482,
+        "correct": true
+      },
+      {
+        "num_qubits": 14,
+        "depth": "QFT",
+        "pyqasm_sv_ms": 5.448751035146415,
+        "evolved_no_cache_ms": 4.14173596072942,
+        "speedup": 1.3155718005226993,
+        "correct": true
+      },
+      {
+        "num_qubits": 16,
+        "depth": "QFT",
+        "pyqasm_sv_ms": 18.029507948085666,
+        "evolved_no_cache_ms": 15.524596092291176,
+        "speedup": 1.1613511772482323,
+        "correct": true
+      },
+      {
+        "num_qubits": 18,
+        "depth": "QFT",
+        "pyqasm_sv_ms": 77.3717820411548,
+        "evolved_no_cache_ms": 68.92851099837571,
+        "speedup": 1.1224931587885028,
+        "correct": true
+      },
+      {
+        "num_qubits": 20,
+        "depth": "QFT",
+        "pyqasm_sv_ms": 382.6392919290811,
+        "evolved_no_cache_ms": 349.38782698009163,
+        "speedup": 1.0951706452866319,
+        "correct": true
+      }
+    ]
+  },
+  "summary": {
+    "random": {
+      "geomean_speedup": 1.1521318658620598,
+      "min_speedup": 0.9306540204807044,
+      "max_speedup": 1.372421104546456,
+      "all_correct": true
+    },
+    "qft": {
+      "geomean_speedup": 1.3147632402156637,
+      "min_speedup": 1.0951706452866319,
+      "max_speedup": 1.5819618091997316,
+      "all_correct": true
+    }
+  }
+}
diff --git a/benchmarks/bench_qft.png b/benchmarks/bench_qft.png
new file mode 100644
index 00000000..78d2eb64
Binary files /dev/null and b/benchmarks/bench_qft.png differ
diff --git a/benchmarks/bench_qft_evolved.png b/benchmarks/bench_qft_evolved.png
new file mode 100644
index 00000000..2dab0d29
Binary files /dev/null and b/benchmarks/bench_qft_evolved.png differ
diff --git a/benchmarks/bench_random.png b/benchmarks/bench_random.png
new file mode 100644
index 00000000..a783fa11
Binary files /dev/null and b/benchmarks/bench_random.png differ
diff --git a/benchmarks/bench_random_evolved.png b/benchmarks/bench_random_evolved.png
new file mode 100644
index 00000000..27228cc4
Binary files /dev/null and b/benchmarks/bench_random_evolved.png differ
diff --git a/benchmarks/bench_simulator.py b/benchmarks/bench_simulator.py
new file mode 100644
index 00000000..6ab4b3c3
--- /dev/null
+++ b/benchmarks/bench_simulator.py
@@ -0,0 +1,406 @@
+#!/usr/bin/env python3
+# Copyright 2025 qBraid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Benchmark: PyQASM statevector simulator vs Qiskit Aer, Cirq,
+and PennyLane Lightning.
+
+Each simulator builds circuits programmatically from a shared gate list so the
+comparison is apples-to-apples. Timing covers only simulation (circuit
+construction is excluded).
+
+Usage:
+    python benchmarks/bench_simulator.py
+"""
+
+import math
+import random
+import time
+
+import numpy as np
+from tabulate import tabulate
+
+# ---------------------------------------------------------------------------
+# Shared circuit representation
+# ---------------------------------------------------------------------------
+
+# Gate spec: (name, qubits_tuple, params_tuple)
+#   name: "h", "x", "y", "z", "s", "t", "rx", "ry", "rz", "cx", "cy", "cz",
+#         "swap", "crz"
+# qubits_tuple: (target,) for 1q gates, (control, target) for 2q gates
+# params_tuple: () or (angle,)
+
+SINGLE_GATE_NAMES = ["h", "x", "y", "z", "s", "t", "rx", "ry", "rz"]
+TWO_GATE_NAMES = ["cx", "cy", "cz", "swap"]
+
+
+def generate_random_gates(num_qubits: int, depth: int, seed: int = 42):
+    """Return (num_qubits, gate_list) for a random circuit."""
+    rng = random.Random(seed)
+    gates = []
+    for _ in range(depth):
+        if num_qubits >= 2 and rng.random() < 0.4:
+            name = rng.choice(TWO_GATE_NAMES)
+            q0, q1 = rng.sample(range(num_qubits), 2)
+            gates.append((name, (q0, q1), ()))
+        else:
+            name = rng.choice(SINGLE_GATE_NAMES)
+            q = rng.randint(0, num_qubits - 1)
+            if name == "rx":
+                gates.append((name, (q,), (1.0,)))
+            elif name == "ry":
+                gates.append((name, (q,), (0.5,)))
+            elif name == "rz":
+                gates.append((name, (q,), (0.3,)))
+            else:
+                gates.append((name, (q,), ()))
+    return num_qubits, gates
+
+
+def generate_qft_gates(num_qubits: int):
+    """Return (num_qubits, gate_list) for a QFT circuit."""
+    gates = []
+    for i in range(num_qubits):
+        gates.append(("h", (i,), ()))
+        for j in range(i + 1, num_qubits):
+            k = j - i
+            angle = math.pi / (2**k)
+            gates.append(("crz", (j, i), (angle,)))
+    for i in range(num_qubits // 2):
+        gates.append(("swap", (i, num_qubits - 1 - i), ()))
+    return num_qubits, gates
+
+
+# ---------------------------------------------------------------------------
+# Gate-list → QASM 3 string (for PyQASM / Qiskit)
+# ---------------------------------------------------------------------------
+
+def gates_to_qasm(num_qubits, gates):
+    lines = [
+        "OPENQASM 3;",
+        'include "stdgates.inc";',
+        f"qubit[{num_qubits}] q;",
+    ]
+    for name, qubits, params in gates:
+        if len(qubits) == 1:
+            q = qubits[0]
+            if params:
+                lines.append(f"{name}({params[0]}) q[{q}];")
+            else:
+                lines.append(f"{name} q[{q}];")
+        else:
+            q0, q1 = qubits
+            if params:
+                lines.append(f"{name}({params[0]}) q[{q0}], q[{q1}];")
+            else:
+                lines.append(f"{name} q[{q0}], q[{q1}];")
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Simulator wrappers
+# ---------------------------------------------------------------------------
+
+N_REPEATS = 5
+
+
+def _median_time(fn, n_repeats=N_REPEATS):
+    """Run fn() n_repeats times, return (median_seconds, last_result)."""
+    times = []
+    result = None
+    for _ in range(n_repeats):
+        start = time.perf_counter()
+        result = fn()
+        times.append(time.perf_counter() - start)
+    return float(np.median(times)), result
+
+
+# -- PyQASM ------------------------------------------------------------------
+
+def prepare_pyqasm(num_qubits, gates):
+    from pyqasm import loads as pyqasm_loads
+    from pyqasm.simulator.statevector import Simulator
+
+    qasm = gates_to_qasm(num_qubits, gates)
+    module = pyqasm_loads(qasm)
+    module.unroll()
+    module.remove_idle_qubits()
+    sim = Simulator(seed=0)
+    return module, sim
+
+
+def bench_pyqasm(module, sim):
+    def run():
+        return sim.run(module, shots=0).final_statevector
+    return _median_time(run)
+
+
+# -- Qiskit Aer ---------------------------------------------------------------
+
+def prepare_qiskit(num_qubits, gates):
+    from qiskit import transpile
+    from qiskit.qasm3 import loads as qiskit_loads
+    from qiskit_aer import AerSimulator
+
+    qasm = gates_to_qasm(num_qubits, gates)
+    backend = AerSimulator(method="statevector")
+    circuit = qiskit_loads(qasm)
+    circuit.save_statevector()
+    compiled = transpile(circuit, backend, optimization_level=0)
+    return compiled, backend
+
+
+def bench_qiskit(compiled, backend):
+    def run():
+        job = backend.run(compiled)
+        result = job.result()
+        return np.asarray(result.get_statevector(compiled))
+    return _median_time(run)
+
+
+# -- Cirq ----------------------------------------------------------------------
+
+def _build_cirq_circuit(num_qubits, gates):
+    import cirq
+
+    qubits = cirq.LineQubit.range(num_qubits)
+    ops = []
+    for name, qubit_indices, params in gates:
+        if name == "h":
+            ops.append(cirq.H(qubits[qubit_indices[0]]))
+        elif name == "x":
+            ops.append(cirq.X(qubits[qubit_indices[0]]))
+        elif name == "y":
+            ops.append(cirq.Y(qubits[qubit_indices[0]]))
+        elif name == "z":
+            ops.append(cirq.Z(qubits[qubit_indices[0]]))
+        elif name == "s":
+            ops.append(cirq.S(qubits[qubit_indices[0]]))
+        elif name == "t":
+            ops.append(cirq.T(qubits[qubit_indices[0]]))
+        elif name == "rx":
+            ops.append(cirq.rx(params[0])(qubits[qubit_indices[0]]))
+        elif name == "ry":
+            ops.append(cirq.ry(params[0])(qubits[qubit_indices[0]]))
+        elif name == "rz":
+            ops.append(cirq.rz(params[0])(qubits[qubit_indices[0]]))
+        elif name == "cx":
+            ops.append(cirq.CNOT(qubits[qubit_indices[0]], qubits[qubit_indices[1]]))
+        elif name == "cy":
+            ops.append(
+                cirq.ControlledGate(cirq.Y).on(
+                    qubits[qubit_indices[0]], qubits[qubit_indices[1]]
+                )
+            )
+        elif name == "cz":
+            ops.append(cirq.CZ(qubits[qubit_indices[0]], qubits[qubit_indices[1]]))
+        elif name == "swap":
+            ops.append(cirq.SWAP(qubits[qubit_indices[0]], qubits[qubit_indices[1]]))
+        elif name == "crz":
+            ops.append(
+                cirq.ControlledGate(cirq.rz(params[0])).on(
+                    qubits[qubit_indices[0]], qubits[qubit_indices[1]]
+                )
+            )
+    return cirq.Circuit(ops), qubits
+
+
+def prepare_cirq(num_qubits, gates):
+    circuit, qubits = _build_cirq_circuit(num_qubits, gates)
+    import cirq
+    sim = cirq.Simulator(dtype=np.complex128)
+    return circuit, sim, qubits
+
+
+def bench_cirq(circuit, sim, qubits):
+    def run():
+        result = sim.simulate(circuit, qubit_order=qubits)
+        return result.final_state_vector
+    return _median_time(run)
+
+
+# -- PennyLane Lightning -------------------------------------------------------
+
+def _build_pennylane_fn(num_qubits, gates):
+    import pennylane as qml
+
+    dev = qml.device("lightning.qubit", wires=num_qubits)
+
+    gate_list = list(gates)  # capture for closure
+
+    @qml.qnode(dev)
+    def circuit():
+        for name, qubit_indices, params in gate_list:
+            if name == "h":
+                qml.Hadamard(qubit_indices[0])
+            elif name == "x":
+                qml.PauliX(qubit_indices[0])
+            elif name == "y":
+                qml.PauliY(qubit_indices[0])
+            elif name == "z":
+                qml.PauliZ(qubit_indices[0])
+            elif name == "s":
+                qml.S(qubit_indices[0])
+            elif name == "t":
+                qml.T(qubit_indices[0])
+            elif name == "rx":
+                qml.RX(params[0], qubit_indices[0])
+            elif name == "ry":
+                qml.RY(params[0], qubit_indices[0])
+            elif name == "rz":
+                qml.RZ(params[0], qubit_indices[0])
+            elif name == "cx":
+                qml.CNOT([qubit_indices[0], qubit_indices[1]])
+            elif name == "cy":
+                qml.CY([qubit_indices[0], qubit_indices[1]])
+            elif name == "cz":
+                qml.CZ([qubit_indices[0], qubit_indices[1]])
+            elif name == "swap":
+                qml.SWAP([qubit_indices[0], qubit_indices[1]])
+            elif name == "crz":
+                qml.CRZ(params[0], [qubit_indices[0], qubit_indices[1]])
+        return qml.state()
+
+    return circuit
+
+
+def prepare_pennylane(num_qubits, gates):
+    circuit_fn = _build_pennylane_fn(num_qubits, gates)
+    return (circuit_fn,)
+
+
+def bench_pennylane(circuit_fn):
+    def run():
+        return np.asarray(circuit_fn())
+    return _median_time(run)
+
+
+# ---------------------------------------------------------------------------
+# Qubit ordering helpers
+# ---------------------------------------------------------------------------
+
+def _reverse_endian(sv, num_qubits):
+    """Convert big-endian statevector to little-endian (or vice versa)."""
+    return sv.reshape([2] * num_qubits).transpose(range(num_qubits - 1, -1, -1)).ravel()
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def run_benchmarks(circuit_name, configs, generator_fn):
+    """Run all simulators on a set of circuit configs."""
+    headers = [
+        "Qubits", "Depth",
+        "PyQASM (ms)", "Qiskit (ms)", "Cirq (ms)", "Lightning (ms)",
+        "Correct",
+    ]
+    rows = []
+
+    print(f"\n{'='*90}")
+    print(f"  {circuit_name} circuits  (median of {N_REPEATS} runs)")
+    print(f"{'='*90}")
+
+    for config in configs:
+        if len(config) == 2:
+            num_qubits, depth = config
+            nq, gates = generator_fn(num_qubits, depth)
+        else:
+            num_qubits = config[0]
+            depth = None
+            nq, gates = generator_fn(num_qubits)
+
+        depth_str = str(depth) if depth is not None else "QFT"
+
+        # --- Prepare all simulators ---
+        pyqasm_args = prepare_pyqasm(nq, gates)
+        qiskit_args = prepare_qiskit(nq, gates)
+        cirq_args = prepare_cirq(nq, gates)
+        pl_args = prepare_pennylane(nq, gates)
+
+        # --- Benchmark ---
+        t_pyqasm, sv_pyqasm = bench_pyqasm(*pyqasm_args)
+        t_qiskit, sv_qiskit = bench_qiskit(*qiskit_args)
+        t_cirq, sv_cirq = bench_cirq(*cirq_args)
+        t_pl, sv_pl = bench_pennylane(*pl_args)
+
+        # --- Correctness: compare all against PyQASM (little-endian) ---
+        # Qiskit Aer is little-endian like PyQASM
+        # Cirq, PennyLane are big-endian → convert
+        sv_cirq_le = _reverse_endian(sv_cirq, nq)
+        sv_pl_le = _reverse_endian(sv_pl, nq)
+
+        checks = {
+            "qiskit": np.allclose(sv_pyqasm, sv_qiskit, atol=1e-10),
+            "cirq": np.allclose(sv_pyqasm, sv_cirq_le, atol=1e-10),
+            "lightning": np.allclose(sv_pyqasm, sv_pl_le, atol=1e-10),
+        }
+        all_pass = all(checks.values())
+        status = "PASS" if all_pass else "FAIL " + ",".join(
+            k for k, v in checks.items() if not v
+        )
+
+        rows.append([
+            nq, depth_str,
+            f"{t_pyqasm*1000:.2f}",
+            f"{t_qiskit*1000:.2f}",
+            f"{t_cirq*1000:.2f}",
+            f"{t_pl*1000:.2f}",
+            status,
+        ])
+
+        print(
+            f"  n={nq:2d}  depth={depth_str:>4s}  "
+            f"pyqasm={t_pyqasm*1000:8.2f}  "
+            f"qiskit={t_qiskit*1000:8.2f}  "
+            f"cirq={t_cirq*1000:8.2f}  "
+            f"lightning={t_pl*1000:8.2f} ms  "
+            f"{status}"
+        )
+
+    print()
+    print(tabulate(rows, headers=headers, tablefmt="github"))
+    return rows
+
+
+def main():
+    random_configs = [
+        (4, 20),
+        (6, 40),
+        (8, 60),
+        (10, 80),
+        (12, 100),
+        (14, 150),
+        (16, 200),
+        (18, 250),
+        (20, 300),
+        (22, 400),
+    ]
+
+    qft_configs = [
+        (4,),
+        (6,),
+        (8,),
+        (10,),
+        (12,),
+        (14,),
+        (16,),
+    ]
+
+    run_benchmarks("Random", random_configs, generate_random_gates)
+    run_benchmarks("QFT", qft_configs, generate_qft_gates)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/plot_benchmarks.py b/benchmarks/plot_benchmarks.py
new file mode 100644
index 00000000..8f83a299
--- /dev/null
+++ b/benchmarks/plot_benchmarks.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+# Copyright 2025 qBraid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generate benchmark plots comparing PyQASM against other simulators.
+
+Usage:
+    python benchmarks/plot_benchmarks.py
+"""
+
+import os
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Ensure we can import from the benchmarks directory
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from bench_simulator import (
+    N_REPEATS,
+    bench_cirq,
+    bench_pennylane,
+    bench_pyqasm,
+    bench_qiskit,
+    generate_qft_gates,
+    generate_random_gates,
+    prepare_cirq,
+    prepare_pennylane,
+    prepare_pyqasm,
+    prepare_qiskit,
+)
+
+RANDOM_CONFIGS = [
+    (4, 20),
+    (6, 40),
+    (8, 60),
+    (10, 80),
+    (12, 100),
+    (14, 150),
+    (16, 200),
+    (18, 250),
+    (20, 300),
+    (22, 400),
+]
+
+QFT_CONFIGS = [4, 6, 8, 10, 12, 14, 16, 18, 20]
+
+SIMULATORS = {
+    "PyQASM": (prepare_pyqasm, bench_pyqasm),
+    "Qiskit Aer": (prepare_qiskit, bench_qiskit),
+    "Cirq": (prepare_cirq, bench_cirq),
+    "PennyLane Lightning": (prepare_pennylane, bench_pennylane),
+}
+
+COLORS = {
+    "PyQASM": "#2563eb",
+    "Qiskit Aer": "#16a34a",
+    "Cirq": "#eab308",
+    "PennyLane Lightning": "#dc2626",
+}
+
+MARKERS = {
+    "PyQASM": "o",
+    "Qiskit Aer": "s",
+    "Cirq": "D",
+    "PennyLane Lightning": "^",
+}
+
+
+def collect_times(circuit_type, configs):
+    """Run benchmarks and return {simulator_name: ([qubits], [times_ms])}."""
+    results = {name: ([], []) for name in SIMULATORS}
+
+    for config in configs:
+        if circuit_type == "random":
+            nq, depth = config
+            num_qubits, gates = generate_random_gates(nq, depth)
+            label = f"n={nq}, d={depth}"
+        else:
+            nq = config
+            num_qubits, gates = generate_qft_gates(nq)
+            label = f"QFT n={nq}"
+
+        print(f"  {label} ...", end="", flush=True)
+        for name, (prepare_fn, bench_fn) in SIMULATORS.items():
+            args = prepare_fn(num_qubits, gates)
+            t, _ = bench_fn(*args)
+            results[name][0].append(num_qubits)
+            results[name][1].append(t * 1000)  # ms
+            print(f"  {name}={t*1000:.1f}ms", end="", flush=True)
+        print()
+
+    return results
+
+
+def plot_comparison(results, title, filename):
+    """Create a log-scale line plot comparing simulators."""
+    fig, ax = plt.subplots(figsize=(10, 6))
+
+    for name in SIMULATORS:
+        qubits, times = results[name]
+        ax.plot(
+            qubits, times,
+            marker=MARKERS[name],
+            color=COLORS[name],
+            linewidth=2,
+            markersize=7,
+            label=name,
+        )
+
+    ax.set_yscale("log")
+    ax.set_xlabel("Number of Qubits", fontsize=13)
+    ax.set_ylabel("Simulation Time (ms)", fontsize=13)
+    ax.set_title(title, fontsize=15)
+    ax.legend(fontsize=11)
+    ax.grid(True, which="both", ls="--", alpha=0.4)
+    ax.set_xticks(results["PyQASM"][0])
+    fig.tight_layout()
+    out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
+    fig.savefig(out_path, dpi=150)
+    print(f"  Saved: {out_path}")
+    plt.close(fig)
+
+
+def main():
+    print(f"Running benchmarks (median of {N_REPEATS} runs each)\n")
+
+    print("Random circuits:")
+    random_results = collect_times("random", RANDOM_CONFIGS)
+    plot_comparison(
+        random_results,
+        "Statevector Simulation Time — Random Circuits",
+        "bench_random.png",
+    )
+
+    print("\nQFT circuits:")
+    qft_results = collect_times("qft", QFT_CONFIGS)
+    plot_comparison(
+        qft_results,
+        "Statevector Simulation Time — QFT Circuits",
+        "bench_qft.png",
+    )
+
+    print("\nDone.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/profile_simulator.py b/benchmarks/profile_simulator.py
new file mode 100644
index 00000000..2bbb1a91
--- /dev/null
+++ b/benchmarks/profile_simulator.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python3
+# Copyright 2025 qBraid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Profiling script for PyQASM statevector simulator.
+
+Three modes:
+  1. cProfile function-level profiling
+  2. Section timing (preprocess vs simulation vs post-processing)
+  3. Per-kernel timing (accumulate time per opcode type)
+
+Usage:
+  python benchmarks/profile_simulator.py [--mode cprofile|section|kernel] [--qubits N] [--depth D]
+"""
+
+import argparse
+import cProfile
+import pstats
+import random
+import time
+from io import StringIO
+
+import numpy as np
+
+from pyqasm import loads as pyqasm_loads
+from pyqasm.simulator.statevector import (
+    Simulator,
+    _OP_CONTROLLED,
+    _OP_CTRL_DIAGONAL,
+    _OP_DIAGONAL,
+    _OP_SINGLE,
+    _OP_TWO_QUBIT,
+    _fuse_gates,
+    _preprocess,
+)
+from pyqasm.accelerate.sv_sim import (
+    apply_circuit,
+    apply_controlled_diagonal_gate,
+    apply_controlled_gate,
+    apply_diagonal_gate,
+    apply_single_qubit_gate,
+    apply_two_qubit_gate,
+)
+
+
+SINGLE_QUBIT_GATES = ["h", "x", "y", "z", "s", "t", "rx(1.0)", "ry(0.5)", "rz(0.3)"]
+TWO_QUBIT_GATES = ["cx", "cy", "cz", "swap"]
+
+OP_NAMES = {
+    _OP_SINGLE: "single",
+    _OP_CONTROLLED: "controlled",
+    _OP_DIAGONAL: "diagonal",
+    _OP_CTRL_DIAGONAL: "ctrl_diag",
+    _OP_TWO_QUBIT: "two_qubit",
+}
+
+
+def generate_random_qasm(num_qubits: int, depth: int, seed: int = 42) -> str:
+    rng = random.Random(seed)
+    lines = [
+        "OPENQASM 3;",
+        'include "stdgates.inc";',
+        f"qubit[{num_qubits}] q;",
+    ]
+    for _ in range(depth):
+        if num_qubits >= 2 and rng.random() < 0.4:
+            gate = rng.choice(TWO_QUBIT_GATES)
+            q0, q1 = rng.sample(range(num_qubits), 2)
+            lines.append(f"{gate} q[{q0}], q[{q1}];")
+        else:
+            gate = rng.choice(SINGLE_QUBIT_GATES)
+            q = rng.randint(0, num_qubits - 1)
+            lines.append(f"{gate} q[{q}];")
+    return "\n".join(lines)
+
+
+def mode_cprofile(num_qubits, depth):
+    """Function-level profiling with cProfile."""
+    qasm = generate_random_qasm(num_qubits, depth)
+    sim = Simulator(seed=0)
+    module = pyqasm_loads(qasm)
+
+    print(f"\n=== cProfile: {num_qubits} qubits, depth {depth} ===\n")
+    pr = cProfile.Profile()
+    pr.enable()
+    sim.run(module, shots=0)
+    pr.disable()
+
+    s = StringIO()
+    ps = pstats.Stats(pr, stream=s).sort_stats("tottime")
+    ps.print_stats(30)
+    print(s.getvalue())
+
+
+def mode_section(num_qubits, depth):
+    """Section timing: preprocess vs simulation vs post-processing."""
+    qasm = generate_random_qasm(num_qubits, depth)
+    module = pyqasm_loads(qasm)
+    module.unroll()
+    module.remove_idle_qubits()
+    nq = module.num_qubits
+
+    print(f"\n=== Section timing: {num_qubits} qubits, depth {depth} ===\n")
+
+    # Preprocess
+    t0 = time.perf_counter_ns()
+    n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = \
+        _preprocess(module, nq)
+    t1 = time.perf_counter_ns()
+
+    # Fusion
+    if n > 0:
+        n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = \
+            _fuse_gates(n, opcodes, targets, controls, gate_params, diag_phases,
+                        tq_offsets, tq_gates)
+    t2 = time.perf_counter_ns()
+
+    # Simulation
+    sv = np.zeros(2**nq, dtype=np.complex128)
+    sv[0] = 1.0
+    if n > 0:
+        apply_circuit(sv, nq, opcodes, targets, controls, gate_params, diag_phases,
+                      tq_offsets, tq_gates, n)
+    t3 = time.perf_counter_ns()
+
+    # Post-processing
+    probabilities = np.abs(sv) ** 2
+    t4 = time.perf_counter_ns()
+
+    pre_us = (t1 - t0) / 1000
+    fuse_us = (t2 - t1) / 1000
+    sim_us = (t3 - t2) / 1000
+    post_us = (t4 - t3) / 1000
+    total_us = (t4 - t0) / 1000
+
+    print(f"  Instructions:   {n}")
+    print(f"  Preprocess:     {pre_us:10.1f} us  ({pre_us/total_us*100:5.1f}%)")
+    print(f"  Gate fusion:    {fuse_us:10.1f} us  ({fuse_us/total_us*100:5.1f}%)")
+    print(f"  Simulation:     {sim_us:10.1f} us  ({sim_us/total_us*100:5.1f}%)")
+    print(f"  Post-process:   {post_us:10.1f} us  ({post_us/total_us*100:5.1f}%)")
+    print(f"  Total:          {total_us:10.1f} us")
+
+
+def mode_kernel(num_qubits, depth):
+    """Per-kernel timing: accumulate time per opcode type."""
+    qasm = generate_random_qasm(num_qubits, depth)
+    module = pyqasm_loads(qasm)
+    module.unroll()
+    module.remove_idle_qubits()
+    nq = module.num_qubits
+
+    n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = \
+        _preprocess(module, nq)
+    if n > 0:
+        n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = \
+            _fuse_gates(n, opcodes, targets, controls, gate_params, diag_phases,
+                        tq_offsets, tq_gates)
+
+    sv = np.zeros(2**nq, dtype=np.complex128)
+    sv[0] = 1.0
+
+    kernel_times = {}
+    kernel_counts = {}
+
+    print(f"\n=== Per-kernel timing: {num_qubits} qubits, depth {depth} ===\n")
+
+    for i in range(n):
+        op = int(opcodes[i])
+        tgt = int(targets[i])
+        ctrl = int(controls[i])
+        gp_off = i * 4
+        dp_off = i * 2
+
+        t0 = time.perf_counter_ns()
+        if op == _OP_SINGLE:
+            flat = gate_params[gp_off:gp_off + 4].copy()
+            apply_single_qubit_gate(sv, nq, tgt, flat)
+        elif op == _OP_CONTROLLED:
+            flat = gate_params[gp_off:gp_off + 4].copy()
+            apply_controlled_gate(sv, nq, ctrl, tgt, flat)
+        elif op == _OP_DIAGONAL:
+            apply_diagonal_gate(sv, nq, tgt, diag_phases[dp_off], diag_phases[dp_off + 1])
+        elif op == _OP_CTRL_DIAGONAL:
+            apply_controlled_diagonal_gate(sv, nq, ctrl, tgt, diag_phases[dp_off])
+        elif op == _OP_TWO_QUBIT:
+            tq_off = int(tq_offsets[i])
+            flat = tq_gates[tq_off:tq_off + 16].copy()
+            apply_two_qubit_gate(sv, nq, ctrl, tgt, flat)
+        t1 = time.perf_counter_ns()
+
+        name = OP_NAMES.get(op, f"op_{op}")
+        kernel_times[name] = kernel_times.get(name, 0) + (t1 - t0)
+        kernel_counts[name] = kernel_counts.get(name, 0) + 1
+
+    total_ns = sum(kernel_times.values())
+    for name in sorted(kernel_times.keys()):
+        t_us = kernel_times[name] / 1000
+        cnt = kernel_counts[name]
+        pct = kernel_times[name] / total_ns * 100 if total_ns > 0 else 0
+        print(f"  {name:15s}: {t_us:10.1f} us  ({pct:5.1f}%)  count={cnt}")
+    print(f"  {'total':15s}: {total_ns/1000:10.1f} us")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Profile PyQASM simulator")
+    parser.add_argument("--mode", choices=["cprofile", "section", "kernel"], default="section")
+    parser.add_argument("--qubits", type=int, default=16)
+    parser.add_argument("--depth", type=int, default=200)
+    args = parser.parse_args()
+
+    if args.mode == "cprofile":
+        mode_cprofile(args.qubits, args.depth)
+    elif args.mode == "section":
+        mode_section(args.qubits, args.depth)
+    elif args.mode == "kernel":
+        mode_kernel(args.qubits, args.depth)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index a70e0c38..e9a3303b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,9 +42,20 @@ Homepage = "https://docs.qbraid.com/v2/pyqasm"
 
 [project.optional-dependencies]
 cli = ["typer>=0.12.1", "rich>=10.11.0", "typing-extensions"]
-test = ["pytest", "pytest-cov", "pytest-mpl", "pillow<12.3.0", "matplotlib", "tabulate"]
+test = [
+    "pytest",
+    "pytest-cov",
+    "pytest-mpl",
+    "pillow<12.3.0",
+    "matplotlib",
+    "tabulate",
+    "qiskit",
+    "qiskit-aer",
+    "qiskit-qasm3-import",
+]
 lint = ["black", "isort>=6.0.0", "pylint", "mypy", "qbraid-cli>=0.10.2"]
 docs = ["sphinx>=7.3.7,<8.3.0", "sphinx-autodoc-typehints>=1.24,<3.2", "sphinx-rtd-theme>=2.0.0,<4.0.0", "docutils<0.23", "sphinx-copybutton"]
+simulation = ["numba>=0.59"]
 visualization = ["pillow<12.3.0", "matplotlib", "tabulate"]
 pulse = ["openpulse>=1.0.1"]
 
@@ -53,9 +64,6 @@ pyqasm = ["py.typed", "*.pyx"]
 
 [tool.setuptools]
 packages = {find = {where = ["src"]}}
-ext-modules = [
-  {name = "pyqasm.accelerate.linalg", sources = ["src/pyqasm/accelerate/linalg.pyx"], include-dirs = ["numpy"]}
-]
 
 [project.scripts]
 pyqasm = "pyqasm.cli.main:app"
@@ -75,6 +83,9 @@ line_length = 100
 [tool.pytest.ini_options]
 addopts = "-ra"
 testpaths = ["tests"]
+markers = [
+    "benchmark: performance regression tests (may be slow)",
+]
 
 [tool.coverage.run]
 source = ["pyqasm"]
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..f8621a39
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,84 @@
+"""Build configuration for Cython extensions with optional OpenMP support."""
+
+import os
+import platform
+import subprocess
+import sys
+import tempfile
+
+import numpy as np
+from Cython.Build import cythonize
+from setuptools import Extension, setup
+
+BASE_COMPILE_ARGS = ["-O3", "-ffast-math", "-march=native"]
+BASE_LINK_ARGS = []
+
+
+def _check_openmp():
+    """Detect OpenMP availability and return (compile_args, link_args)."""
+    test_code = b"#include <omp.h>\nint main() { return omp_get_max_threads(); }\n"
+
+    candidates = []
+    if sys.platform == "darwin":
+        # macOS: try libomp from Homebrew
+        for prefix in ["/opt/homebrew/opt/libomp", "/usr/local/opt/libomp"]:
+            if os.path.isdir(prefix):
+                candidates.append((
+                    ["-Xpreprocessor", "-fopenmp", f"-I{prefix}/include"],
+                    [f"-L{prefix}/lib", "-lomp"],
+                ))
+        # Also try Xcode / system clang with -fopenmp
+        candidates.append((["-fopenmp"], ["-fopenmp"]))
+    else:
+        # Linux / other: standard -fopenmp
+        candidates.append((["-fopenmp"], ["-fopenmp"]))
+
+    for cflags, ldflags in candidates:
+        try:
+            with tempfile.NamedTemporaryFile(suffix=".c", delete=False) as f:
+                f.write(test_code)
+                f.flush()
+                cmd = ["cc"] + cflags + ldflags + [f.name, "-o", f.name + ".out"]
+                subprocess.check_call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+                os.unlink(f.name + ".out")
+                os.unlink(f.name)
+            return cflags, ldflags
+        except (subprocess.CalledProcessError, FileNotFoundError, OSError):
+            try:
+                os.unlink(f.name)
+            except OSError:
+                pass
+            continue
+
+    return [], []
+
+
+omp_cflags, omp_ldflags = _check_openmp()
+if omp_cflags:
+    print(f"OpenMP detected: compile={omp_cflags}, link={omp_ldflags}")
+else:
+    print("OpenMP not available — building single-threaded kernels")
+
+sv_sim_compile_args = BASE_COMPILE_ARGS + omp_cflags
+sv_sim_link_args = BASE_LINK_ARGS + omp_ldflags
+
+extensions = [
+    Extension(
+        "pyqasm.accelerate.linalg",
+        sources=["src/pyqasm/accelerate/linalg.pyx"],
+        include_dirs=[np.get_include()],
+        extra_compile_args=BASE_COMPILE_ARGS,
+        extra_link_args=BASE_LINK_ARGS,
+    ),
+    Extension(
+        "pyqasm.accelerate.sv_sim",
+        sources=["src/pyqasm/accelerate/sv_sim.pyx"],
+        include_dirs=[np.get_include()],
+        extra_compile_args=sv_sim_compile_args,
+        extra_link_args=sv_sim_link_args,
+    ),
+]
+
+setup(
+    ext_modules=cythonize(extensions, language_level=3),
+)
diff --git a/src/pyqasm/accelerate/sv_sim.pyx b/src/pyqasm/accelerate/sv_sim.pyx
new file mode 100644
index 00000000..2ce338ad
--- /dev/null
+++ b/src/pyqasm/accelerate/sv_sim.pyx
@@ -0,0 +1,407 @@
+# Copyright 2025 qBraid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# cython: language_level=3
+# cython: infer_types=True
+
+import os
+
+import cython
+import numpy as np
+
+from cython.parallel cimport prange
+from openmp cimport omp_get_max_threads
+
+# Threshold: parallelize when statevector has >= this many pairs
+# 2^17 = 131072 pairs corresponds to ~18 qubits (4 MB statevector)
+# Below this, OpenMP thread overhead dominates over parallel gains.
+DEF PARALLEL_THRESHOLD = 131072
+
+# Module-level flag: parallelism is opt-in via PYQASM_NUM_THREADS env var.
+# Default is serial (1 thread) to avoid conflicts with other OpenMP runtimes
+# (e.g. Qiskit Aer) in the same process. Set PYQASM_NUM_THREADS>1 to enable.
+cdef int _max_threads = int(os.environ.get("PYQASM_NUM_THREADS", "1"))
+if _max_threads > 1:
+    from openmp cimport omp_set_num_threads
+    omp_set_num_threads(_max_threads)
+else:
+    from openmp cimport omp_set_num_threads
+    omp_set_num_threads(1)
+
+cdef bint _use_threads() noexcept nogil:
+    """Check if multithreading is enabled."""
+    return omp_get_max_threads() > 1
+
+
+# --- cdef kernel functions ---
+# Each kernel manages its own GIL release: prange paths use prange's
+# built-in nogil, serial paths use explicit `with nogil:` blocks.
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cdef void _apply_single_qubit_gate(
+    double complex* sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t target,
+    double complex g00,
+    double complex g01,
+    double complex g10,
+    double complex g11,
+) noexcept:
+    cdef Py_ssize_t half = 1 << (num_qubits - 1)
+    cdef Py_ssize_t step = 1 << target
+    cdef Py_ssize_t s, i0, i1
+    cdef double complex a0, a1
+
+    if half >= PARALLEL_THRESHOLD and _use_threads():
+        for s in prange(half, schedule='static', nogil=True):
+            i0 = ((s >> target) << (target + 1)) | (s & (step - 1))
+            i1 = i0 | step
+            a0 = sv[i0]
+            a1 = sv[i1]
+            sv[i0] = g00 * a0 + g01 * a1
+            sv[i1] = g10 * a0 + g11 * a1
+    else:
+        with nogil:
+            for s in range(half):
+                i0 = ((s >> target) << (target + 1)) | (s & (step - 1))
+                i1 = i0 | step
+                a0 = sv[i0]
+                a1 = sv[i1]
+                sv[i0] = g00 * a0 + g01 * a1
+                sv[i1] = g10 * a0 + g11 * a1
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cdef void _apply_controlled_gate(
+    double complex* sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t control,
+    Py_ssize_t target,
+    double complex g00,
+    double complex g01,
+    double complex g10,
+    double complex g11,
+) noexcept:
+    cdef Py_ssize_t quarter = 1 << (num_qubits - 2)
+    cdef Py_ssize_t lo, hi
+    cdef Py_ssize_t lo_mask, hi_limit
+    cdef Py_ssize_t s, base, i0, i1
+    cdef double complex a0, a1
+    cdef Py_ssize_t ctrl_bit = 1 << control
+    cdef Py_ssize_t tgt_bit = 1 << target
+
+    if control < target:
+        lo = control
+        hi = target
+    else:
+        lo = target
+        hi = control
+
+    lo_mask = (1 << lo) - 1
+    hi_limit = (1 << hi) - 1
+
+    if quarter >= PARALLEL_THRESHOLD and _use_threads():
+        for s in prange(quarter, schedule='static', nogil=True):
+            base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1))
+            base = base | ctrl_bit
+            i0 = base
+            i1 = base | tgt_bit
+            a0 = sv[i0]
+            a1 = sv[i1]
+            sv[i0] = g00 * a0 + g01 * a1
+            sv[i1] = g10 * a0 + g11 * a1
+    else:
+        with nogil:
+            for s in range(quarter):
+                base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1))
+                base = base | ctrl_bit
+                i0 = base
+                i1 = base | tgt_bit
+                a0 = sv[i0]
+                a1 = sv[i1]
+                sv[i0] = g00 * a0 + g01 * a1
+                sv[i1] = g10 * a0 + g11 * a1
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cdef void _apply_diagonal_gate(
+    double complex* sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t target,
+    double complex phase0,
+    double complex phase1,
+) noexcept:
+    cdef Py_ssize_t half = 1 << (num_qubits - 1)
+    cdef Py_ssize_t step = 1 << target
+    cdef Py_ssize_t s, i0, i1
+
+    if half >= PARALLEL_THRESHOLD and _use_threads():
+        for s in prange(half, schedule='static', nogil=True):
+            i0 = ((s >> target) << (target + 1)) | (s & (step - 1))
+            i1 = i0 | step
+            sv[i0] = phase0 * sv[i0]
+            sv[i1] = phase1 * sv[i1]
+    else:
+        with nogil:
+            for s in range(half):
+                i0 = ((s >> target) << (target + 1)) | (s & (step - 1))
+                i1 = i0 | step
+                sv[i0] = phase0 * sv[i0]
+                sv[i1] = phase1 * sv[i1]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cdef void _apply_controlled_diagonal_gate(
+    double complex* sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t control,
+    Py_ssize_t target,
+    double complex phase,
+) noexcept:
+    cdef Py_ssize_t quarter = 1 << (num_qubits - 2)
+    cdef Py_ssize_t lo, hi
+    cdef Py_ssize_t lo_mask, hi_limit
+    cdef Py_ssize_t s, base, idx
+    cdef Py_ssize_t ctrl_bit = 1 << control
+    cdef Py_ssize_t tgt_bit = 1 << target
+
+    if control < target:
+        lo = control
+        hi = target
+    else:
+        lo = target
+        hi = control
+
+    lo_mask = (1 << lo) - 1
+    hi_limit = (1 << hi) - 1
+
+    if quarter >= PARALLEL_THRESHOLD and _use_threads():
+        for s in prange(quarter, schedule='static', nogil=True):
+            base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1))
+            idx = base | ctrl_bit | tgt_bit
+            sv[idx] = phase * sv[idx]
+    else:
+        with nogil:
+            for s in range(quarter):
+                base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1))
+                idx = base | ctrl_bit | tgt_bit
+                sv[idx] = phase * sv[idx]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cdef void _apply_two_qubit_gate(
+    double complex* sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t qubit0,
+    Py_ssize_t qubit1,
+    double complex* gate,
+) noexcept:
+    cdef Py_ssize_t quarter = 1 << (num_qubits - 2)
+    cdef Py_ssize_t lo, hi
+    cdef Py_ssize_t lo_mask, hi_limit
+    cdef Py_ssize_t s, base, i00, i01, i10, i11
+    cdef double complex a00, a01, a10, a11
+    cdef Py_ssize_t q0_bit = 1 << qubit0
+    cdef Py_ssize_t q1_bit = 1 << qubit1
+    cdef double complex g0 = gate[0], g1 = gate[1], g2 = gate[2], g3 = gate[3]
+    cdef double complex g4 = gate[4], g5 = gate[5], g6 = gate[6], g7 = gate[7]
+    cdef double complex g8 = gate[8], g9 = gate[9], g10 = gate[10], g11_ = gate[11]
+    cdef double complex g12 = gate[12], g13 = gate[13], g14 = gate[14], g15 = gate[15]
+
+    if qubit0 < qubit1:
+        lo = qubit0
+        hi = qubit1
+    else:
+        lo = qubit1
+        hi = qubit0
+
+    lo_mask = (1 << lo) - 1
+    hi_limit = (1 << hi) - 1
+
+    if quarter >= PARALLEL_THRESHOLD and _use_threads():
+        for s in prange(quarter, schedule='static', nogil=True):
+            base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1))
+            i00 = base
+            i01 = base | q0_bit
+            i10 = base | q1_bit
+            i11 = base | q0_bit | q1_bit
+            a00 = sv[i00]
+            a01 = sv[i01]
+            a10 = sv[i10]
+            a11 = sv[i11]
+            sv[i00] = g0 * a00 + g1 * a01 + g2 * a10 + g3 * a11
+            sv[i01] = g4 * a00 + g5 * a01 + g6 * a10 + g7 * a11
+            sv[i10] = g8 * a00 + g9 * a01 + g10 * a10 + g11_ * a11
+            sv[i11] = g12 * a00 + g13 * a01 + g14 * a10 + g15 * a11
+    else:
+        with nogil:
+            for s in range(quarter):
+                base = (s & lo_mask) | (((s >> lo) << (lo + 1)) & hi_limit) | ((s >> (hi - 1)) << (hi + 1))
+                i00 = base
+                i01 = base | q0_bit
+                i10 = base | q1_bit
+                i11 = base | q0_bit | q1_bit
+                a00 = sv[i00]
+                a01 = sv[i01]
+                a10 = sv[i10]
+                a11 = sv[i11]
+                sv[i00] = g0 * a00 + g1 * a01 + g2 * a10 + g3 * a11
+                sv[i01] = g4 * a00 + g5 * a01 + g6 * a10 + g7 * a11
+                sv[i10] = g8 * a00 + g9 * a01 + g10 * a10 + g11_ * a11
+                sv[i11] = g12 * a00 + g13 * a01 + g14 * a10 + g15 * a11
+
+
+# --- cpdef wrappers for Python API ---
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef void apply_single_qubit_gate(
+    double complex[::1] sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t target,
+    double complex[::1] gate,
+) noexcept:
+    cdef double complex* sv_ptr = &sv[0]
+    _apply_single_qubit_gate(sv_ptr, num_qubits, target, gate[0], gate[1], gate[2], gate[3])
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef void apply_controlled_gate(
+    double complex[::1] sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t control,
+    Py_ssize_t target,
+    double complex[::1] gate,
+) noexcept:
+    cdef double complex* sv_ptr = &sv[0]
+    _apply_controlled_gate(sv_ptr, num_qubits, control, target, gate[0], gate[1], gate[2], gate[3])
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef void apply_diagonal_gate(
+    double complex[::1] sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t target,
+    double complex phase0,
+    double complex phase1,
+) noexcept:
+    cdef double complex* sv_ptr = &sv[0]
+    _apply_diagonal_gate(sv_ptr, num_qubits, target, phase0, phase1)
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef void apply_controlled_diagonal_gate(
+    double complex[::1] sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t control,
+    Py_ssize_t target,
+    double complex phase,
+) noexcept:
+    cdef double complex* sv_ptr = &sv[0]
+    _apply_controlled_diagonal_gate(sv_ptr, num_qubits, control, target, phase)
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef void apply_two_qubit_gate(
+    double complex[::1] sv,
+    Py_ssize_t num_qubits,
+    Py_ssize_t qubit0,
+    Py_ssize_t qubit1,
+    double complex[::1] gate,
+) noexcept:
+    cdef double complex* sv_ptr = &sv[0]
+    cdef double complex* gate_ptr = &gate[0]
+    _apply_two_qubit_gate(sv_ptr, num_qubits, qubit0, qubit1, gate_ptr)
+
+
+# --- Batch dispatch: single Python->C crossing for entire circuit ---
+
+DEF OP_SINGLE = 0
+DEF OP_CONTROLLED = 1
+DEF OP_DIAGONAL = 2
+DEF OP_CTRL_DIAGONAL = 3
+DEF OP_TWO_QUBIT = 4
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cpdef void apply_circuit(
+    double complex[::1] sv,
+    Py_ssize_t num_qubits,
+    int[::1] opcodes,
+    int[::1] targets,
+    int[::1] controls,
+    double complex[::1] gate_params,
+    double complex[::1] diag_phases,
+    int[::1] two_qubit_offsets,
+    double complex[::1] two_qubit_gates,
+    Py_ssize_t n_instructions,
+) noexcept:
+    """Execute an entire circuit. Each gate releases the GIL internally."""
+    cdef Py_ssize_t i, gp_offset, dp_offset, tq_offset
+    cdef int op, tgt, ctrl
+    cdef double complex* sv_ptr = &sv[0]
+    cdef double complex* gp_ptr = &gate_params[0]
+    cdef double complex* dp_ptr = &diag_phases[0]
+    cdef double complex* tq_ptr = &two_qubit_gates[0]
+
+    for i in range(n_instructions):
+        op = opcodes[i]
+        tgt = targets[i]
+        ctrl = controls[i]
+        gp_offset = i * 4
+        dp_offset = i * 2
+
+        if op == OP_SINGLE:
+            _apply_single_qubit_gate(
+                sv_ptr, num_qubits, tgt,
+                gp_ptr[gp_offset], gp_ptr[gp_offset + 1],
+                gp_ptr[gp_offset + 2], gp_ptr[gp_offset + 3],
+            )
+        elif op == OP_CONTROLLED:
+            _apply_controlled_gate(
+                sv_ptr, num_qubits, ctrl, tgt,
+                gp_ptr[gp_offset], gp_ptr[gp_offset + 1],
+                gp_ptr[gp_offset + 2], gp_ptr[gp_offset + 3],
+            )
+        elif op == OP_DIAGONAL:
+            _apply_diagonal_gate(
+                sv_ptr, num_qubits, tgt,
+                dp_ptr[dp_offset], dp_ptr[dp_offset + 1],
+            )
+        elif op == OP_CTRL_DIAGONAL:
+            _apply_controlled_diagonal_gate(
+                sv_ptr, num_qubits, ctrl, tgt,
+                dp_ptr[dp_offset],
+            )
+        elif op == OP_TWO_QUBIT:
+            tq_offset = two_qubit_offsets[i]
+            _apply_two_qubit_gate(
+                sv_ptr, num_qubits, ctrl, tgt,
+                &tq_ptr[tq_offset],
+            )
diff --git a/src/pyqasm/modules/base.py b/src/pyqasm/modules/base.py
index 3b21330a..b5317ba2 100644
--- a/src/pyqasm/modules/base.py
+++ b/src/pyqasm/modules/base.py
@@ -607,6 +607,10 @@ def unroll(self, **kwargs):
         if not kwargs:
             kwargs = {}
 
+        # Cache: skip if already unrolled (no kwargs means default unroll)
+        if not kwargs and len(self._unrolled_ast.statements) > 0:
+            return
+
         try:
             self.num_qubits, self.num_clbits = 0, 0
             if ext_gates := kwargs.get("external_gates"):
diff --git a/src/pyqasm/simulator/__init__.py b/src/pyqasm/simulator/__init__.py
new file mode 100644
index 00000000..a1d56354
--- /dev/null
+++ b/src/pyqasm/simulator/__init__.py
@@ -0,0 +1,33 @@
+# Copyright 2026 qBraid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Module containing the PyQASM statevector simulator.
+
+.. currentmodule:: pyqasm.simulator
+
+Classes
+---------
+
+.. autosummary::
+   :toctree: ../stubs/
+
+   Simulator
+   SimulatorResult
+
+"""
+
+from .statevector import Simulator, SimulatorResult
+
+__all__ = ["Simulator", "SimulatorResult"]
diff --git a/src/pyqasm/simulator/statevector.py b/src/pyqasm/simulator/statevector.py
new file mode 100644
index 00000000..e9ea74d9
--- /dev/null
+++ b/src/pyqasm/simulator/statevector.py
@@ -0,0 +1,589 @@
+# Copyright 2025 qBraid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Statevector simulator for PyQASM.
+
+"""
+
+# pylint: disable=no-name-in-module,too-many-return-statements
+# pylint: disable=too-many-locals,too-many-branches,too-many-statements
+# pylint: disable=arguments-out-of-order,unused-argument
+
+from collections import Counter
+from dataclasses import dataclass
+
+import numpy as np
+from openqasm3.ast import (
+    BinaryExpression,
+    BinaryOperator,
+    FloatLiteral,
+    Identifier,
+    IntegerLiteral,
+    QuantumGate,
+    UnaryExpression,
+    UnaryOperator,
+)
+
+from pyqasm import loads
+from pyqasm.accelerate.sv_sim import apply_circuit  # type: ignore[import-not-found]
+from pyqasm.modules.base import QasmModule
+
+try:
+    import numba as nb  # type: ignore[import-not-found]
+except ImportError:  # pragma: no cover - performance-only optional dependency
+
+    class _NumbaCompat:
+        """Fallback decorator shim when numba is not installed."""
+
+        @staticmethod
+        def njit(*args, **kwargs):  # noqa: ARG004
+            if args and callable(args[0]):
+                return args[0]
+
+            def decorator(fn):
+                return fn
+
+            return decorator
+
+    nb = _NumbaCompat()
+
+
+_COMPLEX_ZERO = 0j
+_PI = np.pi
+_E = np.e
+_T_PHASE = np.exp(1j * _PI / 4)
+_TDG_PHASE = np.exp(-1j * _PI / 4)
+
+
+@nb.njit(cache=True, nogil=True, fastmath=True)
+def _rz_phases(theta: float) -> tuple[complex, complex]:
+    """Parameterized Rz gate as diagonal phases."""
+    half_theta = theta / 2
+    return np.exp(-1j * half_theta), np.exp(1j * half_theta)
+
+
+@nb.njit(cache=True, nogil=True, fastmath=True)
+def ry(theta: float) -> np.ndarray:
+    """Parameterized Ry gate."""
+    c, s = np.cos(theta / 2), np.sin(theta / 2)
+    mat = np.empty((2, 2), dtype=np.complex128)
+    mat[0, 0] = c
+    mat[0, 1] = -s
+    mat[1, 0] = s
+    mat[1, 1] = c
+    return mat
+
+
+@nb.njit(cache=True, nogil=True, fastmath=True)
+def rx(theta: float) -> np.ndarray:
+    """Parameterized Rx gate."""
+    c, s = np.cos(theta / 2), np.sin(theta / 2)
+    mat = np.empty((2, 2), dtype=np.complex128)
+    mat[0, 0] = c
+    mat[0, 1] = -1j * s
+    mat[1, 0] = -1j * s
+    mat[1, 1] = c
+    return mat
+
+
+def rz(theta: float) -> np.ndarray:
+    """Parameterized Rz gate."""
+    phase0, phase1 = _rz_phases(theta)
+    return _diag_to_matrix(phase0, phase1)
+
+
+@nb.njit(cache=True, nogil=True, fastmath=True)
+def u3(theta: float, phi: float, lam: float) -> np.ndarray:
+    """Parameterized U3 gate (generic single-qubit rotation)."""
+    c, s = np.cos(theta / 2), np.sin(theta / 2)
+    exp_phi = np.exp(1j * phi)
+    exp_lam = np.exp(1j * lam)
+    exp_philam = np.exp(1j * (phi + lam))
+    mat = np.empty((2, 2), dtype=np.complex128)
+    mat[0, 0] = c
+    mat[0, 1] = -exp_lam * s
+    mat[1, 0] = exp_phi * s
+    mat[1, 1] = exp_philam * c
+    return mat
+
+
+@nb.njit(cache=True, nogil=True, fastmath=True)
+def _diag_to_matrix(phase0: complex, phase1: complex) -> np.ndarray:
+    """Convert diagonal phases to a 2x2 matrix."""
+    mat = np.zeros((2, 2), dtype=np.complex128)
+    mat[0, 0] = phase0
+    mat[1, 1] = phase1
+    return mat
+
+
+@nb.njit(cache=True, nogil=True, fastmath=True)
+def _is_diagonal_matrix(mat: np.ndarray, tol: float = 1e-10) -> bool:
+    """Return whether a 2x2 matrix is diagonal within tolerance."""
+    return (np.abs(mat[0, 1]) < tol) and (np.abs(mat[1, 0]) < tol)
+
+
+@nb.njit(cache=True, nogil=True, fastmath=True)
+def _diagonal_phases_from_matrix(mat: np.ndarray) -> tuple[complex, complex]:
+    """Extract diagonal phases from a diagonal 2x2 matrix."""
+    return mat[0, 0], mat[1, 1]
+
+
+PARAMETERIZED_GATES = {
+    "rz": rz,
+    "ry": ry,
+    "rx": rx,
+    "u3": u3,
+}
+
+_CONST_X = np.array([[0, 1], [1, 0]], dtype=np.complex128)
+_CONST_Y = np.array([[0, -1j], [1j, 0]], dtype=np.complex128)
+_CONST_Z = np.array([[1, 0], [0, -1]], dtype=np.complex128)
+_CONST_H = np.array([[1, 1], [1, -1]], dtype=np.complex128) / np.sqrt(2)
+_CONST_ID = np.eye(2, dtype=np.complex128)
+_CONST_S = np.array([[1, 0], [0, 1j]], dtype=np.complex128)
+_CONST_T = np.array([[1, 0], [0, _T_PHASE]], dtype=np.complex128)
+_CONST_SDG = np.array([[1, 0], [0, -1j]], dtype=np.complex128)
+_CONST_TDG = np.array([[1, 0], [0, _TDG_PHASE]], dtype=np.complex128)
+_CONST_SWAP = np.array(
+    [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]], dtype=np.complex128
+)
+
+NON_PARAMETERIZED_GATES: dict[str, np.ndarray] = {
+    "x": _CONST_X,
+    "y": _CONST_Y,
+    "z": _CONST_Z,
+    "h": _CONST_H,
+    "id": _CONST_ID,
+    "s": _CONST_S,
+    "t": _CONST_T,
+    "sdg": _CONST_SDG,
+    "tdg": _CONST_TDG,
+    "swap": _CONST_SWAP,
+}
+
+# Fast-path single-qubit gate properties.  Diagonal gates are kept as phase
+# pairs while fusing so they can flush directly to the diagonal Cython kernel
+# instead of materializing a 2x2 matrix.
+NON_PARAMETERIZED_GATE_PROPS: dict[str, tuple[bool, complex, complex, np.ndarray | None]] = {
+    "x": (False, _COMPLEX_ZERO, _COMPLEX_ZERO, _CONST_X),
+    "y": (False, _COMPLEX_ZERO, _COMPLEX_ZERO, _CONST_Y),
+    "z": (True, 1.0 + _COMPLEX_ZERO, -1.0 + _COMPLEX_ZERO, None),
+    "h": (False, _COMPLEX_ZERO, _COMPLEX_ZERO, _CONST_H),
+    "id": (True, 1.0 + _COMPLEX_ZERO, 1.0 + _COMPLEX_ZERO, None),
+    "s": (True, 1.0 + _COMPLEX_ZERO, 1j, None),
+    "t": (True, 1.0 + _COMPLEX_ZERO, _T_PHASE, None),
+    "sdg": (True, 1.0 + _COMPLEX_ZERO, -1j, None),
+    "tdg": (True, 1.0 + _COMPLEX_ZERO, _TDG_PHASE, None),
+}
+
+CONTROLLED_GATE_SUB_UNITARIES: dict[str, np.ndarray] = {
+    "cx": _CONST_X,
+    "cy": _CONST_Y,
+    "cz": _CONST_Z,
+}
+
+# Pre-flatten non-parameterized gates for Cython (contiguous 1D arrays)
+GATE_CACHE: dict[str, np.ndarray] = {}
+for _name, _mat in NON_PARAMETERIZED_GATES.items():
+    GATE_CACHE[_name] = np.ascontiguousarray(_mat.ravel())
+for _name, _mat in CONTROLLED_GATE_SUB_UNITARIES.items():
+    GATE_CACHE[_name] = np.ascontiguousarray(_mat.ravel())
+
+# Diagonal gate phases: gate_name -> (phase0, phase1) for diag(phase0, phase1)
+DIAGONAL_PHASES: dict[str, tuple[complex, complex]] = {
+    "z": (1.0, -1.0),
+    "s": (1.0, 1j),
+    "t": (1.0, _T_PHASE),
+    "sdg": (1.0, -1j),
+    "tdg": (1.0, _TDG_PHASE),
+    "id": (1.0, 1.0),
+}
+
+# Controlled diagonal phases: gate_name -> phase (applied when control=1 AND target=1)
+CONTROLLED_DIAGONAL_PHASES: dict[str, complex] = {
+    "cz": -1.0,
+}
+
+# Integer opcodes for preprocessed instructions
+_OP_SINGLE = 0
+_OP_CONTROLLED = 1
+_OP_DIAGONAL = 2
+_OP_CTRL_DIAGONAL = 3
+_OP_TWO_QUBIT = 4
+
+
+def _try_eval_expression(expr) -> float | None:
+    """Try to evaluate an AST expression to a float value."""
+    if isinstance(expr, (IntegerLiteral, FloatLiteral)):
+        return float(expr.value)
+    if isinstance(expr, Identifier):
+        if expr.name == "pi":
+            return _PI
+        if expr.name == "tau":
+            return 2 * _PI
+        if expr.name == "euler":
+            return _E
+        return None
+    if isinstance(expr, BinaryExpression):
+        lhs = _try_eval_expression(expr.lhs)
+        rhs = _try_eval_expression(expr.rhs)
+        if lhs is None or rhs is None:
+            return None
+        if expr.op is BinaryOperator["+"]:
+            return lhs + rhs
+        if expr.op is BinaryOperator["-"]:
+            return lhs - rhs
+        if expr.op is BinaryOperator["*"]:
+            return lhs * rhs
+        if expr.op is BinaryOperator["/"]:
+            return lhs / rhs
+        if expr.op is BinaryOperator["**"]:
+            return lhs**rhs
+        return None
+    if isinstance(expr, UnaryExpression):
+        operand = _try_eval_expression(expr.expression)
+        if operand is None:
+            return None
+        if expr.op is UnaryOperator["-"]:
+            return -operand
+        return None
+    return None
+
+
+def _extract_params(statement):
+    """Extract float parameters from a QuantumGate's arguments."""
+    params = []
+    for arg in statement.arguments:
+        val = _try_eval_expression(arg)
+        if val is not None:
+            params.append(val)
+    return params
+
+
+def _preprocess(program, num_qubits):
+    """Walk the AST once, producing packed numpy arrays with inline gate fusion.
+
+    This keeps the existing no-cache semantics: every call preprocesses its
+    input program.  The speedup comes from avoiding Python list growth,
+    preserving diagonal gates as phase pairs while fusing, and using optional
+    numba-compiled helpers for small matrix/phase constructors.
+    """
+    statements = (
+        program._unrolled_ast.statements
+        if len(program._unrolled_ast.statements) > 0
+        else program.original_program.statements
+    )
+
+    # Conservative bound: pending one-qubit flushes plus SWAP expansion can
+    # emit more instructions than source statements.  The bound is deliberately
+    # simple and over-allocates a little to avoid dynamic list growth.
+    max_instructions = max(1, len(statements) * 4 + num_qubits)
+    opcodes = np.empty(max_instructions, dtype=np.int32)
+    targets = np.empty(max_instructions, dtype=np.int32)
+    controls = np.empty(max_instructions, dtype=np.int32)
+    gate_params = np.empty(max_instructions * 4, dtype=np.complex128)
+    diag_phases = np.empty(max_instructions * 2, dtype=np.complex128)
+    two_qubit_offsets = np.empty(max_instructions, dtype=np.int32)
+    two_qubit_gates = np.empty(max_instructions * 16, dtype=np.complex128)
+
+    zero4 = np.zeros(4, dtype=np.complex128)
+    zero2 = np.zeros(2, dtype=np.complex128)
+    instruction_idx = 0
+    tq_offset = 0
+
+    # target -> (is_diagonal, phase0, phase1, matrix)
+    pending: dict[int, tuple[bool, complex, complex, np.ndarray | None]] = {}
+
+    def _write_single(target: int, mat: np.ndarray):
+        nonlocal instruction_idx
+        opcodes[instruction_idx] = _OP_SINGLE
+        targets[instruction_idx] = target
+        controls[instruction_idx] = -1
+        gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = mat.ravel()
+        diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = zero2
+        two_qubit_offsets[instruction_idx] = -1
+        instruction_idx += 1
+
+    def _write_diagonal(target: int, phase0: complex, phase1: complex):
+        nonlocal instruction_idx
+        opcodes[instruction_idx] = _OP_DIAGONAL
+        targets[instruction_idx] = target
+        controls[instruction_idx] = -1
+        gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = zero4
+        diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = (phase0, phase1)
+        two_qubit_offsets[instruction_idx] = -1
+        instruction_idx += 1
+
+    def _write_controlled(control: int, target: int, flat: np.ndarray):
+        nonlocal instruction_idx
+        opcodes[instruction_idx] = _OP_CONTROLLED
+        targets[instruction_idx] = target
+        controls[instruction_idx] = control
+        gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = flat
+        diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = zero2
+        two_qubit_offsets[instruction_idx] = -1
+        instruction_idx += 1
+
+    def _write_controlled_diagonal(control: int, target: int, phase: complex):
+        nonlocal instruction_idx
+        opcodes[instruction_idx] = _OP_CTRL_DIAGONAL
+        targets[instruction_idx] = target
+        controls[instruction_idx] = control
+        gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = zero4
+        diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = (phase, 0j)
+        two_qubit_offsets[instruction_idx] = -1
+        instruction_idx += 1
+
+    def _write_two_qubit(control: int, target: int, flat: np.ndarray):
+        nonlocal instruction_idx, tq_offset
+        opcodes[instruction_idx] = _OP_TWO_QUBIT
+        targets[instruction_idx] = target
+        controls[instruction_idx] = control
+        gate_params[instruction_idx * 4 : instruction_idx * 4 + 4] = zero4
+        diag_phases[instruction_idx * 2 : instruction_idx * 2 + 2] = zero2
+        two_qubit_offsets[instruction_idx] = tq_offset
+        two_qubit_gates[tq_offset : tq_offset + 16] = flat
+        tq_offset += 16
+        instruction_idx += 1
+
+    def _flush_pending(target: int):
+        if target not in pending:
+            return
+        is_diagonal, phase0, phase1, mat = pending.pop(target)
+
+        if is_diagonal and (np.abs(phase0 - 1.0) < 1e-10) and (np.abs(phase1 - 1.0) < 1e-10):
+            return
+        if not is_diagonal and mat is not None and _is_diagonal_matrix(mat):
+            is_diagonal = True
+            phase0, phase1 = _diagonal_phases_from_matrix(mat)
+            mat = None
+
+        if is_diagonal:
+            _write_diagonal(target, phase0, phase1)
+        else:
+            if mat is None:
+                mat = _diag_to_matrix(phase0, phase1)
+            _write_single(target, mat)
+
+    def _flush_all():
+        for target in list(pending.keys()):
+            _flush_pending(target)
+
+    def _accumulate(
+        target: int,
+        new_is_diagonal: bool,
+        new_phase0: complex,
+        new_phase1: complex,
+        new_mat: np.ndarray | None,
+    ):
+        if target not in pending:
+            pending[target] = (new_is_diagonal, new_phase0, new_phase1, new_mat)
+            return
+
+        current_is_diagonal, current_phase0, current_phase1, current_mat = pending[target]
+        if current_is_diagonal and new_is_diagonal:
+            pending[target] = (
+                True,
+                new_phase0 * current_phase0,
+                new_phase1 * current_phase1,
+                None,
+            )
+            return
+
+        current_matrix = (
+            _diag_to_matrix(current_phase0, current_phase1) if current_is_diagonal else current_mat
+        )
+        new_matrix = _diag_to_matrix(new_phase0, new_phase1) if new_is_diagonal else new_mat
+        if current_matrix is None or new_matrix is None:
+            raise ValueError("Invalid pending gate fusion state.")
+        fused = new_matrix @ current_matrix
+        if _is_diagonal_matrix(fused):
+            phase0, phase1 = _diagonal_phases_from_matrix(fused)
+            pending[target] = (True, phase0, phase1, None)
+        else:
+            pending[target] = (False, _COMPLEX_ZERO, _COMPLEX_ZERO, fused)
+
+    for statement in statements:
+        if not isinstance(statement, QuantumGate):
+            continue
+
+        gate_name = statement.name.name
+        params = _extract_params(statement)
+
+        if len(statement.qubits) == 1:
+            target = statement.qubits[0].indices[0][0].value
+
+            if gate_name in NON_PARAMETERIZED_GATE_PROPS:
+                is_diagonal, phase0, phase1, mat = NON_PARAMETERIZED_GATE_PROPS[gate_name]
+                _accumulate(target, is_diagonal, phase0, phase1, mat)
+            elif gate_name == "rz" and len(params) == 1:
+                phase0, phase1 = _rz_phases(params[0])
+                _accumulate(target, True, phase0, phase1, None)
+            elif gate_name in PARAMETERIZED_GATES:
+                gate_fn = PARAMETERIZED_GATES[gate_name]
+                required_params = 3 if gate_name == "u3" else 1
+                if len(params) != required_params:
+                    raise ValueError(f"Gate {gate_name} requires {required_params} parameter(s).")
+                mat = gate_fn(*params)
+                _accumulate(target, False, _COMPLEX_ZERO, _COMPLEX_ZERO, mat)
+            else:
+                raise ValueError(
+                    f"Gate '{gate_name}' not supported by simulator. "
+                    f"Call module.unroll() first."
+                )
+
+        elif len(statement.qubits) == 2:
+            target = statement.qubits[1].indices[0][0].value
+            control = statement.qubits[0].indices[0][0].value
+
+            # Flush pending gates only when they do not commute with the
+            # incoming two-qubit gate.  Diagonal one-qubit gates commute with
+            # diagonal two-qubit gates (CZ/CRZ), so they can remain fused until
+            # a later non-diagonal interaction or the final flush.
+            for qubit in (control, target):
+                if qubit not in pending:
+                    continue
+                is_diagonal, _, _, _ = pending[qubit]
+                if (not is_diagonal) or gate_name not in {"cz", "crz"}:
+                    _flush_pending(qubit)
+
+            if gate_name == "swap":
+                # Decompose SWAP into three controlled-X instructions.  This
+                # stays on the optimized controlled-gate kernel and avoids the
+                # generic 4x4 two-qubit path for this common gate.
+                flat = GATE_CACHE["cx"]
+                _write_controlled(control, target, flat)
+                _write_controlled(target, control, flat)
+                _write_controlled(control, target, flat)
+            elif gate_name in CONTROLLED_DIAGONAL_PHASES:
+                phase = CONTROLLED_DIAGONAL_PHASES[gate_name]
+                _write_controlled_diagonal(control, target, phase)
+            elif gate_name == "crz" and len(params) == 1:
+                theta = params[0]
+                _, phase_at_11 = _rz_phases(theta)
+                _write_controlled_diagonal(control, target, phase_at_11)
+            elif gate_name in CONTROLLED_GATE_SUB_UNITARIES:
+                flat = GATE_CACHE[gate_name]
+                _write_controlled(control, target, flat)
+            else:
+                raise ValueError(
+                    f"Gate '{gate_name}' not supported by simulator. "
+                    f"Call module.unroll() first."
+                )
+
+    # Flush remaining pending single-qubit gates
+    _flush_all()
+
+    n = instruction_idx
+    if n == 0:
+        return n, None, None, None, None, None, None, None
+
+    if tq_offset > 0:
+        two_qubit_gates_trimmed = np.ascontiguousarray(two_qubit_gates[:tq_offset])
+    else:
+        two_qubit_gates_trimmed = np.zeros(1, dtype=np.complex128)
+
+    return (
+        n,
+        opcodes[:n],
+        targets[:n],
+        controls[:n],
+        np.ascontiguousarray(gate_params[: n * 4]),
+        np.ascontiguousarray(diag_phases[: n * 2]),
+        two_qubit_offsets[:n],
+        two_qubit_gates_trimmed,
+    )
+
+
+@dataclass(frozen=True)
+class SimulatorResult:
+    """Class to store the result of a statevector simulation."""
+
+    probabilities: np.ndarray
+    measurement_counts: Counter[str]
+    final_statevector: np.ndarray
+
+
+class Simulator:
+    """
+    Statevector simulator
+
+    """
+
+    def __init__(self, seed: None | int = None):
+        """
+        Initialize the statevector simulator.
+
+        Parameters:
+            seed (None | int): A seed to initialize the `np.random.BitGenerator`.
+                If None, then fresh, unpredictable entropy will be pulled from
+                the OS using `np.random.SeedSequence`.
+        """
+        self._rng = np.random.default_rng(seed)
+
+    def run(self, program: QasmModule | str, shots: int = 1) -> SimulatorResult:
+        """Run the statevector simulator.
+
+        For QasmModule input, the module is used as-is. Call module.unroll()
+        and module.remove_idle_qubits() beforehand if needed.
+
+        For string input, loads/unrolls/removes idle qubits automatically.
+
+        Args:
+            program (QasmModule | str): The program to simulate.
+            shots (int): The number of shots to simulate. Defaults to 1.
+
+        Returns:
+            SimulatorResult: The result of the simulation.
+
+        Raises:
+            ValueError: If shots is less than 0, or if the program contains
+                an unsupported gate or invalid syntax.
+        """
+        if isinstance(program, str):
+            program = loads(program)
+            program.unroll()
+            program.remove_idle_qubits()
+
+        num_qubits = program.num_qubits
+        sv = np.zeros(2**num_qubits, dtype=np.complex128)
+        sv[0] = 1.0
+
+        n, opcodes, targets, controls, gate_params, diag_phases, tq_offsets, tq_gates = _preprocess(
+            program, num_qubits
+        )
+
+        if n > 0:
+            apply_circuit(
+                sv,
+                num_qubits,
+                opcodes,
+                targets,
+                controls,
+                gate_params,
+                diag_phases,
+                tq_offsets,
+                tq_gates,
+                n,
+            )
+
+        if shots < 0:
+            raise ValueError("Shots must be greater than or equal to 0.")
+
+        probabilities = np.abs(sv) ** 2
+        samples = self._rng.choice(len(probabilities), size=shots, p=probabilities)
+        counts = Counter(format(s, f"0{num_qubits}b")[::-1] for s in samples)
+
+        return SimulatorResult(probabilities, counts, sv)
diff --git a/tests/test_perf_regression.py b/tests/test_perf_regression.py
new file mode 100644
index 00000000..ab56caf7
--- /dev/null
+++ b/tests/test_perf_regression.py
@@ -0,0 +1,186 @@
+# Copyright 2025 qBraid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=redefined-outer-name
+
+"""
+Performance regression tests for the PyQASM statevector simulator.
+
+These tests enforce time budgets to catch performance regressions. Each test
+runs the simulator multiple times and checks that the median execution time
+stays within a generous upper bound (3x the baseline measured on Apple Silicon).
+
+Run with: pytest tests/test_perf_regression.py -v
+Skip in CI: pytest -m "not benchmark"
+
+Time budgets are intentionally loose (3x baseline) to avoid flakiness on
+different hardware. The goal is catching order-of-magnitude regressions,
+not enforcing tight timings.
+
+Baselines (Apple Silicon M-series, March 2026):
+  Random 16q/200d:   ~24 ms
+  Random 20q/300d:  ~330 ms
+  Random 22q/400d: ~1625 ms
+  QFT 10q:           ~27 ms
+  QFT 14q:           ~58 ms
+"""
+
+import random
+import time
+
+import numpy as np
+import pytest
+
+from pyqasm import loads as pyqasm_loads
+from pyqasm.simulator.statevector import Simulator
+
+SINGLE_QUBIT_GATES = ["h", "x", "y", "z", "s", "t", "rx(1.0)", "ry(0.5)", "rz(0.3)"]
+TWO_QUBIT_GATES = ["cx", "cy", "cz", "swap"]
+
+N_REPEATS = 5
+BUDGET_MULTIPLIER = 3  # allow 3x baseline for portability across hardware
+
+
+def generate_random_qasm(num_qubits: int, depth: int, seed: int = 42) -> str:
+    rng = random.Random(seed)
+    lines = [
+        "OPENQASM 3;",
+        'include "stdgates.inc";',
+        f"qubit[{num_qubits}] q;",
+    ]
+    for _ in range(depth):
+        if num_qubits >= 2 and rng.random() < 0.4:
+            gate = rng.choice(TWO_QUBIT_GATES)
+            q0, q1 = rng.sample(range(num_qubits), 2)
+            lines.append(f"{gate} q[{q0}], q[{q1}];")
+        else:
+            gate = rng.choice(SINGLE_QUBIT_GATES)
+            q = rng.randint(0, num_qubits - 1)
+            lines.append(f"{gate} q[{q}];")
+    return "\n".join(lines)
+
+
+def generate_qft_qasm(num_qubits: int) -> str:
+    lines = [
+        "OPENQASM 3;",
+        'include "stdgates.inc";',
+        f"qubit[{num_qubits}] q;",
+    ]
+    for i in range(num_qubits):
+        lines.append(f"h q[{i}];")
+        for j in range(i + 1, num_qubits):
+            k = j - i
+            angle = f"pi/{2**k}"
+            lines.append(f"crz({angle}) q[{j}], q[{i}];")
+    for i in range(num_qubits // 2):
+        lines.append(f"swap q[{i}], q[{num_qubits - 1 - i}];")
+    return "\n".join(lines)
+
+
+def median_sim_time(qasm: str, sim: Simulator, n_repeats: int = N_REPEATS) -> float:
+    """Return the median wall-clock time (seconds) of sim.run() over n_repeats."""
+    module = pyqasm_loads(qasm)
+    module.unroll()
+    module.remove_idle_qubits()
+    times = []
+    for _ in range(n_repeats):
+        start = time.perf_counter()
+        sim.run(module, shots=0)
+        times.append(time.perf_counter() - start)
+    return float(np.median(times))
+
+
+@pytest.fixture
+def sim():
+    return Simulator(seed=0)
+
+
+# ---------------------------------------------------------------------------
+# Random circuit performance tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize(
+    "num_qubits, depth, baseline_ms",
+    [
+        (16, 200, 24),
+        (20, 300, 330),
+    ],
+    ids=["random-16q", "random-20q"],
+)
+def test_random_circuit_perf(sim, num_qubits, depth, baseline_ms):
+    """Median simulation time must stay within budget for random circuits."""
+    qasm = generate_random_qasm(num_qubits, depth)
+    elapsed = median_sim_time(qasm, sim)
+    budget_ms = baseline_ms * BUDGET_MULTIPLIER
+    elapsed_ms = elapsed * 1000
+    assert elapsed_ms < budget_ms, (
+        f"Random {num_qubits}q/{depth}d: {elapsed_ms:.1f} ms exceeded "
+        f"budget {budget_ms:.0f} ms (baseline {baseline_ms} ms x{BUDGET_MULTIPLIER})"
+    )
+
+
+# ---------------------------------------------------------------------------
+# QFT circuit performance tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.benchmark
+@pytest.mark.parametrize(
+    "num_qubits, baseline_ms",
+    [
+        (10, 27),
+        (14, 58),
+    ],
+    ids=["qft-10q", "qft-14q"],
+)
+def test_qft_circuit_perf(sim, num_qubits, baseline_ms):
+    """Median simulation time must stay within budget for QFT circuits."""
+    qasm = generate_qft_qasm(num_qubits)
+    elapsed = median_sim_time(qasm, sim)
+    budget_ms = baseline_ms * BUDGET_MULTIPLIER
+    elapsed_ms = elapsed * 1000
+    assert elapsed_ms < budget_ms, (
+        f"QFT {num_qubits}q: {elapsed_ms:.1f} ms exceeded "
+        f"budget {budget_ms:.0f} ms (baseline {baseline_ms} ms x{BUDGET_MULTIPLIER})"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Scaling sanity check: ensure O(2^n) per gate, not worse
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.benchmark
+def test_scaling_not_superexponential(sim):
+    """Verify that doubling qubits roughly doubles per-gate time (not worse).
+
+    Compares 14q vs 16q random circuits with the same gate count. The ratio
+    of per-gate times should be near 4x (since state size quadruples with +2
+    qubits). We allow up to 8x to account for cache effects and noise.
+    """
+    depth = 150
+    qasm_14 = generate_random_qasm(14, depth)
+    qasm_16 = generate_random_qasm(16, depth)
+
+    t14 = median_sim_time(qasm_14, sim)
+    t16 = median_sim_time(qasm_16, sim)
+
+    ratio = t16 / t14
+    # +2 qubits => 4x state size => expect ~4x time. Allow up to 8x.
+    assert ratio < 8.0, (
+        f"Scaling ratio 16q/14q = {ratio:.1f}x, expected <8x "
+        f"(14q={t14*1000:.1f}ms, 16q={t16*1000:.1f}ms)"
+    )
diff --git a/tests/test_sv_sim.py b/tests/test_sv_sim.py
new file mode 100644
index 00000000..d9b92263
--- /dev/null
+++ b/tests/test_sv_sim.py
@@ -0,0 +1,350 @@
+# Copyright 2025 qBraid
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pylint: disable=redefined-outer-name
+
+"""
+Module containing tests for the PyQASM statevector simulator.
+
+"""
+
+import numpy as np
+import pytest
+from qiskit import transpile
+from qiskit.qasm3 import loads
+from qiskit_aer import AerSimulator
+
+from pyqasm.simulator.statevector import Simulator
+
+
+@pytest.fixture
+def aer_simulator():
+    """Fixture to create and return a Qiskit AerSimulator."""
+    return AerSimulator(method="statevector")
+
+
+@pytest.fixture
+def pyqasm_simulator():
+    """Fixture to create and return a pyqasm simulator."""
+    return Simulator(seed=22)
+
+
+@pytest.mark.parametrize(
+    "qasm, description",
+    [
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            h q[0];
+            """,
+            "Hadamard gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            rx(pi) q[0];
+            """,
+            "RX(pi) gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            ry(pi) q[0];
+            """,
+            "RY(pi) gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            rz(pi) q[0];
+            """,
+            "RZ(pi) gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            s q[0];
+            """,
+            "S gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            t q[0];
+            """,
+            "T gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            sdg q[0];
+            """,
+            "Sdg gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            tdg q[0];
+            """,
+            "Tdg gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            id q[0];
+            """,
+            "Identity gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            u3(1.0, 0.5, 0.3) q[0];
+            """,
+            "U3 gate (global phase)",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[1] q;
+            h q[0];
+            rx(pi) q[0];
+            ry(pi) q[0];
+            rz(pi) q[0];
+            s q[0];
+            t q[0];
+            sdg q[0];
+            tdg q[0];
+            id q[0];
+            """,
+            "Combined gates: H, RX(pi), RY(pi), RZ(pi), S, T, Sdg, Tdg, ID",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            cx q[0], q[1];
+            """,
+            "CX gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            x q[0];
+            x q[1];
+            cy q[0], q[1];
+            """,
+            "CY on |11>",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            x q[0];
+            x q[1];
+            cz q[0], q[1];
+            """,
+            "CZ on |11>",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            x q[1];
+            swap q[0], q[1];
+            """,
+            "SWAP on |01>",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[3] q;
+            x q[0];
+            id q[1];
+            id q[2];
+            """,
+            "X on qubit 0 of 3-qubit system",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            cy q[0], q[1];
+            """,
+            "CY gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            cz q[0], q[1];
+            """,
+            "CZ gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            swap q[0], q[1];
+            """,
+            "Swap gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            h q[0];
+            crz(pi/4) q[0], q[1];
+            """,
+            "CRZ gate",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            h q[0];
+            h q[1];
+            """,
+            "Two H Gates",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            h q[0];
+            cx q[0], q[1];
+            """,
+            "Bell state",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[3] q;
+            h q[0];
+            cx q[0], q[1];
+            cx q[1], q[2];
+            """,
+            "GHZ state",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            h q[0];
+            swap q[0], q[1];
+            cx q[0], q[1];
+            """,
+            "Bell state with swap",
+        ),
+        (
+            """
+            OPENQASM 3;
+            include "stdgates.inc";
+            qubit[2] q;
+            h q[0];
+            swap q[1], q[0];
+            cx q[0], q[1];
+            """,
+            "Bell state with swap rev qubits",
+        ),
+    ],
+)
+def test_simulator_sv_results(qasm, description, aer_simulator, pyqasm_simulator):
+    """Test PyQASM simulator by comparing SV results against the Qiskit Aer simulator."""
+    circuit = loads(qasm)
+    circuit.save_statevector()
+    compiled_circuit = transpile(circuit, aer_simulator, optimization_level=0)
+    result = aer_simulator.run(compiled_circuit).result()
+    sv_qiskit = result.get_statevector(compiled_circuit)
+    sv_expected = np.asarray(sv_qiskit)
+
+    result = pyqasm_simulator.run(qasm, shots=1000)
+
+    sv_actual = result.final_statevector
+    if "global phase" in description:
+        # Compare up to global phase: find first nonzero element and normalize
+        idx = np.argmax(np.abs(sv_expected) > 1e-10)
+        phase = sv_actual[idx] / sv_expected[idx]
+        sv_actual = sv_actual / phase
+
+    assert np.allclose(sv_actual, sv_expected), (
+        f"Test failed for: {description}\n"
+        f"PyQASM simulator statevector: {result.final_statevector}\n"
+        f"Expected statevector: {sv_expected}"
+    )
+
+
+def test_simulator_large_circuit(aer_simulator, pyqasm_simulator):
+    """Test PyQASM simulator on an 8-qubit circuit."""
+    qasm = """
+    OPENQASM 3;
+    include "stdgates.inc";
+    qubit[8] q;
+    h q[0];
+    cx q[0], q[1];
+    cx q[1], q[2];
+    cx q[2], q[3];
+    h q[4];
+    cx q[4], q[5];
+    cx q[5], q[6];
+    cx q[6], q[7];
+    swap q[3], q[4];
+    """
+    circuit = loads(qasm)
+    circuit.save_statevector()
+    compiled_circuit = transpile(circuit, aer_simulator, optimization_level=0)
+    result_qiskit = aer_simulator.run(compiled_circuit).result()
+    sv_expected = np.asarray(result_qiskit.get_statevector(compiled_circuit))
+
+    result = pyqasm_simulator.run(qasm, shots=0)
+
+    assert np.allclose(result.final_statevector, sv_expected), (
+        f"Test failed for 8-qubit circuit\n"
+        f"PyQASM: {result.final_statevector}\n"
+        f"Expected: {sv_expected}"
+    )