diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..2da9bf0
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,52 @@
+# mkl_umath ASV Benchmarks
+
+Performance benchmarks for [mkl_umath](https://github.com/IntelPython/mkl_umath) using [Airspeed Velocity (ASV)](https://asv.readthedocs.io/en/stable/).
+
+The `npbench/` suite uses kernels from [npbench](https://github.com/spcl/npbench) to measure end-to-end impact of MKL ufunc acceleration in realistic workloads.
+
+### Coverage
+
+| File | Ufuncs | Dtypes | Sizes/Presets |
+|------|--------|--------|---------------|
+| `micro/bench_micro.py` | 24 unary (`exp`, `log`, `sin`, `cos`, `sqrt`, `cbrt`, etc.) + `arctan2`, `power` | float32, float64 | 10k, 100k, 1M |
+| `npbench/bench_softmax.py` | `exp`, `max`, `sum` | float32 | M (32x8x256x256), L (64x16x448x448) |
+| `npbench/bench_arc_distance.py` | `sin`, `cos`, `arctan2`, `sqrt` | float64 | M (1M), L (10M) |
+| `npbench/bench_go_fast.py` | `tanh` | float64 | M (6k x 6k), L (20k x 20k) |
+| `npbench/bench_mandelbrot.py` | `abs`, `multiply`, `add` | complex128 | M (250/500), L (833/1000) |
+
+## Running Benchmarks
+
+Prerequisites:
+
+```bash
+pip install asv psutil
+```
+
+Run benchmarks against the current commit:
+
+```bash
+asv run --python=same --quick HEAD^!
+```
+
+Compare two commits:
+
+```bash
+asv continuous --python=same HEAD~1 HEAD
+```
+
+View results in a browser:
+
+```bash
+asv publish
+asv preview
+```
+
+## Threading
+
+Set `MKL_NUM_THREADS` to control the thread count used by MKL:
+
+```bash
+MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^!
+```
+
+If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation).
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
new file mode 100644
index 0000000..7848275
--- /dev/null
+++ b/benchmarks/asv.conf.json
@@ -0,0 +1,20 @@
+{
+    "version": 1,
+    "project": "mkl_umath",
+    "project_url": "https://github.com/IntelPython/mkl_umath",
+    "repo": "..",
+    "branches": [
+        "main"
+    ],
+    "environment_type": "existing",
+    "benchmark_dir": "benchmarks",
+    "env_dir": ".asv/env",
+    "results_dir": ".asv/results",
+    "html_dir": ".asv/html",
+    "show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/",
+    "build_cache_size": 2,
+    "default_benchmark_timeout": 1500,
+    "regressions_thresholds": {
+        ".*": 0.2
+    }
+}
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
new file mode 100644
index 0000000..665b2e1
--- /dev/null
+++ b/benchmarks/benchmarks/__init__.py
@@ -0,0 +1,26 @@
+"""ASV benchmarks for mkl_umath"""
+
+import os
+
+import psutil
+
+from ._patch_setup import _apply_patches
+
+_MIN_THREADS = 4  # minimum physical cores required for multi-threaded mode
+
+
+def _physical_cores():
+    """Return physical core count; fall back to 1 (conservative)."""
+    return psutil.cpu_count(logical=False) or 1
+
+
+def _thread_count():
+    physical = _physical_cores()
+    return str(_MIN_THREADS) if physical >= _MIN_THREADS else "1"
+
+
+_THREADS = os.environ.get("MKL_NUM_THREADS", _thread_count())
+os.environ["MKL_NUM_THREADS"] = _THREADS
+
+_apply_patches()
+del _apply_patches
diff --git a/benchmarks/benchmarks/_patch_setup.py b/benchmarks/benchmarks/_patch_setup.py
new file mode 100644
index 0000000..9383b1c
--- /dev/null
+++ b/benchmarks/benchmarks/_patch_setup.py
@@ -0,0 +1,69 @@
+"""MKL patch setup — executed once per ASV worker process at import time.
+
+Patches NumPy with Intel MKL implementations for fft, random, and umath.
+Hard-fails with a descriptive RuntimeError if any package is missing or the
+patch does not take effect, so benchmarks never silently run on stock NumPy.
+"""
+
+_PATCH_MAP = [
+    ("mkl_fft", "patch_numpy_fft"),
+    ("mkl_random", "patch_numpy_random"),
+    ("mkl_umath", "patch_numpy_umath"),
+]
+
+
+def _apply_patches():
+    import numpy as np
+
+    patched = {}
+
+    for mod_name, patch_fn_name in _PATCH_MAP:
+        try:
+            mod = __import__(mod_name)
+        except ImportError as exc:
+            raise RuntimeError(
+                f"[mkl-patch] Cannot import {mod_name}: {exc}\n"
+                f"  Ensure the conda env contains {mod_name} "
+                f"from the Intel channel.\n"
+                "  Required channels: "
+                "https://software.repos.intel.com/python/conda"
+            ) from exc
+
+        patch_fn = getattr(mod, patch_fn_name, None)
+        if patch_fn is None:
+            raise RuntimeError(
+                f"[mkl-patch] {mod_name} has no {patch_fn_name}(). "
+                f"Upgrade {mod_name} to a version that exposes "
+                "the stock-numpy patch API."
+            )
+
+        try:
+            patch_fn()
+        except Exception as exc:
+            raise RuntimeError(
+                f"[mkl-patch] {mod_name}.{patch_fn_name}() raised: {exc!r}"
+            ) from exc
+
+        is_patched_fn = getattr(mod, "is_patched", None)
+        if callable(is_patched_fn) and not is_patched_fn():
+            raise RuntimeError(
+                f"[mkl-patch] {mod_name}.is_patched() returned False "
+                "after patching. NumPy may have been imported before "
+                "patching in a conflicting state."
+            )
+
+        patched[mod_name] = mod
+
+    _attr_checks = {
+        "mkl_fft": lambda: np.fft.fft.__module__,
+        "mkl_random": lambda: np.random.random.__module__,
+        "mkl_umath": lambda: np.exp.__module__,
+    }
+    for mod_name in patched:
+        try:
+            attr = _attr_checks[mod_name]()
+        except Exception:
+            attr = "unknown"
+        print(f"[mkl-patch] {mod_name}: numpy dispatch -> {attr}")
+
+    print("[mkl-patch] ALL OK -- mkl_fft, mkl_random, mkl_umath active")
diff --git a/benchmarks/benchmarks/micro/__init__.py b/benchmarks/benchmarks/micro/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/benchmarks/micro/bench_micro.py b/benchmarks/benchmarks/micro/bench_micro.py
new file mode 100644
index 0000000..1d6e4bb
--- /dev/null
+++ b/benchmarks/benchmarks/micro/bench_micro.py
@@ -0,0 +1,87 @@
+"""Micro-benchmarks for mkl_umath unary ufuncs.
+
+Times each ufunc over a Cartesian product of
+  dtype  in [float32, float64]
+  size   in [10_000, 100_000, 1_000_000]
+
+Arrays are pre-allocated in setup() and reused across timing calls.
+Patching is applied once at package import via benchmarks._patch_setup.
+"""
+
+import numpy as np
+
+_UFUNC_CONFIGS = {
+    "exp": {"func": np.exp, "low": -10.0, "high": 10.0},
+    "exp2": {"func": np.exp2, "low": -10.0, "high": 10.0},
+    "expm1": {"func": np.expm1, "low": -10.0, "high": 10.0},
+    "log": {"func": np.log, "low": 1e-3, "high": 1e3},
+    "log2": {"func": np.log2, "low": 1e-3, "high": 1e3},
+    "log10": {"func": np.log10, "low": 1e-3, "high": 1e3},
+    "log1p": {"func": np.log1p, "low": 0.0, "high": 10.0},
+    "sin": {"func": np.sin, "low": -np.pi, "high": np.pi},
+    "cos": {"func": np.cos, "low": -np.pi, "high": np.pi},
+    "tan": {"func": np.tan, "low": -1.4, "high": 1.4},
+    "arcsin": {"func": np.arcsin, "low": -1.0, "high": 1.0},
+    "arccos": {"func": np.arccos, "low": -1.0, "high": 1.0},
+    "arctan": {"func": np.arctan, "low": -10.0, "high": 10.0},
+    "sinh": {"func": np.sinh, "low": -5.0, "high": 5.0},
+    "cosh": {"func": np.cosh, "low": -5.0, "high": 5.0},
+    "tanh": {"func": np.tanh, "low": -5.0, "high": 5.0},
+    "arcsinh": {"func": np.arcsinh, "low": -10.0, "high": 10.0},
+    "arccosh": {"func": np.arccosh, "low": 1.0, "high": 100.0},
+    "arctanh": {"func": np.arctanh, "low": -0.99, "high": 0.99},
+    "sqrt": {"func": np.sqrt, "low": 0.0, "high": 100.0},
+    "cbrt": {"func": np.cbrt, "low": -100.0, "high": 100.0},
+    "square": {"func": np.square, "low": -10.0, "high": 10.0},
+    "fabs": {"func": np.fabs, "low": -100.0, "high": 100.0},
+    "absolute": {"func": np.absolute, "low": -100.0, "high": 100.0},
+    "reciprocal": {"func": np.reciprocal, "low": 0.01, "high": 100.0},
+}
+
+
+class BenchMicro:
+    params = (
+        sorted(_UFUNC_CONFIGS.keys()),
+        ["float32", "float64"],
+        [10_000, 100_000, 1_000_000],
+    )
+    param_names = ["ufunc", "dtype", "size"]
+
+    def setup(self, ufunc, dtype, size):
+        cfg = _UFUNC_CONFIGS[ufunc]
+        rng = np.random.default_rng(42)
+        self.x = rng.uniform(cfg["low"], cfg["high"], size).astype(dtype)
+        self._func = cfg["func"]
+
+    def time_micro(self, ufunc, dtype, size):
+        self._func(self.x)
+
+
+class BenchArctan2:
+    """Binary ufunc arctan2"""
+
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.y = rng.uniform(-1.0, 1.0, size).astype(dtype)
+        self.x = rng.uniform(-1.0, 1.0, size).astype(dtype)
+
+    def time_arctan2(self, dtype, size):
+        np.arctan2(self.y, self.x)
+
+
+class BenchPower:
+    """Binary ufunc power (arbitrary exponent via MKL vdPow)"""
+
+    params = (["float32", "float64"], [10_000, 100_000, 1_000_000])
+    param_names = ["dtype", "size"]
+
+    def setup(self, dtype, size):
+        rng = np.random.default_rng(42)
+        self.base = rng.uniform(0.1, 10.0, size).astype(dtype)
+        self.exp = rng.uniform(0.5, 3.0, size).astype(dtype)
+
+    def time_power(self, dtype, size):
+        np.power(self.base, self.exp)
diff --git a/benchmarks/benchmarks/npbench/__init__.py b/benchmarks/benchmarks/npbench/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/benchmarks/npbench/bench_arc_distance.py b/benchmarks/benchmarks/npbench/bench_arc_distance.py
new file mode 100644
index 0000000..a57a3fa
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_arc_distance.py
@@ -0,0 +1,53 @@
+"""npbench wrapper: Arc Distance — mkl_umath ops: sin, cos, arctan2, sqrt.
+
+Preset sizes from npbench bench_info/arc_distance.json:
+  M: N=1_000_000
+  L: N=10_000_000
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance.py
+def _initialize(N):
+    from numpy.random import default_rng
+
+    rng = default_rng(42)
+    t0 = rng.random((N,))
+    p0 = rng.random((N,))
+    t1 = rng.random((N,))
+    p1 = rng.random((N,))
+    return t0, p0, t1, p1
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance_numpy.py
+def _arc_distance(theta_1, phi_1, theta_2, phi_2):
+    temp = (
+        np.sin((theta_2 - theta_1) / 2) ** 2
+        + np.cos(theta_1) * np.cos(theta_2) * np.sin((phi_2 - phi_1) / 2) ** 2
+    )
+    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))
+
+
+_PRESETS = {
+    "M": {"N": 1_000_000},
+    "L": {"N": 10_000_000},
+}
+
+
+class BenchArcDistance:
+    params = (["M", "L"],)
+    param_names = ["preset"]
+    number = 1
+    repeat = 20
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        self.theta_1, self.phi_1, self.theta_2, self.phi_2 = cache[preset]
+
+    def time_arc_distance(self, cache, preset):
+        _arc_distance(self.theta_1, self.phi_1, self.theta_2, self.phi_2)
diff --git a/benchmarks/benchmarks/npbench/bench_go_fast.py b/benchmarks/benchmarks/npbench/bench_go_fast.py
new file mode 100644
index 0000000..f4dca7e
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_go_fast.py
@@ -0,0 +1,74 @@
+"""npbench wrapper: GoFast — mkl_umath ops: tanh.
+
+Preset sizes from npbench bench_info/go_fast.json:
+  M: N=6_000
+  L: N=20_000
+
+Note: the npbench ``go_fast`` kernel iterates diagonals in a Python loop
+(go_fast_loop).  A vectorized variant (go_fast_vec) using np.tanh on the
+full diagonal is included for direct MKL VM throughput measurement.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast.py
+def _initialize(N):
+    from numpy.random import default_rng
+
+    rng = default_rng(42)
+    a = rng.random((N, N))
+    return (a,)
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast_numpy.py
+def _go_fast(a):
+    trace = 0.0
+    for i in range(a.shape[0]):
+        trace += np.tanh(a[i, i])
+    return a + trace
+
+
+_PRESETS = {
+    "M": {"N": 6_000},
+    "L": {"N": 20_000},
+}
+
+
+class BenchGoFastLoop:
+    """Original npbench kernel — Python loop calling np.tanh per element."""
+
+    params = (["M", "L"],)
+    param_names = ["preset"]
+    number = 1
+    repeat = 20
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        (self.a,) = cache[preset]
+
+    def time_go_fast_loop(self, cache, preset):
+        _go_fast(self.a)
+
+
+class BenchGoFastVec:
+    """Vectorized variant — np.tanh on the full diagonal array at once."""
+
+    params = (["M", "L"],)
+    param_names = ["preset"]
+    number = 1
+    repeat = 20
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        (self.a,) = cache[preset]
+        self.diag = np.copy(np.diag(self.a))
+
+    def time_go_fast_vec(self, cache, preset):
+        np.tanh(self.diag)
diff --git a/benchmarks/benchmarks/npbench/bench_mandelbrot.py b/benchmarks/benchmarks/npbench/bench_mandelbrot.py
new file mode 100644
index 0000000..47f3f14
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_mandelbrot.py
@@ -0,0 +1,132 @@
+"""npbench wrapper: Mandelbrot set (two variants).
+
+mkl_umath ops: abs, multiply, add.
+
+Preset sizes from npbench bench_info/mandelbrot1.json and mandelbrot2.json:
+  M: XN=YN=250/500, maxiter=150/80
+  L: XN=YN=833/1000, maxiter=200/100
+
+mandelbrot1 (slow): uses np.less mask + index-based update loop.
+mandelbrot2 (fast): uses dynamic array compaction; more cache-friendly.
+
+Both kernels operate on complex128 arrays.  The dominant mkl_umath op is
+np.abs() on complex arrays at each iteration step.
+"""
+
+import numpy as np
+
+# --- mandelbrot1 ---
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot1/mandelbrot1_numpy.py
+def _mandelbrot1(xmin, xmax, ymin, ymax, xn, yn, maxiter, horizon=2.0):
+    X = np.linspace(xmin, xmax, xn, dtype=np.float64)
+    Y = np.linspace(ymin, ymax, yn, dtype=np.float64)
+    C = X + Y[:, None] * 1j
+    N = np.zeros(C.shape, dtype=np.int64)
+    Z = np.zeros(C.shape, dtype=np.complex128)
+    for n in range(maxiter):
+        mask = np.less(abs(Z), horizon)
+        N[mask] = n
+        Z[mask] = Z[mask] ** 2 + C[mask]
+    N[N == maxiter - 1] = 0
+    return Z, N
+
+
+# --- mandelbrot2 ---
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot2/mandelbrot2_numpy.py
+def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, horizon=2.0):
+    Xi, Yi = np.mgrid[0:xn, 0:yn]
+    X = np.linspace(xmin, xmax, xn, dtype=np.float64)[Xi]
+    Y = np.linspace(ymin, ymax, yn, dtype=np.float64)[Yi]
+    C = X + Y * 1j
+    N_ = np.zeros(C.shape, dtype=np.int64)
+    Z_ = np.zeros(C.shape, dtype=np.complex128)
+    Xi.shape = Yi.shape = C.shape = xn * yn
+
+    Z = np.zeros(C.shape, np.complex128)
+    for i in range(itermax):
+        if not len(Z):
+            break
+        np.multiply(Z, Z, Z)
+        np.add(Z, C, Z)
+        rem = np.abs(Z) > horizon
+        Z_[Xi[rem], Yi[rem]] = Z[rem]
+        N_[Xi[rem], Yi[rem]] = i + 1
+        ind = ~rem
+        Z = Z[ind]
+        C = C[ind]
+        Xi = Xi[ind]
+        Yi = Yi[ind]
+    return Z_, N_
+
+
+_PRESETS_M1 = {
+    "M": {
+        "xmin": -1.75,
+        "xmax": 0.25,
+        "ymin": -1.00,
+        "ymax": 1.00,
+        "xn": 250,
+        "yn": 250,
+        "maxiter": 150,
+        "horizon": 2.0,
+    },
+    "L": {
+        "xmin": -2.00,
+        "xmax": 0.50,
+        "ymin": -1.25,
+        "ymax": 1.25,
+        "xn": 833,
+        "yn": 833,
+        "maxiter": 200,
+        "horizon": 2.0,
+    },
+}
+
+_PRESETS_M2 = {
+    "M": {
+        "xmin": -2.00,
+        "xmax": 0.50,
+        "ymin": -1.25,
+        "ymax": 1.25,
+        "xn": 500,
+        "yn": 500,
+        "itermax": 80,
+        "horizon": 2.0,
+    },
+    "L": {
+        "xmin": -2.25,
+        "xmax": 0.75,
+        "ymin": -1.50,
+        "ymax": 1.50,
+        "xn": 1000,
+        "yn": 1000,
+        "itermax": 100,
+        "horizon": 2.0,
+    },
+}
+
+
+class BenchMandelbrot1:
+    params = (["M", "L"],)
+    param_names = ["preset"]
+    number = 1
+    repeat = 20
+
+    def time_mandelbrot1(self, preset):
+        _mandelbrot1(**_PRESETS_M1[preset])
+
+
+class BenchMandelbrot2:
+    params = (["M", "L"],)
+    param_names = ["preset"]
+    number = 1
+    repeat = 20
+
+    def time_mandelbrot2(self, preset):
+        _mandelbrot2(**_PRESETS_M2[preset])
diff --git a/benchmarks/benchmarks/npbench/bench_softmax.py b/benchmarks/benchmarks/npbench/bench_softmax.py
new file mode 100644
index 0000000..5fdfe32
--- /dev/null
+++ b/benchmarks/benchmarks/npbench/bench_softmax.py
@@ -0,0 +1,51 @@
+"""npbench wrapper: Softmax — mkl_umath ops: exp, max, sum.
+
+Preset sizes from npbench bench_info/softmax.json:
+  M: N=32,  H=8,   SM=256   (float32)
+  L: N=64,  H=16,  SM=448   (float32)
+
+npbench initializes this benchmark with float32 explicitly.
+"""
+
+import numpy as np
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax.py
+def _initialize(N, H, SM):
+    from numpy.random import default_rng
+
+    rng = default_rng(42)
+    x = rng.random((N, H, SM, SM), dtype=np.float32)
+    return (x,)
+
+
+# Inlined from spcl/npbench @ main
+# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax_numpy.py
+def _softmax(x):
+    tmp_max = np.max(x, axis=-1, keepdims=True)
+    tmp_out = np.exp(x - tmp_max)
+    tmp_sum = np.sum(tmp_out, axis=-1, keepdims=True)
+    return tmp_out / tmp_sum
+
+
+_PRESETS = {
+    "M": {"N": 32, "H": 8, "SM": 256},
+    "L": {"N": 64, "H": 16, "SM": 448},
+}
+
+
+class BenchSoftmax:
+    params = (["M", "L"],)
+    param_names = ["preset"]
+    number = 1
+    repeat = 20
+
+    def setup_cache(self):
+        return {p: _initialize(**kw) for p, kw in _PRESETS.items()}
+
+    def setup(self, cache, preset):
+        (self.x,) = cache[preset]
+
+    def time_softmax(self, cache, preset):
+        _softmax(self.x)