diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..2da9bf0 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,52 @@ +# mkl_umath ASV Benchmarks + +Performance benchmarks for [mkl_umath](https://github.com/IntelPython/mkl_umath) using [Airspeed Velocity (ASV)](https://asv.readthedocs.io/en/stable/). + +The `npbench/` suite uses kernels from [npbench](https://github.com/spcl/npbench) to measure end-to-end impact of MKL ufunc acceleration in realistic workloads. + +### Coverage + +| File | Ufuncs | Dtypes | Sizes/Presets | +|------|--------|--------|---------------| +| `micro/bench_micro.py` | 24 unary (`exp`, `log`, `sin`, `cos`, `sqrt`, `cbrt`, etc.) + `arctan2`, `power` | float32, float64 | 10k, 100k, 1M | +| `npbench/bench_softmax.py` | `exp`, `max`, `sum` | float32 | M (32x8x256x256), L (64x16x448x448) | +| `npbench/bench_arc_distance.py` | `sin`, `cos`, `arctan2`, `sqrt` | float64 | M (1M), L (10M) | +| `npbench/bench_go_fast.py` | `tanh` | float64 | M (6k x 6k), L (20k x 20k) | +| `npbench/bench_mandelbrot.py` | `abs`, `multiply`, `add` | complex128 | M (250/500), L (833/1000) | + +## Running Benchmarks + +Prerequisites: + +```bash +pip install asv psutil +``` + +Run benchmarks against the current commit: + +```bash +asv run --python=same --quick HEAD^! +``` + +Compare two commits: + +```bash +asv continuous --python=same HEAD~1 HEAD +``` + +View results in a browser: + +```bash +asv publish +asv preview +``` + +## Threading + +Set `MKL_NUM_THREADS` to control the thread count used by MKL: + +```bash +MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^! +``` + +If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation). diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json new file mode 100644 index 0000000..7848275 --- /dev/null +++ b/benchmarks/asv.conf.json @@ -0,0 +1,20 @@ +{ + "version": 1, + "project": "mkl_umath", + "project_url": "https://github.com/IntelPython/mkl_umath", + "repo": "..", + "branches": [ + "main" + ], + "environment_type": "existing", + "benchmark_dir": "benchmarks", + "env_dir": ".asv/env", + "results_dir": ".asv/results", + "html_dir": ".asv/html", + "show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/", + "build_cache_size": 2, + "default_benchmark_timeout": 1500, + "regressions_thresholds": { + ".*": 0.2 + } +} diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py new file mode 100644 index 0000000..665b2e1 --- /dev/null +++ b/benchmarks/benchmarks/__init__.py @@ -0,0 +1,26 @@ +"""ASV benchmarks for mkl_umath""" + +import os + +import psutil + +from ._patch_setup import _apply_patches + +_MIN_THREADS = 4 # minimum physical cores required for multi-threaded mode + + +def _physical_cores(): + """Return physical core count; fall back to 1 (conservative).""" + return psutil.cpu_count(logical=False) or 1 + + +def _thread_count(): + physical = _physical_cores() + return str(_MIN_THREADS) if physical >= _MIN_THREADS else "1" + + +_THREADS = os.environ.get("MKL_NUM_THREADS", _thread_count()) +os.environ["MKL_NUM_THREADS"] = _THREADS + +_apply_patches() +del _apply_patches diff --git a/benchmarks/benchmarks/_patch_setup.py b/benchmarks/benchmarks/_patch_setup.py new file mode 100644 index 0000000..9383b1c --- /dev/null +++ b/benchmarks/benchmarks/_patch_setup.py @@ -0,0 +1,69 @@ +"""MKL patch setup — executed once per ASV worker process at import time. + +Patches NumPy with Intel MKL implementations for fft, random, and umath. +Hard-fails with a descriptive RuntimeError if any package is missing or the +patch does not take effect, so benchmarks never silently run on stock NumPy. +""" + +_PATCH_MAP = [ + ("mkl_fft", "patch_numpy_fft"), + ("mkl_random", "patch_numpy_random"), + ("mkl_umath", "patch_numpy_umath"), +] + + +def _apply_patches(): + import numpy as np + + patched = {} + + for mod_name, patch_fn_name in _PATCH_MAP: + try: + mod = __import__(mod_name) + except ImportError as exc: + raise RuntimeError( + f"[mkl-patch] Cannot import {mod_name}: {exc}\n" + f" Ensure the conda env contains {mod_name} " + f"from the Intel channel.\n" + " Required channels: " + "https://software.repos.intel.com/python/conda" + ) from exc + + patch_fn = getattr(mod, patch_fn_name, None) + if patch_fn is None: + raise RuntimeError( + f"[mkl-patch] {mod_name} has no {patch_fn_name}(). " + f"Upgrade {mod_name} to a version that exposes " + "the stock-numpy patch API." + ) + + try: + patch_fn() + except Exception as exc: + raise RuntimeError( + f"[mkl-patch] {mod_name}.{patch_fn_name}() raised: {exc!r}" + ) from exc + + is_patched_fn = getattr(mod, "is_patched", None) + if callable(is_patched_fn) and not is_patched_fn(): + raise RuntimeError( + f"[mkl-patch] {mod_name}.is_patched() returned False " + "after patching. NumPy may have been imported before " + "patching in a conflicting state." + ) + + patched[mod_name] = mod + + _attr_checks = { + "mkl_fft": lambda: np.fft.fft.__module__, + "mkl_random": lambda: np.random.random.__module__, + "mkl_umath": lambda: np.exp.__module__, + } + for mod_name in patched: + try: + attr = _attr_checks[mod_name]() + except Exception: + attr = "unknown" + print(f"[mkl-patch] {mod_name}: numpy dispatch -> {attr}") + + print("[mkl-patch] ALL OK -- mkl_fft, mkl_random, mkl_umath active") diff --git a/benchmarks/benchmarks/micro/__init__.py b/benchmarks/benchmarks/micro/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/benchmarks/micro/bench_micro.py b/benchmarks/benchmarks/micro/bench_micro.py new file mode 100644 index 0000000..1d6e4bb --- /dev/null +++ b/benchmarks/benchmarks/micro/bench_micro.py @@ -0,0 +1,87 @@ +"""Micro-benchmarks for mkl_umath unary ufuncs. + +Times each ufunc over a Cartesian product of + dtype in [float32, float64] + size in [10_000, 100_000, 1_000_000] + +Arrays are pre-allocated in setup() and reused across timing calls. +Patching is applied once at package import via benchmarks._patch_setup. +""" + +import numpy as np + +_UFUNC_CONFIGS = { + "exp": {"func": np.exp, "low": -10.0, "high": 10.0}, + "exp2": {"func": np.exp2, "low": -10.0, "high": 10.0}, + "expm1": {"func": np.expm1, "low": -10.0, "high": 10.0}, + "log": {"func": np.log, "low": 1e-3, "high": 1e3}, + "log2": {"func": np.log2, "low": 1e-3, "high": 1e3}, + "log10": {"func": np.log10, "low": 1e-3, "high": 1e3}, + "log1p": {"func": np.log1p, "low": 0.0, "high": 10.0}, + "sin": {"func": np.sin, "low": -np.pi, "high": np.pi}, + "cos": {"func": np.cos, "low": -np.pi, "high": np.pi}, + "tan": {"func": np.tan, "low": -1.4, "high": 1.4}, + "arcsin": {"func": np.arcsin, "low": -1.0, "high": 1.0}, + "arccos": {"func": np.arccos, "low": -1.0, "high": 1.0}, + "arctan": {"func": np.arctan, "low": -10.0, "high": 10.0}, + "sinh": {"func": np.sinh, "low": -5.0, "high": 5.0}, + "cosh": {"func": np.cosh, "low": -5.0, "high": 5.0}, + "tanh": {"func": np.tanh, "low": -5.0, "high": 5.0}, + "arcsinh": {"func": np.arcsinh, "low": -10.0, "high": 10.0}, + "arccosh": {"func": np.arccosh, "low": 1.0, "high": 100.0}, + "arctanh": {"func": np.arctanh, "low": -0.99, "high": 0.99}, + "sqrt": {"func": np.sqrt, "low": 0.0, "high": 100.0}, + "cbrt": {"func": np.cbrt, "low": -100.0, "high": 100.0}, + "square": {"func": np.square, "low": -10.0, "high": 10.0}, + "fabs": {"func": np.fabs, "low": -100.0, "high": 100.0}, + "absolute": {"func": np.absolute, "low": -100.0, "high": 100.0}, + "reciprocal": {"func": np.reciprocal, "low": 0.01, "high": 100.0}, +} + + +class BenchMicro: + params = ( + sorted(_UFUNC_CONFIGS.keys()), + ["float32", "float64"], + [10_000, 100_000, 1_000_000], + ) + param_names = ["ufunc", "dtype", "size"] + + def setup(self, ufunc, dtype, size): + cfg = _UFUNC_CONFIGS[ufunc] + rng = np.random.default_rng(42) + self.x = rng.uniform(cfg["low"], cfg["high"], size).astype(dtype) + self._func = cfg["func"] + + def time_micro(self, ufunc, dtype, size): + self._func(self.x) + + +class BenchArctan2: + """Binary ufunc arctan2""" + + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.y = rng.uniform(-1.0, 1.0, size).astype(dtype) + self.x = rng.uniform(-1.0, 1.0, size).astype(dtype) + + def time_arctan2(self, dtype, size): + np.arctan2(self.y, self.x) + + +class BenchPower: + """Binary ufunc power (arbitrary exponent via MKL vdPow)""" + + params = (["float32", "float64"], [10_000, 100_000, 1_000_000]) + param_names = ["dtype", "size"] + + def setup(self, dtype, size): + rng = np.random.default_rng(42) + self.base = rng.uniform(0.1, 10.0, size).astype(dtype) + self.exp = rng.uniform(0.5, 3.0, size).astype(dtype) + + def time_power(self, dtype, size): + np.power(self.base, self.exp) diff --git a/benchmarks/benchmarks/npbench/__init__.py b/benchmarks/benchmarks/npbench/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/benchmarks/npbench/bench_arc_distance.py b/benchmarks/benchmarks/npbench/bench_arc_distance.py new file mode 100644 index 0000000..a57a3fa --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_arc_distance.py @@ -0,0 +1,53 @@ +"""npbench wrapper: Arc Distance — mkl_umath ops: sin, cos, arctan2, sqrt. + +Preset sizes from npbench bench_info/arc_distance.json: + M: N=1_000_000 + L: N=10_000_000 +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance.py +def _initialize(N): + from numpy.random import default_rng + + rng = default_rng(42) + t0 = rng.random((N,)) + p0 = rng.random((N,)) + t1 = rng.random((N,)) + p1 = rng.random((N,)) + return t0, p0, t1, p1 + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance_numpy.py +def _arc_distance(theta_1, phi_1, theta_2, phi_2): + temp = ( + np.sin((theta_2 - theta_1) / 2) ** 2 + + np.cos(theta_1) * np.cos(theta_2) * np.sin((phi_2 - phi_1) / 2) ** 2 + ) + return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp)) + + +_PRESETS = { + "M": {"N": 1_000_000}, + "L": {"N": 10_000_000}, +} + + +class BenchArcDistance: + params = (["M", "L"],) + param_names = ["preset"] + number = 1 + repeat = 20 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + self.theta_1, self.phi_1, self.theta_2, self.phi_2 = cache[preset] + + def time_arc_distance(self, cache, preset): + _arc_distance(self.theta_1, self.phi_1, self.theta_2, self.phi_2) diff --git a/benchmarks/benchmarks/npbench/bench_go_fast.py b/benchmarks/benchmarks/npbench/bench_go_fast.py new file mode 100644 index 0000000..f4dca7e --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_go_fast.py @@ -0,0 +1,74 @@ +"""npbench wrapper: GoFast — mkl_umath ops: tanh. + +Preset sizes from npbench bench_info/go_fast.json: + M: N=6_000 + L: N=20_000 + +Note: the npbench ``go_fast`` kernel iterates diagonals in a Python loop +(go_fast_loop). A vectorized variant (go_fast_vec) using np.tanh on the +full diagonal is included for direct MKL VM throughput measurement. +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast.py +def _initialize(N): + from numpy.random import default_rng + + rng = default_rng(42) + a = rng.random((N, N)) + return (a,) + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast_numpy.py +def _go_fast(a): + trace = 0.0 + for i in range(a.shape[0]): + trace += np.tanh(a[i, i]) + return a + trace + + +_PRESETS = { + "M": {"N": 6_000}, + "L": {"N": 20_000}, +} + + +class BenchGoFastLoop: + """Original npbench kernel — Python loop calling np.tanh per element.""" + + params = (["M", "L"],) + param_names = ["preset"] + number = 1 + repeat = 20 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + (self.a,) = cache[preset] + + def time_go_fast_loop(self, cache, preset): + _go_fast(self.a) + + +class BenchGoFastVec: + """Vectorized variant — np.tanh on the full diagonal array at once.""" + + params = (["M", "L"],) + param_names = ["preset"] + number = 1 + repeat = 20 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + (self.a,) = cache[preset] + self.diag = np.copy(np.diag(self.a)) + + def time_go_fast_vec(self, cache, preset): + np.tanh(self.diag) diff --git a/benchmarks/benchmarks/npbench/bench_mandelbrot.py b/benchmarks/benchmarks/npbench/bench_mandelbrot.py new file mode 100644 index 0000000..47f3f14 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_mandelbrot.py @@ -0,0 +1,132 @@ +"""npbench wrapper: Mandelbrot set (two variants). + +mkl_umath ops: abs, multiply, add. + +Preset sizes from npbench bench_info/mandelbrot1.json and mandelbrot2.json: + M: XN=YN=250/500, maxiter=150/80 + L: XN=YN=833/1000, maxiter=200/100 + +mandelbrot1 (slow): uses np.less mask + index-based update loop. +mandelbrot2 (fast): uses dynamic array compaction; more cache-friendly. + +Both kernels operate on complex128 arrays. The dominant mkl_umath op is +np.abs() on complex arrays at each iteration step. +""" + +import numpy as np + +# --- mandelbrot1 --- + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot1/mandelbrot1_numpy.py +def _mandelbrot1(xmin, xmax, ymin, ymax, xn, yn, maxiter, horizon=2.0): + X = np.linspace(xmin, xmax, xn, dtype=np.float64) + Y = np.linspace(ymin, ymax, yn, dtype=np.float64) + C = X + Y[:, None] * 1j + N = np.zeros(C.shape, dtype=np.int64) + Z = np.zeros(C.shape, dtype=np.complex128) + for n in range(maxiter): + mask = np.less(abs(Z), horizon) + N[mask] = n + Z[mask] = Z[mask] ** 2 + C[mask] + N[N == maxiter - 1] = 0 + return Z, N + + +# --- mandelbrot2 --- + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/mandelbrot2/mandelbrot2_numpy.py +def _mandelbrot2(xmin, xmax, ymin, ymax, xn, yn, itermax, horizon=2.0): + Xi, Yi = np.mgrid[0:xn, 0:yn] + X = np.linspace(xmin, xmax, xn, dtype=np.float64)[Xi] + Y = np.linspace(ymin, ymax, yn, dtype=np.float64)[Yi] + C = X + Y * 1j + N_ = np.zeros(C.shape, dtype=np.int64) + Z_ = np.zeros(C.shape, dtype=np.complex128) + Xi.shape = Yi.shape = C.shape = xn * yn + + Z = np.zeros(C.shape, np.complex128) + for i in range(itermax): + if not len(Z): + break + np.multiply(Z, Z, Z) + np.add(Z, C, Z) + rem = np.abs(Z) > horizon + Z_[Xi[rem], Yi[rem]] = Z[rem] + N_[Xi[rem], Yi[rem]] = i + 1 + ind = ~rem + Z = Z[ind] + C = C[ind] + Xi = Xi[ind] + Yi = Yi[ind] + return Z_, N_ + + +_PRESETS_M1 = { + "M": { + "xmin": -1.75, + "xmax": 0.25, + "ymin": -1.00, + "ymax": 1.00, + "xn": 250, + "yn": 250, + "maxiter": 150, + "horizon": 2.0, + }, + "L": { + "xmin": -2.00, + "xmax": 0.50, + "ymin": -1.25, + "ymax": 1.25, + "xn": 833, + "yn": 833, + "maxiter": 200, + "horizon": 2.0, + }, +} + +_PRESETS_M2 = { + "M": { + "xmin": -2.00, + "xmax": 0.50, + "ymin": -1.25, + "ymax": 1.25, + "xn": 500, + "yn": 500, + "itermax": 80, + "horizon": 2.0, + }, + "L": { + "xmin": -2.25, + "xmax": 0.75, + "ymin": -1.50, + "ymax": 1.50, + "xn": 1000, + "yn": 1000, + "itermax": 100, + "horizon": 2.0, + }, +} + + +class BenchMandelbrot1: + params = (["M", "L"],) + param_names = ["preset"] + number = 1 + repeat = 20 + + def time_mandelbrot1(self, preset): + _mandelbrot1(**_PRESETS_M1[preset]) + + +class BenchMandelbrot2: + params = (["M", "L"],) + param_names = ["preset"] + number = 1 + repeat = 20 + + def time_mandelbrot2(self, preset): + _mandelbrot2(**_PRESETS_M2[preset]) diff --git a/benchmarks/benchmarks/npbench/bench_softmax.py b/benchmarks/benchmarks/npbench/bench_softmax.py new file mode 100644 index 0000000..5fdfe32 --- /dev/null +++ b/benchmarks/benchmarks/npbench/bench_softmax.py @@ -0,0 +1,51 @@ +"""npbench wrapper: Softmax — mkl_umath ops: exp, max, sum. + +Preset sizes from npbench bench_info/softmax.json: + M: N=32, H=8, SM=256 (float32) + L: N=64, H=16, SM=448 (float32) + +npbench initializes this benchmark with float32 explicitly. +""" + +import numpy as np + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax.py +def _initialize(N, H, SM): + from numpy.random import default_rng + + rng = default_rng(42) + x = rng.random((N, H, SM, SM), dtype=np.float32) + return (x,) + + +# Inlined from spcl/npbench @ main +# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/deep_learning/softmax/softmax_numpy.py +def _softmax(x): + tmp_max = np.max(x, axis=-1, keepdims=True) + tmp_out = np.exp(x - tmp_max) + tmp_sum = np.sum(tmp_out, axis=-1, keepdims=True) + return tmp_out / tmp_sum + + +_PRESETS = { + "M": {"N": 32, "H": 8, "SM": 256}, + "L": {"N": 64, "H": 16, "SM": 448}, +} + + +class BenchSoftmax: + params = (["M", "L"],) + param_names = ["preset"] + number = 1 + repeat = 20 + + def setup_cache(self): + return {p: _initialize(**kw) for p, kw in _PRESETS.items()} + + def setup(self, cache, preset): + (self.x,) = cache[preset] + + def time_softmax(self, cache, preset): + _softmax(self.x)