Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# mkl_umath ASV Benchmarks

Performance benchmarks for [mkl_umath](https://github.com/IntelPython/mkl_umath) using [Airspeed Velocity (ASV)](https://asv.readthedocs.io/en/stable/).

The `micro/` suite times individual ufuncs on fixed-size arrays. The `npbench/` suite uses kernels from [npbench](https://github.com/spcl/npbench) to measure the end-to-end impact of MKL ufunc acceleration in realistic workloads.

## Coverage

| File | Ufuncs | Dtypes | Sizes/Presets |
|------|--------|--------|---------------|
| `micro/bench_micro.py` | 25 unary (`exp`, `log`, `sin`, `cos`, `sqrt`, `cbrt`, etc.) + `arctan2`, `power` | float32, float64 | 10k, 100k, 1M |
| `npbench/bench_softmax.py` | `exp`, `max`, `sum` | float32 | M (32x8x256x256), L (64x16x448x448) |
| `npbench/bench_arc_distance.py` | `sin`, `cos`, `arctan2`, `sqrt` | float64 | M (1M), L (10M) |
| `npbench/bench_go_fast.py` | `tanh` | float64 | M (6k x 6k), L (20k x 20k) |
| `npbench/bench_mandelbrot.py` | `abs`, `multiply`, `add` | complex128 | M (250/500), L (833/1000) |

## Running Benchmarks

Prerequisites:

```bash
pip install asv psutil
```

Run benchmarks against the current commit:

```bash
asv run --python=same --quick HEAD^!
```

Compare two commits:

```bash
asv continuous --python=same HEAD~1 HEAD
```

View results in a browser:

```bash
asv publish
asv preview
```

## Threading

Set `MKL_NUM_THREADS` to control the thread count used by MKL:

```bash
MKL_NUM_THREADS=8 asv run --python=same --quick HEAD^!
```

If `MKL_NUM_THREADS` is not set, `__init__.py` applies a default: **4** threads when the machine has 4 or more physical cores, or **1** (single-threaded) otherwise. This keeps results comparable across CI machines in the shared pool regardless of their total core count. Physical cores are detected via `psutil.cpu_count(logical=False)` (hyperthreads excluded per MKL recommendation).
20 changes: 20 additions & 0 deletions benchmarks/asv.conf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"version": 1,
"project": "mkl_umath",
"project_url": "https://github.com/IntelPython/mkl_umath",
"repo": "..",
"branches": [
"main"
],
"environment_type": "existing",
"benchmark_dir": "benchmarks",
"env_dir": ".asv/env",
"results_dir": ".asv/results",
"html_dir": ".asv/html",
"show_commit_url": "https://github.com/IntelPython/mkl_umath/commit/",
"build_cache_size": 2,
"default_benchmark_timeout": 1500,
"regressions_thresholds": {
".*": 0.2
}
}
26 changes: 26 additions & 0 deletions benchmarks/benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""ASV benchmarks for mkl_umath"""

import os

import psutil

from ._patch_setup import _apply_patches

# Used by _thread_count() in two roles: the minimum number of physical cores
# required for multi-threaded mode, and the thread count applied in that mode.
_MIN_THREADS = 4


def _physical_cores():
    """Return the physical core count reported by psutil.

    A falsy result from ``psutil.cpu_count(logical=False)`` (for example
    ``None``) is replaced by the conservative value 1.
    """
    detected = psutil.cpu_count(logical=False)
    if not detected:
        return 1
    return detected


def _thread_count():
    """Return the default thread count as a string.

    Yields ``str(_MIN_THREADS)`` when at least ``_MIN_THREADS`` physical
    cores are available, and ``"1"`` otherwise.
    """
    if _physical_cores() < _MIN_THREADS:
        return "1"
    return str(_MIN_THREADS)


# Keep an MKL_NUM_THREADS value that is already set; otherwise store the
# core-count-based default in the environment.
_THREADS = os.environ.setdefault("MKL_NUM_THREADS", _thread_count())

# Patch NumPy once at package import, then remove the helper from the
# package namespace.
_apply_patches()
del _apply_patches
69 changes: 69 additions & 0 deletions benchmarks/benchmarks/_patch_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""MKL patch setup — executed once per ASV worker process at import time.

Patches NumPy with Intel MKL implementations for fft, random, and umath.
Hard-fails with a descriptive RuntimeError if any package is missing or the
patch does not take effect, so benchmarks never silently run on stock NumPy.
"""

# (module name, patch function name) pairs. _apply_patches() walks this list
# in order, importing each module and calling the named function on it.
_PATCH_MAP = [
    ("mkl_fft", "patch_numpy_fft"),
    ("mkl_random", "patch_numpy_random"),
    ("mkl_umath", "patch_numpy_umath"),
]


def _apply_patches():
    """Patch NumPy with every package listed in ``_PATCH_MAP``.

    For each ``(module, patch function)`` pair the module is imported, the
    patch function is looked up and called, and -- when the module exposes a
    callable ``is_patched`` -- its result is checked.  Afterwards one
    diagnostic line per patched package is printed, followed by a summary.

    Raises:
        RuntimeError: if a module cannot be imported, lacks its patch
            function, the patch function raises, or ``is_patched()`` returns
            a falsy value after patching.
    """
    import numpy as np

    applied = {}

    for module_name, hook_name in _PATCH_MAP:
        try:
            module = __import__(module_name)
        except ImportError as exc:
            message = (
                f"[mkl-patch] Cannot import {module_name}: {exc}\n"
                f" Ensure the conda env contains {module_name} "
                f"from the Intel channel.\n"
                " Required channels: "
                "https://software.repos.intel.com/python/conda"
            )
            raise RuntimeError(message) from exc

        hook = getattr(module, hook_name, None)
        if hook is None:
            message = (
                f"[mkl-patch] {module_name} has no {hook_name}(). "
                f"Upgrade {module_name} to a version that exposes "
                "the stock-numpy patch API."
            )
            raise RuntimeError(message)

        try:
            hook()
        except Exception as exc:
            message = f"[mkl-patch] {module_name}.{hook_name}() raised: {exc!r}"
            raise RuntimeError(message) from exc

        # The post-patch check is optional: it only runs when the module
        # actually provides a callable is_patched.
        verify = getattr(module, "is_patched", None)
        if callable(verify):
            if not verify():
                message = (
                    f"[mkl-patch] {module_name}.is_patched() returned False "
                    "after patching. NumPy may have been imported before "
                    "patching in a conflicting state."
                )
                raise RuntimeError(message)

        applied[module_name] = module

    def _dispatch_module(name):
        # Report which module one representative NumPy callable per package
        # now belongs to; unknown names fall through to KeyError.
        if name == "mkl_fft":
            return np.fft.fft.__module__
        if name == "mkl_random":
            return np.random.random.__module__
        if name == "mkl_umath":
            return np.exp.__module__
        raise KeyError(name)

    for module_name in applied:
        # Diagnostics are best-effort: any failure is reported as "unknown".
        try:
            target = _dispatch_module(module_name)
        except Exception:
            target = "unknown"
        print(f"[mkl-patch] {module_name}: numpy dispatch -> {target}")

    print("[mkl-patch] ALL OK -- mkl_fft, mkl_random, mkl_umath active")
Empty file.
87 changes: 87 additions & 0 deletions benchmarks/benchmarks/micro/bench_micro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Micro-benchmarks for mkl_umath ufuncs: unary functions plus binary arctan2 and power.

Times each ufunc over a Cartesian product of
dtype in [float32, float64]
size in [10_000, 100_000, 1_000_000]

Arrays are pre-allocated in setup() and reused across timing calls.
Patching is applied once at package import via benchmarks._patch_setup.
"""

import numpy as np

# Maps each ufunc name to the NumPy callable of the same name and the
# [low, high) range from which its benchmark input is sampled.
_UFUNC_CONFIGS = {
    name: {"func": getattr(np, name), "low": low, "high": high}
    for name, low, high in (
        ("exp", -10.0, 10.0),
        ("exp2", -10.0, 10.0),
        ("expm1", -10.0, 10.0),
        ("log", 1e-3, 1e3),
        ("log2", 1e-3, 1e3),
        ("log10", 1e-3, 1e3),
        ("log1p", 0.0, 10.0),
        ("sin", -np.pi, np.pi),
        ("cos", -np.pi, np.pi),
        ("tan", -1.4, 1.4),
        ("arcsin", -1.0, 1.0),
        ("arccos", -1.0, 1.0),
        ("arctan", -10.0, 10.0),
        ("sinh", -5.0, 5.0),
        ("cosh", -5.0, 5.0),
        ("tanh", -5.0, 5.0),
        ("arcsinh", -10.0, 10.0),
        ("arccosh", 1.0, 100.0),
        ("arctanh", -0.99, 0.99),
        ("sqrt", 0.0, 100.0),
        ("cbrt", -100.0, 100.0),
        ("square", -10.0, 10.0),
        ("fabs", -100.0, 100.0),
        ("absolute", -100.0, 100.0),
        ("reciprocal", 0.01, 100.0),
    )
}


class BenchMicro:
    """Time every ufunc in ``_UFUNC_CONFIGS`` across dtypes and array sizes.

    The benchmark is parameterized over the sorted ufunc names, the dtypes
    float32/float64, and three array lengths.
    """

    params = (
        sorted(_UFUNC_CONFIGS),
        ["float32", "float64"],
        [10_000, 100_000, 1_000_000],
    )
    param_names = ["ufunc", "dtype", "size"]

    def setup(self, ufunc, dtype, size):
        """Build the input array and resolve the callable outside the timed call.

        Args:
            ufunc: key into ``_UFUNC_CONFIGS``.
            dtype: dtype name the sampled values are cast to.
            size: number of elements to sample.
        """
        cfg = _UFUNC_CONFIGS[ufunc]
        # Fixed seed so every run draws the same input values.
        rng = np.random.default_rng(42)
        self.x = rng.uniform(cfg["low"], cfg["high"], size).astype(dtype)
        self._func = cfg["func"]

    def time_micro(self, ufunc, dtype, size):
        """Apply the selected ufunc to the prepared array."""
        self._func(self.x)


class BenchArctan2:
    """Time the two-argument ufunc ``np.arctan2`` across dtypes and sizes."""

    params = (
        ["float32", "float64"],
        [10_000, 100_000, 1_000_000],
    )
    param_names = ["dtype", "size"]

    def setup(self, dtype, size):
        """Draw both operands from [-1, 1) with a fixed seed; y is drawn first."""
        gen = np.random.default_rng(42)

        def _draw():
            return gen.uniform(-1.0, 1.0, size).astype(dtype)

        # Tuple elements are evaluated left to right, so y precedes x.
        self.y, self.x = _draw(), _draw()

    def time_arctan2(self, dtype, size):
        """Evaluate arctan2 on the prepared operands."""
        np.arctan2(self.y, self.x)


class BenchPower:
    """Time the two-argument ufunc ``np.power`` with array-valued exponents."""

    params = (
        ["float32", "float64"],
        [10_000, 100_000, 1_000_000],
    )
    param_names = ["dtype", "size"]

    def setup(self, dtype, size):
        """Draw bases from [0.1, 10) and exponents from [0.5, 3), fixed seed."""
        gen = np.random.default_rng(42)
        # Bases are sampled before exponents; the order fixes the values drawn.
        raw_base = gen.uniform(0.1, 10.0, size)
        raw_exponent = gen.uniform(0.5, 3.0, size)
        self.base = raw_base.astype(dtype)
        self.exp = raw_exponent.astype(dtype)

    def time_power(self, dtype, size):
        """Raise the prepared bases to the prepared exponents."""
        np.power(self.base, self.exp)
Empty file.
53 changes: 53 additions & 0 deletions benchmarks/benchmarks/npbench/bench_arc_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""npbench wrapper: Arc Distance — mkl_umath ops: sin, cos, arctan2, sqrt.

Preset sizes from npbench bench_info/arc_distance.json:
M: N=1_000_000
L: N=10_000_000
"""

import numpy as np


# Inlined from spcl/npbench @ main
# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance.py
def _initialize(N):
    """Return four length-``N`` arrays of uniform samples from a fixed seed.

    The arrays are drawn one after another from a single generator seeded
    with 42 and returned as a 4-tuple in draw order.
    """
    gen = np.random.default_rng(42)
    return tuple(gen.random((N,)) for _ in range(4))


# Inlined from spcl/npbench @ main
# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/pythran/arc_distance/arc_distance_numpy.py
def _arc_distance(theta_1, phi_1, theta_2, phi_2):
    """Evaluate the arc-distance expression on the four inputs.

    Computes ``temp = sin((theta_2 - theta_1) / 2)**2
    + cos(theta_1) * cos(theta_2) * sin((phi_2 - phi_1) / 2)**2`` and returns
    ``2 * arctan2(sqrt(temp), sqrt(1 - temp))``.
    """
    # Single expression, no named intermediates: this is the code that
    # BenchArcDistance.time_arc_distance measures.
    temp = (
        np.sin((theta_2 - theta_1) / 2) ** 2
        + np.cos(theta_1) * np.cos(theta_2) * np.sin((phi_2 - phi_1) / 2) ** 2
    )
    return 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))


# Preset label -> keyword arguments passed to _initialize() when the
# benchmark cache is built.
_PRESETS = {
    "M": {"N": 1_000_000},
    "L": {"N": 10_000_000},
}


class BenchArcDistance:
    """Time the arc-distance kernel for each preset size."""

    params = (["M", "L"],)
    param_names = ["preset"]
    number = 1
    repeat = 20

    def setup_cache(self):
        """Generate the input arrays for every preset, keyed by preset label."""
        cache = {}
        for label, kwargs in _PRESETS.items():
            cache[label] = _initialize(**kwargs)
        return cache

    def setup(self, cache, preset):
        """Bind the four cached arrays of the selected preset to the instance."""
        inputs = cache[preset]
        self.theta_1, self.phi_1, self.theta_2, self.phi_2 = inputs

    def time_arc_distance(self, cache, preset):
        """Run the kernel on the bound inputs."""
        _arc_distance(self.theta_1, self.phi_1, self.theta_2, self.phi_2)
74 changes: 74 additions & 0 deletions benchmarks/benchmarks/npbench/bench_go_fast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""npbench wrapper: GoFast — mkl_umath ops: tanh.

Preset sizes from npbench bench_info/go_fast.json:
M: N=6_000
L: N=20_000

Note: the npbench ``go_fast`` kernel iterates diagonals in a Python loop
(go_fast_loop). A vectorized variant (go_fast_vec) using np.tanh on the
full diagonal is included for direct MKL VM throughput measurement.
"""

import numpy as np


# Inlined from spcl/npbench @ main
# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast.py
def _initialize(N):
    """Return a 1-tuple holding an ``N x N`` array of uniform samples (seed 42)."""
    matrix = np.random.default_rng(42).random((N, N))
    return (matrix,)


# Inlined from spcl/npbench @ main
# https://github.com/spcl/npbench/blob/main/npbench/benchmarks/go_fast/go_fast_numpy.py
def _go_fast(a):
    """Add the sum of ``tanh`` over the diagonal of ``a`` to every element.

    Returns a new array ``a + trace`` where ``trace`` accumulates
    ``np.tanh(a[i, i])`` for ``i`` in ``range(a.shape[0])``.
    """
    trace = 0.0
    # One np.tanh call per diagonal element; this Python-level loop is the
    # code that BenchGoFastLoop.time_go_fast_loop measures.
    for i in range(a.shape[0]):
        trace += np.tanh(a[i, i])
    return a + trace


# Preset label -> keyword arguments passed to _initialize(); N is the side
# length of the square input matrix.
_PRESETS = {
    "M": {"N": 6_000},
    "L": {"N": 20_000},
}


class BenchGoFastLoop:
    """Time the loop-based kernel, which calls np.tanh once per diagonal element."""

    params = (["M", "L"],)
    param_names = ["preset"]
    number = 1
    repeat = 20

    def setup_cache(self):
        """Generate the input matrix for every preset, keyed by preset label."""
        cache = {}
        for label, kwargs in _PRESETS.items():
            cache[label] = _initialize(**kwargs)
        return cache

    def setup(self, cache, preset):
        """Bind the cached matrix of the selected preset to ``self.a``."""
        entry = cache[preset]
        (self.a,) = entry

    def time_go_fast_loop(self, cache, preset):
        """Run the loop kernel on the bound matrix."""
        _go_fast(self.a)


class BenchGoFastVec:
    """Vectorized variant -- np.tanh on the full diagonal array at once.

    Only the diagonal of each preset's matrix is ever used, so the cache
    stores just that vector instead of the whole ``N x N`` matrix.
    """

    params = (["M", "L"],)
    param_names = ["preset"]
    number = 1
    repeat = 20

    def setup_cache(self):
        """Return ``{preset: (diagonal,)}`` for every preset.

        Each matrix is generated with ``_initialize`` exactly as before, but
        only a copy of its diagonal is kept, so the cached payload is N
        values per preset rather than N * N.
        """
        cache = {}
        for label, kwargs in _PRESETS.items():
            (matrix,) = _initialize(**kwargs)
            # Copy so the cached vector owns its data and the full matrix
            # can be released before the next preset is generated.
            cache[label] = (np.copy(np.diag(matrix)),)
            del matrix
        return cache

    def setup(self, cache, preset):
        """Bind the cached diagonal of the selected preset to ``self.diag``."""
        (self.diag,) = cache[preset]

    def time_go_fast_vec(self, cache, preset):
        """Apply np.tanh to the whole diagonal in one call."""
        np.tanh(self.diag)
Loading
Loading