diff --git a/.gitignore b/.gitignore
index fca0d9389..6aae86cad 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .venv
+benchmarks/microbenchmarks/asv/results/
 *.o
 *.swp
 *.ii
diff --git a/benchmarks/microbenchmarks/asv/README.md b/benchmarks/microbenchmarks/asv/README.md
new file mode 100644
index 000000000..10644f8f7
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/README.md
@@ -0,0 +1,171 @@
+# TransformerEngine Microbenchmarks
+
+GPU microbenchmarks for TE ops (GEMM, FP8 GEMM, grouped GEMM, attention,
+casting, normalization), run in-process by `driver.py`. Each suite is a
+`bench_*.py` file with a `Bench*` class; the driver times every `time_*` method,
+prints a table with throughput, and saves raw per-call samples to JSON for
+statistical comparison.
+
+## Prerequisites
+
+- TransformerEngine built and installed in the current Python environment.
+- A ROCm or CUDA GPU.
+
+## Running
+
+```bash
+cd benchmarks/microbenchmarks/asv
+python driver.py --all                    # run every suite
+python driver.py bench_gemm               # run one suite via the driver
+python bench_gemm.py                      # run one suite directly
+python bench_gemm.py time_forward         # filter to methods containing a string
+python bench_gemm.py -w 5 -n 20           # custom warmup / timed iterations
+python bench_casting.py --no-save         # don't write a result file
+python bench_casting.py --cold-cache      # flush GPU cache before each sample
+python bench_gemm.py --inner 50           # fix the inner-loop count to 50
+python bench_gemm.py --kernel-profile     # per-kernel CUDA-time breakdown
+```
+
+Results are written to `benchmarks/microbenchmarks/asv/results/<commit-hash>.json`
+(gitignored), one raw-sample record per benchmark + parameter combination.
+
+## Timing model: inner loop and cache state
+
+Each `time_*` method runs its kernel `_inner` times inside one CUDA-event window
+and divides by `_inner`, amortizing kernel-launch and CUDA-event jitter
+(`~0.5 µs` on AMD). By default the driver auto-tunes `_inner` per (combo, method)
+so each window lasts at least `--target-window-ms` (default `1.0 ms`).
+
+| Flag | Effect |
+|---|---|
+| `--inner auto` (default) | Probe one invocation, then pick `_inner` so the next window lasts ≥ `--target-window-ms` (capped at 10000). |
+| `--inner N` | Force a fixed `_inner = N`. |
+| `--target-window-ms T` | Target window duration for `--inner auto` (default `1.0`). |
+| `--cold-cache` | Write a `--cache-flush-mb` scratch buffer before each sample to evict L2 + Infinity Cache. Implies `--inner=1` (otherwise later inner iterations refill the cache). |
+| `--cache-flush-mb M` | Scratch buffer size for `--cold-cache` (default `256`, sized for the MI300 Infinity Cache). |
+
+- **Warm cache, large `_inner`** (default): steady-state throughput, lowest variance.
+- **Cold cache, `_inner=1`**: isolated cold-memory cost — higher variance; bandwidth-bound benches (cast, norm) run ~1.5–3× slower than warm.
+
+## Kernel profiling
+
+`--kernel-profile` runs each benchmark once under `torch.profiler` instead of
+collecting timing distributions, and prints the GPU kernels it launched, sorted
+by total device time:
+
+```bash
+python driver.py bench_gemm --kernel-profile
+python bench_attention.py time_forward --kernel-profile   # one method
+```
+
+For each `(method, parameter combo)` it reports per-kernel total/avg CUDA time,
+launch count, and share of total — useful for spotting which kernel dominates or
+whether an op is launch-bound. This bypasses the timing machinery (`--inner`,
+`--cold-cache`, interleaving); `--profile-inner N` sets how many invocations are
+profiled per run (default `1`). Output is saved to
+`results/<commit-hash>-kernelprofile.json` unless `--no-save`.
+
+## Sample scheduling: interleaving
+
+By default the driver does **not** collect a benchmark's samples in one
+contiguous block. It samples in round-robin chunks: it sets up a group of
+`(method, combo)` benchmarks, then takes one sample from each per round, for `-n`
+rounds. Sequential scheduling (all of A, then all of B) makes wall-clock time a
+proxy for benchmark identity, so any time-correlated GPU noise (thermal ramp,
+DVFS throttle, a neighbor on a shared GPU) becomes a systematic **bias** between
+benchmarks rather than noise. Round-robin spreads every benchmark across the same
+window, so a transient lands on one sample of each. The per-round visit order is
+also randomly permuted (seeded, so runs are reproducible) to remove residual
+within-round phase/predecessor bias.
+
+| Flag | Effect |
+|---|---|
+| `--interleave-group N` (default `8`) | Benchmarks sampled round-robin together. Each keeps a live GPU instance, so **lower this if a group runs out of memory**. |
+| `--sequential` | Collect each benchmark's samples contiguously (≡ `--interleave-group 1`). Lowest memory, biased under thermal drift. |
+| `--seed S` (default `0`) | Seed for the per-round shuffle. |
+| `--no-shuffle` | Fixed round-robin order instead of permuting each round (debugging). |
+
+Interleaving removes *within-run* time-position bias. It does **not** remove a
+whole-run thermal offset between two separately produced result files, so for the
+comparison below, produce the baseline and candidate files back-to-back under
+similar conditions.
+
+## Comparing two checkouts statistically
+
+The driver records raw per-call samples; `compare_results.py` compares two result
+files with a Brunner-Munzel test via
+[benchstats](https://github.com/Arech/benchstats):
+
+```bash
+pip install -r requirements.txt   # benchstats (pulls rich, scipy, numpy)
+cd benchmarks/microbenchmarks/asv
+
+python driver.py --all -n 20      # on the baseline checkout -> results/<base>.json
+python driver.py --all -n 20      # on the candidate checkout -> results/<cand>.json
+python compare_results.py results/<base>.json results/<cand>.json
+```
+
+For each `(benchmark, parameter combination)` it marks the candidate faster
+(`>`), slower (`<`), or not significant (`~`), and exits `1` on a significant difference (CI gating).
+
+Two runs on the **same** commit (e.g. a dirty working tree, where `HEAD` is
+unchanged) would overwrite each other; pass `--label` to keep them distinct:
+
+```bash
+python driver.py --all -n 20 --label base   # -> results/<hash>-base.json
+python driver.py --all -n 20 --label cand   # -> results/<hash>-cand.json
+python compare_results.py results/<hash>-base.json results/<hash>-cand.json
+```
+
+| Flag | Effect |
+|---|---|
+| `--alpha A` | Significance level (default `0.001`). |
+| `--method M` | Statistical test (default `brunnermunzel`). |
+| `--filter REGEX` | Only compare benchmarks whose name matches `REGEX`. |
+| `--always-show-pvalues` | Show p-values for non-significant rows too. |
+| `--export-to FILE` | Save the report to `.txt`/`.svg`/`.html`. |
+
+The rank test needs a reasonable sample count (≥ ~10); the default `-n 20`
+satisfies this. Only timing is tested — throughput is constant work divided by
+time (a monotone transform), so a rank test on it would give the same verdict.
+
+## Writing a new benchmark
+
+Add `bench_<name>.py` with a `Bench*` class subclassing `BenchBase`. Pull model
+shapes from `models.py` so configs stay in one place.
+
+```python
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES
+
+class BenchSomething(BenchBase):
+    params = [M_SIZES, ["config_a", "config_b"]]
+    param_names = ["M", "config"]
+
+    def setup(self, M, config):
+        # Allocate tensors / modules. Runs once per (combo, method); the same
+        # instance is reused for warmup and timed iterations.
+        self.module = ...
+        self.x = ...  # also store any dims the work_* methods read (e.g. self.N, self.K)
+
+    def time_forward(self, M, config):
+        # self._time runs the callable _inner times in one CUDA-event window
+        # and returns seconds per single invocation (handles --cold-cache).
+        return self._time(lambda: self.module(self.x))
+
+    # Optional: work_<name> returns per-call work for throughput columns.
+    def work_forward(self, M, config):
+        return {"flops": 2 * M * self.N * self.K}   # or {"bytes": ...}
+
+if __name__ == "__main__":
+    run_as_main(__file__)
+```
+
+Rules:
+- `time_*` methods are timed automatically; time through `self._time(fn)`.
+- `work_<name>` companions return **per-call** work and yield TFLOPS (`flops`) or GB/s (`bytes`) columns.
+- Clear `.grad` attributes in backward benchmarks to prevent accumulation.
+- `params` is a cross-product — keep the matrix size reasonable.
diff --git a/benchmarks/microbenchmarks/asv/bench_attention.py b/benchmarks/microbenchmarks/asv/bench_attention.py
new file mode 100644
index 000000000..395bb07cb
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_attention.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Attention benchmarks via te.DotProductAttention (causal, GQA).
+
+Forward FLOPs  = 4 * batch * num_q_heads * seq_len^2 * head_dim
+  (Q@K^T and attn@V, each 2*b*h*s^2*d; causal masking is not discounted).
+Backward FLOPs ~= 2 * Forward FLOPs.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, attention_configs
+
+BATCH = 2  # batch dimension of every q/k/v tensor and of the FLOP counts
+MODELS = attention_configs()  # name -> (num_q_heads, num_kv_heads, head_dim, tp)
+
+
+class BenchAttention(BenchBase):  # times te.DotProductAttention: forward and forward+backward
+    params = [M_SIZES, list(MODELS)]  # M_SIZES used as seq_len
+    param_names = ["seq_len", "model"]
+
+    def setup(self, seq_len, model):  # builds the module, BF16 q/k/v inputs and an upstream gradient
+        n_q, n_kv, hd, tp = MODELS[model]
+        qh, kvh = n_q // tp, n_kv // tp  # head counts divided by tp (presumably the tensor-parallel degree)
+        dtype = torch.bfloat16
+        self.attn = te.DotProductAttention(
+            num_attention_heads=qh, kv_channels=hd,
+            num_gqa_groups=kvh, attn_mask_type="causal",
+        ).to(device="cuda", dtype=dtype)
+        self.q = torch.randn(seq_len, BATCH, qh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.k = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.v = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.attn(self.q, self.k, self.v))  # one forward in setup, only to size grad_out
+
+    def work_forward(self, seq_len, model):  # per-call FLOPs; the full seq_len^2 is counted despite the causal mask
+        n_q, _, hd, tp = MODELS[model]
+        return {"flops": 4 * BATCH * (n_q // tp) * seq_len * seq_len * hd}
+
+    def work_forward_backward(self, seq_len, model):  # 3x forward: backward is counted as 2x forward
+        n_q, _, hd, tp = MODELS[model]
+        return {"flops": 3 * 4 * BATCH * (n_q // tp) * seq_len * seq_len * hd}
+
+    def time_forward(self, seq_len, model):  # seconds per forward call, as returned by self._time
+        return self._time(lambda: self.attn(self.q, self.k, self.v))
+
+    def time_forward_backward(self, seq_len, model):  # seconds per forward + backward call
+        t = self._time(lambda: self.attn(self.q, self.k, self.v).backward(self.grad_out))
+        self.q.grad = self.k.grad = self.v.grad = None  # drop grads accumulated by the timed backward calls
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_casting.py b/benchmarks/microbenchmarks/asv/bench_casting.py
new file mode 100644
index 000000000..9f4399b03
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_casting.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Quantization (BF16 -> FP8) and dequantization (FP8 -> BF16) benchmarks.
+
+Covers E4M3 (activations/weights) and E5M2 (gradients). These casts are
+memory-bound, so we report GB/s (input + output bytes).
+"""
+
+import torch
+from transformer_engine.pytorch import Float8CurrentScalingQuantizer
+from transformer_engine_torch import DType as TE_DType
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, hidden_sizes
+
+HIDDEN = hidden_sizes()  # indexed by model name below: HIDDEN[model] -> hidden size
+
+# cast name -> (direction, fp8 dtype); direction is "quantize" or "dequantize"
+CAST_CONFIGS = {
+    "BF16_to_E4M3": ("quantize", TE_DType.kFloat8E4M3),
+    "E4M3_to_BF16": ("dequantize", TE_DType.kFloat8E4M3),
+    "BF16_to_E5M2": ("quantize", TE_DType.kFloat8E5M2),
+    "E5M2_to_BF16": ("dequantize", TE_DType.kFloat8E5M2),
+}
+
+
+class BenchCasting(BenchBase):  # times one FP8 quantize or dequantize of an (M, hidden) tensor
+    params = [M_SIZES, list(HIDDEN), list(CAST_CONFIGS)]
+    param_names = ["M", "model", "cast"]
+
+    def setup(self, M, model, cast):  # stores the operation to time as the closure self._call
+        hidden = HIDDEN[model]
+        direction, fp8_dtype = CAST_CONFIGS[cast]
+        quantizer = Float8CurrentScalingQuantizer(
+            fp8_dtype=fp8_dtype, device=torch.device("cuda"),
+            rowwise=True, columnwise=False,
+        )
+        if direction == "dequantize":  # quantize once here so only the dequantize is timed
+            x = quantizer.quantize(torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda"))
+            self._call = lambda: x.dequantize(dtype=torch.bfloat16)
+        else:
+            x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
+            self._call = lambda: quantizer.quantize(x)
+
+    def work_cast(self, M, model, cast):
+        # quantize: read BF16 (2B) + write FP8 (1B); dequantize: the reverse --
+        # 3 bytes/element either way (scale traffic is not counted).
+        return {"bytes": M * HIDDEN[model] * 3}
+
+    def time_cast(self, M, model, cast):  # times whichever closure setup() stored
+        return self._time(self._call)
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_gemm.py b/benchmarks/microbenchmarks/asv/bench_gemm.py
new file mode 100644
index 000000000..24319cf80
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_gemm.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""BF16 GEMM benchmarks via te.Linear.
+
+Shapes are the four transformer projections (QKV, AttnOut, GateUp, Down)
+derived from the models in models.py.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, gemm_shapes
+
+SHAPES = gemm_shapes()  # shape name -> (N, K), unpacked as "N, K = SHAPES[shape]" below
+
+
+class BenchGemm(BenchBase):  # times a bias-free BF16 te.Linear: forward and forward+backward
+    params = [M_SIZES, list(SHAPES)]
+    param_names = ["M", "shape"]
+
+    def setup(self, M, shape):  # builds the layer, an (M, K) input and an upstream gradient
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.linear(self.x))  # one forward in setup, only to size grad_out
+
+    def work_forward(self, M, shape):  # per-call FLOPs of one M x N x K GEMM
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}
+
+    def work_forward_backward(self, M, shape):  # 3x forward: backward is counted as two more GEMMs
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}
+
+    def time_forward(self, M, shape):  # seconds per forward call, as returned by self._time
+        return self._time(lambda: self.linear(self.x))
+
+    def time_forward_backward(self, M, shape):  # seconds per forward + backward call
+        t = self._time(lambda: self.linear(self.x).backward(self.grad_out))
+        self.x.grad = None  # drop grads accumulated by the timed backward calls
+        self.linear.weight.grad = None
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_gemm_fp8.py b/benchmarks/microbenchmarks/asv/bench_gemm_fp8.py
new file mode 100644
index 000000000..a6f761afa
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_gemm_fp8.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""FP8 GEMM benchmarks via te.Linear under fp8_autocast.
+
+Same shapes as bench_gemm.py but with FP8 (HYBRID) quantized compute.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+from transformer_engine.common.recipe import DelayedScaling, Format
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, gemm_shapes
+
+SHAPES = gemm_shapes()  # shape name -> (N, K), unpacked as "N, K = SHAPES[shape]" below
+FP8_RECIPE = DelayedScaling(  # one recipe object shared by every fp8_autocast region in this file
+    fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max",
+)
+
+
+class BenchGemmFP8(BenchBase):  # times te.Linear under fp8_autocast: forward and forward+backward
+    params = [M_SIZES, list(SHAPES)]
+    param_names = ["M", "shape"]
+
+    def setup(self, M, shape):  # builds the layer, an (M, K) input and an (M, N) upstream gradient
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn(M, N, dtype=dtype, device="cuda")  # allocated directly; no forward pass in setup
+
+    def work_forward(self, M, shape):  # per-call FLOPs of one M x N x K GEMM
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}
+
+    def work_forward_backward(self, M, shape):  # 3x forward: backward is counted as two more GEMMs
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}
+
+    def _forward(self):  # one forward pass inside an fp8_autocast region using FP8_RECIPE
+        with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+            return self.linear(self.x)
+
+    def time_forward(self, M, shape):  # seconds per forward call, as returned by self._time
+        return self._time(self._forward)
+
+    def time_forward_backward(self, M, shape):  # backward runs after the autocast region has exited
+        t = self._time(lambda: self._forward().backward(self.grad_out))
+        self.x.grad = None  # drop grads accumulated by the timed backward calls
+        self.linear.weight.grad = None
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_grouped_gemm.py b/benchmarks/microbenchmarks/asv/bench_grouped_gemm.py
new file mode 100644
index 000000000..58b1d27fb
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_grouped_gemm.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Grouped GEMM benchmarks via te.GroupedLinear (MoE GateUp / Down)."""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES_MOE, grouped_gemm_configs
+
+CONFIGS = grouped_gemm_configs()  # name -> (num_gemms, N, K)
+
+
+class BenchGroupedGemm(BenchBase):  # times te.GroupedLinear: forward and forward+backward
+    params = [M_SIZES_MOE, list(CONFIGS)]
+    param_names = ["M", "config"]
+
+    def setup(self, M, config):  # builds the module, B inputs of shape (M, K) and matching upstream gradients
+        B, N, K = CONFIGS[config]  # B is num_gemms (see the CONFIGS comment)
+        dtype = torch.bfloat16
+        self.module = te.GroupedLinear(
+            num_gemms=B, in_features=K, out_features=N, bias=False,
+        ).to(device="cuda", dtype=dtype)
+        self.xs = [
+            torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+            for _ in range(B)
+        ]
+        self.grad_outs = [torch.randn_like(o) for o in self.module(self.xs)]  # NOTE(review): upstream GroupedLinear.forward takes (inp, m_splits) -- confirm a list of inputs is accepted
+
+    def work_forward(self, M, config):  # per-call FLOPs: B GEMMs of M x N x K
+        B, N, K = CONFIGS[config]
+        return {"flops": B * 2 * M * N * K}
+
+    def work_forward_backward(self, M, config):  # 3x forward: backward is counted as two more GEMMs each
+        B, N, K = CONFIGS[config]
+        return {"flops": B * 3 * 2 * M * N * K}
+
+    def time_forward(self, M, config):  # seconds per forward call, as returned by self._time
+        return self._time(lambda: self.module(self.xs))
+
+    def time_forward_backward(self, M, config):  # backward is driven through torch.autograd.backward with self.grad_outs
+        t = self._time(lambda: torch.autograd.backward(self.module(self.xs), self.grad_outs))
+        for x in self.xs:  # drop grads accumulated by the timed backward calls
+            x.grad = None
+        for p in self.module.parameters():
+            p.grad = None
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/bench_normalization.py b/benchmarks/microbenchmarks/asv/bench_normalization.py
new file mode 100644
index 000000000..3412e4170
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/bench_normalization.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""RMSNorm and LayerNorm benchmarks on activation-sized tensors.
+
+Memory-bound; we report GB/s. The hidden dimension is swept over the distinct
+model hidden sizes and M (batch * seq_len) over typical training sizes.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import BenchBase, run_as_main
+from models import M_SIZES, unique_hidden_sizes
+
+NORMS = {"RMSNorm": te.RMSNorm, "LayerNorm": te.LayerNorm}  # norm_type param -> TE module class
+
+
+class BenchNormalization(BenchBase):  # times te.RMSNorm / te.LayerNorm: forward and forward+backward
+    params = [M_SIZES, unique_hidden_sizes(), list(NORMS)]
+    param_names = ["M", "hidden", "norm_type"]
+
+    def setup(self, M, hidden, norm_type):  # builds the norm module, an (M, hidden) input and an upstream gradient
+        dtype = torch.bfloat16
+        self.norm = NORMS[norm_type](hidden).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, hidden, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.norm(self.x))  # one forward in setup, only to size grad_out
+
+    def work_forward(self, M, hidden, norm_type):  # per-call bytes; only the (M, hidden) activations are counted
+        # read input (2B) + write output (2B)
+        return {"bytes": M * hidden * 4}
+
+    def work_forward_backward(self, M, hidden, norm_type):  # per-call bytes; only the (M, hidden) activations are counted
+        # fwd read+write (4B) + bwd read input+grad_out, write grad_in (6B)
+        return {"bytes": M * hidden * 10}
+
+    def time_forward(self, M, hidden, norm_type):  # seconds per forward call, as returned by self._time
+        return self._time(lambda: self.norm(self.x))
+
+    def time_forward_backward(self, M, hidden, norm_type):  # seconds per forward + backward call
+        t = self._time(lambda: self.norm(self.x).backward(self.grad_out))
+        self.x.grad = None  # drop grads accumulated by the timed backward calls
+        for p in self.norm.parameters():
+            p.grad = None
+        return t
+
+
+if __name__ == "__main__":
+    run_as_main(__file__)
diff --git a/benchmarks/microbenchmarks/asv/compare_results.py b/benchmarks/microbenchmarks/asv/compare_results.py
new file mode 100755
index 000000000..18ea2dd3b
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/compare_results.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Statistically compare two result JSON files written by ``driver.py``.
+
+A point-estimate (median) cannot tell a real regression from measurement noise.
+This tool compares the raw per-call samples stored in two result files (one per
+checkout) with a statistical test (Brunner-Munzel by default) via the benchstats
+package. For each (benchmark, parameter combination) it marks the candidate as
+faster (``>``), slower (``<``), or not significantly different (``~``), prints
+a per-direction summary, and exits ``1`` when a significant timing difference
+is found so it can gate CI. Requires ``pip install -r requirements.txt``.
+
+Usage:
+    # run the suite on each checkout (each saves <hash>.json), then:
+    python compare_results.py results/<base>.json results/<cand>.json
+    python compare_results.py base.json cand.json --alpha 0.01
+    python compare_results.py base.json cand.json --export-to report.svg
+"""
+
+import argparse
+import json
+import os
+import re
+import sys
+
+import numpy as np
+
+_TIME_KEY = "time_s"  # metric exposed to benchstats (seconds, lower is better)
+
+
+def _load_samples(path, name_filter=None):
+    """Load a driver result JSON into ``{bench_name: {"time_s": ndarray}}``.
+
+    One benchstats "benchmark" per (benchmark, parameter combination); the name
+    is ``<suite>.<Class>.<method> | name=val, ...``. Only timing is exposed:
+    throughput is a constant-work transform of time, so a rank test on it is
+    identical.
+    """
+    with open(path) as f:
+        data = json.load(f)
+    pattern = re.compile(name_filter) if name_filter else None  # None/empty filter keeps every benchmark
+
+    stats = {}
+    for bench_key, rec in data.get("results", {}).items():
+        param_names = rec.get("param_names") or []
+        for combo, samples in zip(rec.get("combos") or [], rec.get("samples") or []):  # combos[i] pairs with samples[i]
+            if not samples:  # nothing recorded for this combo
+                continue
+            arr = np.asarray(samples, dtype=np.float64)
+            arr = arr[np.isfinite(arr)]  # drop NaN / inf samples
+            if arr.size == 0:
+                continue
+            if param_names and len(param_names) == len(combo):
+                label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo))
+            else:  # names missing or mismatched: fall back to the bare values
+                label = ", ".join(str(v) for v in combo)
+            name = bench_key + (" | " + label if label else "")
+            if pattern is not None and pattern.search(name) is None:  # regex is searched anywhere in the name
+                continue
+            stats[name] = {_TIME_KEY: arr}
+    return stats
+
+
+def run_stats(args):
+    """Compare two result JSONs; return 1 if a significant difference is found, else 0."""
+    from benchstats.compare import compareStats  # imported inside the function, so only needed when it runs
+    from benchstats.render import renderComparisonResults
+    from benchstats.common import LoggingConsole, detectExportFormat
+
+    main_metrics = [_TIME_KEY]
+    export_fmt = detectExportFormat(args.export_to, None) if args.export_to else None  # None when no export was requested
+    if export_fmt is not None and os.path.isfile(args.export_to):
+        os.remove(args.export_to)  # remove an existing file at the export path before writing a new one
+
+    console = LoggingConsole(
+        record=export_fmt is not None, log_level=LoggingConsole.LogLevel.Warning,
+    )
+
+    s1 = _load_samples(args.baseline_json, args.filter)  # baseline samples
+    s2 = _load_samples(args.candidate_json, args.filter)  # candidate samples
+
+    cr = compareStats(
+        s1, s2, method=args.method, alpha=args.alpha,
+        main_metrics=main_metrics, debug_log=console,
+    )
+    renderComparisonResults(
+        cr, console, main_metrics=main_metrics,
+        always_show_pvalues=args.always_show_pvalues,
+    )
+
+    # benchstats encodes each comparison as baseline-vs-candidate: "<" means
+    # baseline < candidate (candidate slower -> regression), ">" means candidate
+    # faster, "~" means not significant at alpha.
+    for metric in main_metrics:
+        counts = {"<": 0, ">": 0, "~": 0}
+        for bm_res in cr.results.values():
+            res = bm_res.get(metric)
+            if res is not None:
+                counts[res.result] = counts.get(res.result, 0) + 1  # .get tolerates a symbol outside the three above
+        total = sum(counts.values())
+        console.print(
+            f"\nSummary for '{metric}' ({cr.method}, alpha={cr.alpha:g}, "
+            f"{total} benchmarks):"
+        )
+        console.print(f"  candidate faster (significant, '>'): {counts['>']}")
+        console.print(f"  candidate slower (significant, '<'): {counts['<']}")
+        console.print(f"  no significant difference ('~'):     {counts['~']}")
+
+    if export_fmt is not None:  # dispatch on the detected format and save the recorded console output
+        {"txt": lambda: console.save_text(args.export_to),
+         "svg": lambda: console.save_svg(args.export_to, title=""),
+         "html": lambda: console.save_html(args.export_to)}[export_fmt]()
+
+    if cr.at_least_one_differs:
+        console.warning("At least one significant timing difference was detected (exit 1).")
+        return 1
+    return 0
+
+
+def main():  # CLI entry point: parse arguments and return run_stats()'s exit status
+    parser = argparse.ArgumentParser(
+        description="Statistically compare two driver result JSONs via benchstats.")
+    parser.add_argument("baseline_json", help="Baseline result JSON")
+    parser.add_argument("candidate_json", help="Candidate result JSON")
+    parser.add_argument("--filter", default=None,
+                        help="Only compare benchmarks whose name matches this regex.")
+    parser.add_argument("--alpha", type=float, default=0.001,
+                        help="Significance level for the test (default: 0.001).")
+    parser.add_argument("--method", default="brunnermunzel",
+                        help="Statistical test to use (default: brunnermunzel).")
+    parser.add_argument("--always-show-pvalues", action="store_true",
+                        help="Show p-values for non-significant rows too.")
+    parser.add_argument("--export-to", default=None, metavar="FILE",
+                        help="Export the report to a .txt/.svg/.html file (format from extension).")
+    return run_stats(parser.parse_args())  # 1 if a significant difference was found, else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/microbenchmarks/asv/driver.py b/benchmarks/microbenchmarks/asv/driver.py
new file mode 100644
index 000000000..1443515f7
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/driver.py
@@ -0,0 +1,593 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""In-process microbenchmark driver.
+
+Discovers ``Bench*`` classes in ``bench_*.py`` files, runs their ``time_*``
+methods with robust GPU timing (inner-loop amortization, optional cold cache,
+round-robin interleaving), prints a table with throughput, and saves the raw
+per-call samples to JSON for ``compare_results.py``.
+
+Usage:
+    python driver.py <suite> [method_filter] [-w W] [-n N] [--no-save]
+    python driver.py --all [-w W] [-n N]
+    python bench_gemm.py [method_filter] [-w W] [-n N]      # bench file as main
+"""
+
+import argparse
+import glob
+import importlib
+import itertools
+import json
+import os
+import random
+import re
+import subprocess
+import sys
+import time
+
+import numpy as np
+
+
+# ---------------------------------------------------------------------------
+# Benchmark base class
+# ---------------------------------------------------------------------------
+
+class BenchBase:
+    """Shared parent of the benchmark classes: per-run knobs plus the timer.
+
+    ``_inner`` is the number of kernel invocations inside one CUDA-event window
+    (amortizing launch and event overhead); ``_scratch`` is an optional buffer
+    filled before each sample to push earlier data out of the GPU cache
+    (``--cold-cache``). The driver assigns both; subclasses call :meth:`_time`.
+    """
+
+    _inner = 1  # calls per timed window unless the driver overrides it
+    _scratch = None  # no cache-flush buffer unless the driver installs one
+
+    def _time(self, fn):
+        """Time ``_inner`` back-to-back calls of *fn*; return seconds per call.
+
+        One pair of CUDA events, created on first use and cached on the instance,
+        brackets the loop. A non-None ``_scratch`` is filled before the first
+        event is recorded. Elapsed milliseconds are converted to seconds.
+        """
+        import torch  # deferred: driver stays importable without torch
+        if getattr(self, "_evt", None) is None:
+            self._evt = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
+        start, stop = self._evt
+        if self._scratch is not None:
+            self._scratch.fill_(1.0)
+        start.record()
+        for _ in range(self._inner):
+            fn()
+        stop.record()
+        torch.cuda.synchronize()
+        return start.elapsed_time(stop) / 1000 / self._inner
+
+
+# ---------------------------------------------------------------------------
+# Results
+# ---------------------------------------------------------------------------
+
+def _get_commit_hash():
+    """Return the hash of git HEAD, or 'unknown' when the git call fails for any reason."""
+    cmd = ["git", "rev-parse", "HEAD"]
+    try:
+        raw = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)
+        return raw.decode().strip()
+    except Exception:
+        return "unknown"
+
+
+def _results_dir():  # absolute path of the "results" directory next to this file
+    return os.path.join(os.path.abspath(os.path.dirname(__file__)), "results")
+
+
+def save_results(all_results, label=None, results_dir=None):
+    """Merge *all_results* into ``<results_dir>/<hash>[-<label>].json`` and write it.
+
+    An existing file of the same name is loaded and its ``results`` mapping
+    updated, so repeated runs accumulate. A non-empty *label* is sanitized and
+    appended to the 8-character commit prefix to keep same-commit runs apart.
+    """
+    head = _get_commit_hash()
+    out_dir = results_dir if results_dir else _results_dir()
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Runs of characters outside [A-Za-z0-9._-] in the label collapse to '_'.
+    tag = "-" + re.sub(r"[^A-Za-z0-9._-]+", "_", label).strip("_") if label else ""
+    target = os.path.join(out_dir, f"{head[:8]}{tag}.json")
+
+    # Start a fresh record only when no file exists yet for this name.
+    if not os.path.exists(target):
+        payload = {"commit_hash": head, "date": int(time.time() * 1000), "results": {}}
+    else:
+        with open(target) as fh:
+            payload = json.load(fh)
+    payload["results"].update(all_results)
+
+    with open(target, "w") as fh:
+        json.dump(payload, fh, indent=2)
+    print(f"\nResults saved to {target}")
+
+
+def _compute_stats(samples):
+    """Summarize *samples* as ``(median, mean, stdev, q25, q75)`` of Python floats.
+
+    The quartiles come from ``np.quantile`` with its default interpolation; the
+    standard deviation is the population one (``ddof=0``).
+    """
+    arr = np.asarray(samples, dtype=np.float64)
+    q25, median, q75 = np.quantile(arr, [0.25, 0.5, 0.75]).tolist()
+    return median, float(np.mean(arr)), float(np.std(arr, ddof=0)), q25, q75
+
+
+# ---------------------------------------------------------------------------
+# Runner
+# ---------------------------------------------------------------------------
+
+def _make_scratch(mb):
+    """Create the float32 CUDA buffer that ``--cold-cache`` fills before each sample.
+
+    *mb* is the size in MiB (``mb * 1024 * 1024`` bytes, 4 bytes per element);
+    at least one element is always allocated.
+    """
+    import torch  # deferred: only needed when cold-cache is on
+    count = mb * 1024 ** 2 // 4  # number of float32 elements that fit in mb MiB
+    return torch.empty(max(1, count), dtype=torch.float32, device="cuda")
+
+
+def _autotune_inner(instance, method_name, combo, target_s, max_inner=10000):
+    """Choose how many calls one timed window needs to last at least *target_s*.
+
+    The method is invoked twice with ``_inner`` forced to 1: the first result is
+    thrown away, the second is the per-call estimate. ``_inner`` is then restored.
+    """
+    timed_call = getattr(instance, method_name)
+    previous = instance._inner
+    instance._inner = 1
+    try:
+        timed_call(*combo)              # throwaway: first-call effects land here
+        per_call = timed_call(*combo)   # seconds for one invocation
+    finally:
+        instance._inner = previous
+    if per_call is not None and not per_call <= 0:
+        return max(1, min(max_inner, int(target_s / per_call) + 1))
+    return 1
+
+
+def _free_gpu_cache():
+    """Call ``torch.cuda.empty_cache()`` if torch is already imported; errors are ignored."""
+    loaded = sys.modules.get("torch")
+    try:
+        if loaded is not None:
+            loaded.cuda.empty_cache()
+    except Exception:
+        pass
+
+
+def run_class(
+    suite_name, cls, class_name, method_filter=None,
+    warmup=3, iters=7,
+    inner="auto", target_window_ms=1.0,
+    cold_cache=False, cache_flush_mb=256,
+    interleave_group=8, rng=None, shuffle=True,
+):
+    """Run all ``time_*`` methods in *cls*, returning a ``{bench_key: record}`` dict.
+
+    Samples are collected in round-robin chunks of ``interleave_group``
+    ``(method, combo)`` benchmarks: one sample from each per round, for *iters*
+    rounds. This spreads every benchmark's samples across the same wall-clock
+    window so time-correlated GPU noise (thermal ramp, DVFS throttle) becomes
+    shared variance rather than a bias on whichever benchmark owned a contiguous
+    block of time. ``interleave_group=1`` reproduces sequential behavior; larger
+    groups interleave more but keep that many GPU instances live at once.
+
+    When *shuffle* is true the per-round visit order is randomly permuted (seeded
+    by *rng*), making each benchmark's within-round phase and predecessor uniform
+    in expectation, turning residual ordering bias into variance. The per-round
+    structure is kept (each benchmark still gets exactly *iters* evenly-spread
+    samples) -- a balanced randomized design, not a global shuffle.
+    """
+    methods = sorted(m for m in dir(cls) if m.startswith("time_"))
+    if method_filter:
+        methods = [m for m in methods if method_filter in m]
+    if not methods:
+        return {}
+
+    params = getattr(cls, "params", [])  # no params -> product() yields one empty combo
+    param_names = list(getattr(cls, "param_names", []))
+    combos = list(itertools.product(*params))
+    n_combos = len(combos)
+
+    # Discover throughput columns from work_* companions.
+    # Each entry: (dict_key, column_header, unit_divisor).
+    probe_keys = set()
+    for m in methods:
+        wfn = getattr(cls, "work_" + m[5:], None)
+        if wfn:
+            try:
+                probe_keys.update(wfn(cls(), *combos[0]))
+            except Exception:
+                pass
+    throughput_cols = []
+    if "flops" in probe_keys:
+        throughput_cols.append(("flops", "TFLOPS", 1e12))
+    if "bytes" in probe_keys:
+        throughput_cols.append(("bytes", "GB/s", 1e9))
+
+    target_window_s = target_window_ms / 1000.0
+    group = max(1, int(interleave_group))
+    if rng is None:
+        rng = random.Random(0)
+    inner_desc = (
+        "cold-cache (inner=1)" if cold_cache
+        else f"inner={inner}" if inner != "auto"
+        else f"inner=auto (>={target_window_ms:g}ms window)"
+    )
+    sched_desc = ("sequential" if group == 1
+                  else f"interleaved group={group}, " + ("shuffled" if shuffle else "fixed-order"))
+    print(f"\n{class_name}  ({len(combos)} combos x {len(methods)} methods, "
+          f"{warmup} warmup, {iters} timed, {inner_desc}, {sched_desc})")
+    extra_hdr = "".join(f"  {label:>10}" for _, label, _ in throughput_cols)
+    HDR = (f"  {'median':>10}  {'mean':>10}  {'stdev':>10}"
+           f"  {'q25':>10}  {'q75':>10}  {'min':>10}  {'max':>10}"
+           + extra_hdr + f"  {'inner':>5}  {'method':<30}  params")
+    print("-" * len(HDR))
+    print(HDR)
+    print("-" * len(HDR))
+
+    def _label(combo):
+        return ", ".join(f"{nm}={v}" for nm, v in zip(param_names, combo))
+
+    # Samples per method, indexed by combo position. Filling by index decouples
+    # the wire format from the order samples are actually collected in, so
+    # interleaved scheduling leaves the saved JSON identical to sequential.
+    samples_by_method = {m: [None] * n_combos for m in methods}
+
+    # Flatten to (method, combo) tasks, method-major so printed rows keep their
+    # grouping, then sample them in round-robin chunks.
+    tasks = [(mi, ci) for mi in range(len(methods)) for ci in range(n_combos)]
+
+    for chunk_start in range(0, len(tasks), group):
+        chunk = tasks[chunk_start:chunk_start + group]
+
+        # Setup phase: prepare every benchmark in the chunk (allocate tensors,
+        # pick _inner, warm up) and keep its instance live for round-robin timing.
+        live = []  # (instance, method_name, combo, combo_idx)
+        for mi, ci in chunk:
+            method_name = methods[mi]
+            combo = combos[ci]
+            instance = cls()
+            try:
+                instance.setup(*combo)
+            except Exception as e:
+                print(f"  SKIP  {_label(combo)}  setup failed: {e}")
+                continue  # leaves None in this (method, combo) slot
+
+            # Cold-cache mode forces inner=1 so only the first invocation in the
+            # window sees a cold cache; otherwise the 2nd..Nth would refill it.
+            if cold_cache:
+                instance._scratch = _make_scratch(cache_flush_mb)
+                instance._inner = 1
+            elif inner == "auto":
+                instance._inner = _autotune_inner(
+                    instance, method_name, combo, target_window_s)
+            else:
+                instance._inner = max(1, int(inner))
+
+            method = getattr(instance, method_name)
+            for _ in range(warmup):
+                method(*combo)
+            live.append((instance, method_name, combo, ci))
+
+        # Timed phase: one sample from each live benchmark per round, so a
+        # transient spike lands on one sample of each rather than corrupting a
+        # whole benchmark's contiguous block. Visit order is re-permuted each
+        # round (when shuffle is on); chunk_samples stays keyed by index i.
+        chunk_samples = [[] for _ in live]
+        order = list(range(len(live)))
+        for _ in range(iters):
+            if shuffle and len(order) > 1:
+                rng.shuffle(order)
+            for i in order:
+                instance, method_name, combo, ci = live[i]
+                method = getattr(instance, method_name)
+                t0 = time.perf_counter()  # wall clock, used only if the method returns None
+                result = method(*combo)
+                wall = time.perf_counter() - t0
+                chunk_samples[i].append(wall if result is None else result)
+
+        # Finalize: stats, throughput, print, store into the combo slot.
+        for i, (instance, method_name, combo, ci) in enumerate(live):
+            samples = chunk_samples[i]
+            median, mean, stdev, q25, q75 = _compute_stats(samples)
+            s_min, s_max = min(samples), max(samples)
+
+            # Raw samples (seconds) for statistical comparison; rounded to 1 ns
+            # to keep the JSON compact without losing timing resolution.
+            samples_by_method[method_name][ci] = [round(x, 9) for x in samples]
+
+            work = {}
+            wfn = getattr(instance, "work_" + method_name[5:], None)
+            if wfn and median > 0:
+                try:
+                    work = wfn(*combo)
+                except Exception:
+                    pass
+            extra_cols = ""
+            for key, _, divisor in throughput_cols:
+                if key in work and median > 0:
+                    extra_cols += f"  {work[key] / median / divisor:>10.1f}"
+                else:
+                    extra_cols += f"  {'':>10}"
+
+            print(f"  {median*1000:>8.3f}ms  {mean*1000:>8.3f}ms  "
+                  f"{stdev*1000:>8.3f}ms  {q25*1000:>8.3f}ms  {q75*1000:>8.3f}ms  "
+                  f"{s_min*1000:>8.3f}ms  {s_max*1000:>8.3f}ms"
+                  f"{extra_cols}  "
+                  f"{instance._inner:>5}  {method_name:<30}  {_label(combo)}")
+
+        live.clear()
+        _free_gpu_cache()
+
+    combos_json = [list(c) for c in combos]
+    return {
+        f"{suite_name}.{class_name}.{m}": {
+            "param_names": param_names,
+            "combos": combos_json,
+            "samples": samples_by_method[m],
+        }
+        for m in methods
+    }
+
+
+# ---------------------------------------------------------------------------
+# Kernel profiling
+# ---------------------------------------------------------------------------
+
+_KERNEL_NAME_MAX_WIDTH = 80
+
+
+def _shorten_kernel_name(name):
+    """Shorten verbose C++/HIP kernel names for readable output.
+
+    Strips a leading 'void ', removes template arguments at any nesting depth,
+    collapses whitespace, and truncates to ``_KERNEL_NAME_MAX_WIDTH``.
+    """
+    s = name[5:] if name.startswith("void ") else name
+    n = 1
+    while n:  # peel innermost <...> groups until a pass removes nothing
+        s, n = re.subn(r"<[^<>]*>", "", s)
+    s = " ".join(s.split())
+    return s if len(s) <= _KERNEL_NAME_MAX_WIDTH else s[:_KERNEL_NAME_MAX_WIDTH - 3] + "..."
+
+
+def profile_class(suite_name, cls, class_name, method_filter=None, warmup=3, inner=1):
+    """Per-kernel CUDA-time breakdown for each time_* method x parameter combo.
+
+    Unlike :func:`run_class` (timing distributions), this runs each benchmark
+    once under ``torch.profiler`` and reports the GPU kernels it launched, sorted
+    by total device time. Returns ``{bench_key: {combo_label: [kernel_row, ...]}}``.
+    """
+    import torch
+    from torch.profiler import profile, ProfilerActivity
+
+    methods = sorted(m for m in dir(cls) if m.startswith("time_"))
+    if method_filter:
+        methods = [m for m in methods if method_filter in m]
+    if not methods:
+        return {}
+
+    params = getattr(cls, "params", [])  # no params -> product() yields one empty combo
+    param_names = list(getattr(cls, "param_names", []))
+    combos = list(itertools.product(*params))
+
+    def _label(combo):
+        return ", ".join(f"{nm}={v}" for nm, v in zip(param_names, combo))
+
+    out = {}
+    for method_name in methods:
+        bench_key = f"{suite_name}.{class_name}.{method_name}"
+        out[bench_key] = {}
+        for combo in combos:
+            instance = cls()
+            try:
+                instance.setup(*combo)
+            except Exception as e:
+                print(f"  SKIP  {_label(combo)}  setup failed: {e}")
+                continue
+            instance._inner = max(1, int(inner))
+            method = getattr(instance, method_name)
+            for _ in range(warmup):
+                method(*combo)
+            with profile(activities=[ProfilerActivity.CUDA]) as prof:
+                method(*combo)
+                torch.cuda.synchronize()
+
+            # Keep only events with device time, largest first.
+            events = [e for e in prof.key_averages() if e.self_device_time_total > 0]
+            events.sort(key=lambda e: e.self_device_time_total, reverse=True)
+            total = sum(e.self_device_time_total for e in events)
+
+            w = _KERNEL_NAME_MAX_WIDTH
+            hdr = (f"  {'kernel':<{w}}  {'total us':>11}  {'calls':>6}"
+                   f"  {'avg us':>10}  {'%':>6}")
+            print(f"\n{bench_key}  ({_label(combo)})")
+            print(hdr)
+            print("  " + "-" * (len(hdr) - 2))
+            rows = []
+            for e in events:
+                avg = e.self_device_time_total / e.count if e.count else 0.0
+                pct = 100.0 * e.self_device_time_total / total if total else 0.0
+                print(f"  {_shorten_kernel_name(e.key):<{w}}  {e.self_device_time_total:>11.1f}"
+                      f"  {e.count:>6}  {avg:>10.2f}  {pct:>5.1f}%")
+                rows.append({
+                    "kernel": e.key, "total_us": round(e.self_device_time_total, 1),
+                    "calls": e.count, "avg_us": round(avg, 2), "pct": round(pct, 1),
+                })
+            print(f"  {'TOTAL':<{w}}  {total:>11.1f}")
+            out[bench_key][_label(combo)] = rows
+    return out
+
+
+def save_kernel_profile(all_profiles, label=None, results_dir=None):
+    """Dump *all_profiles* to ``<results_dir>/<hash>[-<label>]-kernelprofile.json``."""
+    head = _get_commit_hash()
+    out_dir = results_dir if results_dir else _results_dir()
+    os.makedirs(out_dir, exist_ok=True)
+    tag = "-" + re.sub(r"[^A-Za-z0-9._-]+", "_", label).strip("_") if label else ""
+    target = os.path.join(out_dir, f"{head[:8]}{tag}-kernelprofile.json")
+    with open(target, "w") as fh:
+        payload = {
+            "commit_hash": head,
+            "date": int(time.time() * 1000),
+            "kernel_profile": all_profiles,
+        }
+        json.dump(payload, fh, indent=2)
+    print(f"\nKernel profile saved to {target}")
+
+
+def run_as_main(caller_file=None):
+    """Run benchmarks from a bench file's ``__main__`` block or the command line.
+
+    Pass ``__file__`` as *caller_file* to run just that suite; otherwise the CLI picks it::
+
+        if __name__ == "__main__":
+            from driver import run_as_main
+            run_as_main(__file__)
+    """
+    parser = argparse.ArgumentParser(
+        description="Run microbenchmarks in-process (no subprocess overhead).")
+    if caller_file is None:
+        parser.add_argument("suite", nargs="?", default=None,
+                            help="Benchmark module name (e.g. bench_casting)")
+        parser.add_argument("--all", action="store_true",
+                            help="Run all bench_*.py suites in the directory")
+    parser.add_argument("method_filter", nargs="?", default=None,
+                        help="Only run time_* methods containing this string")
+    parser.add_argument("-w", "--warmup", type=int, default=10,
+                        help="Number of warmup iterations (default: 10)")
+    parser.add_argument("-n", "--iters", type=int, default=20,
+                        help="Number of timed iterations (default: 20)")
+    parser.add_argument("--inner", default="auto",
+                        help="Inner kernel invocations per timed window: 'auto' "
+                             "(tune to --target-window-ms) or an integer "
+                             "(default: auto). Larger values amortize CUDA event "
+                             "and kernel-launch overhead.")
+    parser.add_argument("--target-window-ms", type=float, default=1.0,
+                        help="Target duration of one timed window when "
+                             "--inner=auto (default: 1.0 ms).")
+    parser.add_argument("--cold-cache", action="store_true",
+                        help="Flush the GPU cache (write a >LLC scratch buffer) "
+                             "before each sample. Forces --inner=1 because "
+                             "subsequent inner calls would refill the cache.")
+    parser.add_argument("--cache-flush-mb", type=int, default=256,
+                        help="Size in MB of the cache-flush buffer for "
+                             "--cold-cache (default: 256, sized for the MI300 "
+                             "Infinity Cache).")
+    parser.add_argument("--interleave-group", type=int, default=8,
+                        help="Number of (method, combo) benchmarks sampled "
+                             "round-robin together so time-correlated GPU noise "
+                             "is shared across them instead of biasing whichever "
+                             "benchmark owns a contiguous block of time "
+                             "(default: 8). Each keeps a live GPU instance, so "
+                             "lower this on out-of-memory. 1 = sequential.")
+    parser.add_argument("--sequential", action="store_true",
+                        help="Collect each benchmark's samples contiguously "
+                             "(equivalent to --interleave-group 1). Lowest "
+                             "memory, but biased under thermal drift.")
+    parser.add_argument("--seed", type=int, default=0,
+                        help="Seed for the per-round shuffle of the interleave "
+                             "order (default: 0), kept fixed for reproducibility.")
+    parser.add_argument("--no-shuffle", action="store_true",
+                        help="Disable the per-round random permutation and use a "
+                             "fixed round-robin order, leaving a small residual "
+                             "ordering bias.")
+    parser.add_argument("--kernel-profile", action="store_true",
+                        help="Profile per-kernel CUDA time via torch.profiler "
+                             "instead of measuring timing distributions. Runs each "
+                             "benchmark once and prints a per-kernel breakdown "
+                             "(saved to <hash>-kernelprofile.json unless --no-save).")
+    parser.add_argument("--profile-inner", type=int, default=1,
+                        help="Kernel invocations per profiled run in "
+                             "--kernel-profile mode (default: 1).")
+    parser.add_argument("--no-save", action="store_true",
+                        help="Skip saving results to JSON.")
+    parser.add_argument("--label", default=None,
+                        help="Tag folded into the result filename "
+                             "(<hash>-<label>.json). Use it to keep multiple runs "
+                             "on the same commit in distinct files for comparison.")
+    args = parser.parse_args()
+    if args.inner != "auto":
+        try:
+            args.inner = max(1, int(args.inner))
+        except ValueError:
+            parser.error("--inner must be 'auto' or a positive integer")
+    if args.iters < 1:  # zero timed rounds would leave no samples to summarize
+        parser.error("-n/--iters must be at least 1")
+    args.interleave_group = 1 if args.sequential else max(1, args.interleave_group)
+
+    if caller_file is not None:
+        script_dir = os.path.dirname(os.path.abspath(caller_file))
+        suite_names = [os.path.splitext(os.path.basename(caller_file))[0]]
+    else:
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        if getattr(args, "all", False):
+            if args.suite and args.method_filter is None:
+                args.method_filter = args.suite  # lone positional after --all is the filter
+            suite_names = sorted(
+                os.path.splitext(os.path.basename(f))[0]
+                for f in glob.glob(os.path.join(script_dir, "bench_*.py")))
+        elif args.suite:
+            suite_names = [args.suite]
+        else:
+            parser.error("provide a suite name or use --all")
+
+    os.chdir(script_dir)
+    if script_dir not in sys.path:
+        sys.path.insert(0, script_dir)
+
+    # One RNG for the whole run so the interleave order is reproducible given
+    # --seed; shared across classes so the stream is deterministic end-to-end.
+    rng = random.Random(args.seed)
+    shuffle = not args.no_shuffle
+    if not args.kernel_profile and args.interleave_group > 1 and shuffle:
+        print(f"Interleave: group={args.interleave_group}, shuffled (seed={args.seed})")
+
+    all_results, all_profiles = {}, {}
+    for suite_name in suite_names:
+        mod = importlib.import_module(suite_name)
+        for name in sorted(dir(mod)):
+            obj = getattr(mod, name)
+            # Any Bench* class that defines a time_* method (excludes BenchBase,
+            # and is robust to the bench-file/driver __main__ double-import).
+            if not (isinstance(obj, type) and name.startswith("Bench")
+                    and any(m.startswith("time_") for m in dir(obj))):
+                continue
+            if args.kernel_profile:
+                all_profiles.update(profile_class(
+                    suite_name, obj, name, args.method_filter,
+                    warmup=args.warmup, inner=args.profile_inner,
+                ))
+            else:
+                all_results.update(run_class(
+                    suite_name, obj, name, args.method_filter,
+                    warmup=args.warmup, iters=args.iters,
+                    inner=args.inner, target_window_ms=args.target_window_ms,
+                    cold_cache=args.cold_cache, cache_flush_mb=args.cache_flush_mb,
+                    interleave_group=args.interleave_group, rng=rng, shuffle=shuffle,
+                ))
+
+    if args.kernel_profile:
+        if all_profiles and not args.no_save:
+            save_kernel_profile(all_profiles, label=args.label)
+    elif all_results and not args.no_save:
+        save_results(all_results, label=args.label)
+
+
+if __name__ == "__main__":
+    run_as_main()  # no caller_file: the suite (or --all) is taken from the command line
diff --git a/benchmarks/microbenchmarks/asv/models.py b/benchmarks/microbenchmarks/asv/models.py
new file mode 100644
index 000000000..50a71393a
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/models.py
@@ -0,0 +1,89 @@
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Shared model configurations and shape derivations for the microbenchmarks.
+
+Single source of truth for the model shapes every ``bench_*.py`` sweeps over,
+so a new model is added in one place. Config sources:
+
+  - Llama 3.1 8B   https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
+  - Llama 3.1 70B  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
+  - Llama 3.1 405B https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
+  - Qwen 2.5 7B    https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
+  - Qwen 2.5 72B   https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
+  - MoE configs    https://github.com/AMD-AGI/Primus-Turbo/blob/main/benchmark/ops/config.py
+"""
+
+# Token-count (batch * seq_len) sweeps shared across suites.
+M_SIZES = [1024, 2048, 4096, 8192]
+M_SIZES_MOE = [512, 1024, 2048, 4096]  # presumably the sweep for the MoE suites -- confirm
+
+# Dense transformer models, keyed by "<family>_TP<tp>" (hidden_sizes() splits on "_TP").
+# Value = (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp); gemm_shapes() divides by tp.
+MODELS = {
+    "Llama3-8B_TP1":   (4096,  14336,  32, 8, 128, 1),
+    "Llama3-8B_TP8":   (4096,  14336,  32, 8, 128, 8),
+    "Llama3-70B_TP8":  (8192,  28672,  64, 8, 128, 8),
+    "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8),
+    "Qwen2.5-7B_TP1":  (3584,  18944,  28, 4, 128, 1),
+    "Qwen2.5-72B_TP8": (8192,  29568,  64, 8, 128, 8),
+}
+
+# MoE models for grouped GEMM (see grouped_gemm_configs): (num_routed_experts, moe_intermediate, hidden).
+MOE_MODELS = {
+    "DSV2-Lite": (64, 1408, 2048),
+    "DSV2":      (160, 1536, 5120),
+    "DSV3":      (256, 2048, 7168),
+    "Grok-V2":   (8, 16384, 8192),
+}
+
+
+def attention_configs(models=MODELS):
+    """Map each model name to its ``(num_q_heads, num_kv_heads, head_dim, tp)`` slice."""
+    return {key: models[key][2:6] for key in models}
+
+
+def gemm_shapes(models=MODELS):
+    """Build ``{shape_name: (N, K)}`` covering the four transformer projections.
+
+    Per model, in order: QKV, AttnOut, GateUp (SwiGLU) and Down, one dimension divided by ``tp``.
+    """
+    table = {}
+    for model, (d_model, d_ff, q_heads, kv_heads, head_dim, tp) in models.items():
+        table[f"{model}-QKV"] = ((q_heads + 2 * kv_heads) * head_dim // tp, d_model)
+        table[f"{model}-AttnOut"] = (d_model, q_heads * head_dim // tp)
+        table[f"{model}-GateUp"] = (2 * d_ff // tp, d_model)
+        table[f"{model}-Down"] = (d_model, d_ff // tp)
+    return table
+
+
+def grouped_gemm_configs(models=MOE_MODELS, eps=(32, 16, 8)):
+    """Build ``{config_name: (num_gemms, N, K)}`` for the MoE GateUp/Down GEMMs.
+
+    A (model, EP size) pair is emitted only when the EP size divides the expert count.
+    """
+    table = {}
+    for model, (n_experts, inter, hidden) in models.items():
+        for ep in eps:
+            local_experts, leftover = divmod(n_experts, ep)
+            if leftover:
+                continue
+            table[f"{model}_EP{ep}-GateUp"] = (local_experts, 2 * inter, hidden)
+            table[f"{model}_EP{ep}-Down"] = (local_experts, hidden, inter)
+    return table
+
+
+def hidden_sizes(models=MODELS):
+    """Map each model family (the name before ``_TP``) to its hidden size; first entry wins."""
+    sizes = {}
+    for key in models:
+        family, _, _ = key.partition("_TP")
+        sizes.setdefault(family, models[key][0])
+    return sizes
+
+
+def unique_hidden_sizes(models=MODELS):
+    """List every distinct hidden dimension among *models* in ascending order."""
+    return sorted({size for size in hidden_sizes(models).values()})
diff --git a/benchmarks/microbenchmarks/asv/requirements.txt b/benchmarks/microbenchmarks/asv/requirements.txt
new file mode 100644
index 000000000..fb32ecc15
--- /dev/null
+++ b/benchmarks/microbenchmarks/asv/requirements.txt
@@ -0,0 +1,4 @@
+# Extra dependencies for statistical benchmark comparison (compare_results.py).
+# benchstats pulls in rich, scipy and numpy.
+numpy
+benchstats>=3.4.1