diff --git a/.github/ISSUE_TEMPLATE/new_suite.md b/.github/ISSUE_TEMPLATE/new_suite.md
new file mode 100644
index 00000000..4c358a18
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/new_suite.md
@@ -0,0 +1,75 @@
+---
+name: Propose a new suite
+about: Propose a new benchmark suite (new model, scenario mix, or scaling axis)
+title: "[Suite] "
+labels: suite-proposal
+assignees: ''
+---
+
+
+
+## Why this suite?
+
+
+
+## Suite contract (draft)
+
+| Field | Proposed value |
+|---|---|
+| **Suite ID** | `suite_<X>` |
+| **Model** | `<org/model-name>` |
+| **Model revision** | `<commit SHA or tag>` |
+| **Chip count** | `1` / `auto` / specific number |
+| **Precision** | `BF16` / `FP16` / list of allowed precisions |
+| **Dataset** | existing (`sharegpt_standard_v1`, `sharegpt_edge_v1`, `sharegpt_longctx_v1`) or new |
+| **Max model length** | tokens |
+| **Output tokens (max)** | tokens |
+| **Concurrency levels** | e.g. `[8, 32, 128]` |
+| **Default scenarios** | subset of `accuracy / offline / online / interactive / sustained` |
+| **Extra scenarios** | optional: `sustained / speculative / burst / …` |
+| **Primary metric** | `offline_throughput`, `max_valid_qps`, … |
+| **Expected run time on A100** | minutes |
+
+## Accuracy baseline
+
+
+
+- [ ] I will provide an A100 (or equivalent reference) BF16 baseline score
+ to add to `schema/accuracy_baselines.json`.
+- [ ] If a new dataset is required, I will submit it under
+ `datasets/<your_dataset>_v1/` with a `README.md` that documents the source
+ and upstream license (see [`datasets/README.md`](../../datasets/README.md)).
+
+## Custom orchestration?
+
+
+
+- [ ] Standard scenario dispatch is enough — no `suite.py` needed.
+- [ ] A `suite.py` plugin is required. Reason:
+
+## Reference result plan
+
+
+
+- Reference hardware:
+- Runner: `<runner_id>`
+- Who will run it: <@your-handle / vendor / community member>
+
+## Open questions
+
+
diff --git a/.github/workflows/generate_leaderboard.yml b/.github/workflows/generate_leaderboard.yml
index 31dec72d..04d51173 100644
--- a/.github/workflows/generate_leaderboard.yml
+++ b/.github/workflows/generate_leaderboard.yml
@@ -11,8 +11,9 @@ on:
paths:
- 'results/**'
- 'leaderboard/**'
+ - 'suites/**'
+ - 'schema/**'
- 'tools/generate_platforms_matrix.py'
- - 'schema/platforms.json'
- 'runners/*/meta.json'
# Allow manual trigger from Actions tab (useful for first deploy or to
@@ -37,6 +38,9 @@ jobs:
- name: Validate all runner meta.json files and hashes
run: python runners/validate_runners.py
+ - name: Validate all suite definitions
+ run: python runners/validate_suites.py
+
generate:
name: Generate and deploy leaderboard
runs-on: ubuntu-latest
diff --git a/.github/workflows/validate_pr.yml b/.github/workflows/validate_pr.yml
index d24bbf16..541af133 100644
--- a/.github/workflows/validate_pr.yml
+++ b/.github/workflows/validate_pr.yml
@@ -8,7 +8,8 @@ on:
paths:
- 'results/**'
- 'runners/**'
- - 'schema/platforms.json'
+ - 'suites/**'
+ - 'schema/**'
- 'tools/generate_platforms_matrix.py'
- 'README.md'
- 'leaderboard/site/**'
@@ -89,6 +90,29 @@ jobs:
python tools/generate_platforms_matrix.py --check
echo "::endgroup::"
+ validate-suites:
+ name: Validate suite definitions
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.11"
+ cache: pip
+
+ - name: Install dependencies
+ run: pip install jsonschema
+
+ # Always validate every suite (and re-validate on schema changes too).
+ # This catches drift introduced by shared changes — e.g. a
+ # suite.schema.json edit that breaks an unrelated existing suite.
+ - name: Validate all suite folders (drift check)
+ run: |
+ echo "::group::Validating every suite folder in the repo"
+ python runners/validate_suites.py
+ echo "::endgroup::"
+
validate:
name: Validate result submissions
runs-on: ubuntu-latest
@@ -225,4 +249,47 @@ jobs:
# extra files to leaderboard/site/test/ to widen coverage; the
# glob below picks them up automatically.
- name: Run leaderboard frontend tests
- run: node --test leaderboard/site/test/*.test.mjs
\ No newline at end of file
+ run: node --test leaderboard/site/test/*.test.mjs
+
+ python-tests:
+ name: Python unit tests (serve + skill)
+ runs-on: ubuntu-latest
+ # Lightweight checks for the FastAPI serve layer and the OpenClaw skill
+ # entry point. No GPU, no real model — everything is mocked. Tests are
+ # opt-in per package so missing deps in one folder don't take the rest
+ # of the suite down with them.
+ steps:
+ - uses: actions/checkout@v4
+
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.11"
+ cache: pip
+
+ - name: Install test dependencies
+ # numpy is pulled in transitively by loadgen (imported when serve.server
+ # touches runners.benchmark_runner). Keep this list lean — these are the
+ # only packages required to *collect and run* the unit tests; no torch,
+ # no vendor SDKs, no real runner.
+ run: |
+ pip install --quiet pytest pydantic fastapi httpx pyyaml jsonschema numpy
+
+ - name: Run serve unit tests
+ run: |
+ if [ -d serve/tests ]; then
+ echo "::group::pytest serve/tests"
+ python -m pytest serve/tests -q --no-header --color=no
+ echo "::endgroup::"
+ else
+ echo "serve/tests/ not present — skipping."
+ fi
+
+ - name: Run OpenClaw skill unit tests
+ run: |
+ if [ -d openclaw_skill/tests ]; then
+ echo "::group::pytest openclaw_skill/tests"
+ python -m pytest openclaw_skill/tests -q --no-header --color=no
+ echo "::endgroup::"
+ else
+ echo "openclaw_skill/tests/ not present — skipping."
+ fi
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 07e5795e..36e1481f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,11 +12,20 @@ env/
# ── Editor / IDE ────────────────────────────────────────────────────────────
.idea/
.vscode/
+.cursor/
*.swp
*.swo
*~
*.tmp
.DS_Store
+.aider*
+.envrc
+.direnv/
+
+# ── Node / frontend tooling ─────────────────────────────────────────────────
+node_modules/
+.eslintcache
+npm-debug.log*
# ── Test / lint caches ──────────────────────────────────────────────────────
.pytest_cache/
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6ff8d5d1..a3044cdc 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -320,6 +320,21 @@ CI then re-runs the schema validator and the runner-folder integrity check.
When both pass and a contributor reviews the diff, the PR is merged and your
result shows up on the leaderboard on the next site build.
+### Optional: preview the leaderboard locally
+
+The static site is generated from `results/` by `leaderboard/generate.py`.
+After dropping your result into `results/community/<submission_id>/`, you can
+preview the final UI before opening the PR:
+
+```bash
+python leaderboard/generate.py # writes leaderboard/site/leaderboard.js + api/
+python -m http.server -d leaderboard/site 8000 # serve the static site
+# open http://localhost:8000
+```
+
+Both `leaderboard/site/leaderboard.js` and `leaderboard/site/api/` are
+gitignored — the GitHub Actions workflow regenerates them on every merge to `main`.
+
### Alternative: open a submission issue (no git required)
If you'd rather not use git, paste your `result.json` into a
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 343a3132..98cddbca 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -32,13 +32,14 @@ AccelMark/
│ ├── loadgen.py ← Shared timing and measurement engine
│ └── types.py ← InferenceResult, SampleRecord
├── suites/
-│ ├── suite_A/suite.json + requests.jsonl
-│ ├── suite_B/suite.json + requests.jsonl
-│ ├── suite_C/suite.json + suite.py + requests.jsonl
-│ ├── suite_D/suite.json + requests.jsonl
-│ ├── suite_E/suite.json + suite.py + requests.jsonl
-│ ├── suite_F/suite.json + requests.jsonl
-│ └── suite_G/suite.json + requests.jsonl
+│ ├── suite_A/suite.json
+│ ├── suite_B/suite.json
+│ ├── suite_C/suite.json + suite.py ← suite.py is optional; only C and E ship one
+│ ├── suite_D/suite.json
+│ ├── suite_E/suite.json + suite.py
+│ ├── suite_F/suite.json
+│ └── suite_G/suite.json
+│ (request data lives in datasets/, referenced by "dataset" in suite.json)
├── datasets/
│ ├── sharegpt_standard_v1/requests.jsonl ← 500 prompts, ~280/310 tok
│ ├── sharegpt_longctx_v1/requests.jsonl ← 200 prompts, ~28K input tok (Suite D)
@@ -554,12 +555,15 @@ descriptions and distributions.
If you need a custom distribution:
1. Create `datasets/{your_dataset}_v1/requests.jsonl`
-2. Create `datasets/{your_dataset}_v1/README.md`
+2. Create `datasets/{your_dataset}_v1/README.md` (must document source +
+ upstream license — see `datasets/README.md`)
3. Set `"dataset": "{your_dataset}_v1"` in your suite.json
-If your suite needs a custom dataset only used by that suite, you can
-also place `requests.jsonl` directly in `suites/suite_X/` — the
-benchmark runner checks there as a fallback.
+The `dataset` field is **required** — `BenchmarkRunner._resolve_requests_path`
+loads `datasets/<dataset>/requests.jsonl` and raises `FileNotFoundError` if it
+cannot find the file. Earlier versions allowed putting `requests.jsonl`
+directly under `suites/suite_X/`; that fallback has been removed in favor
+of the immutable, versioned `datasets/` layout.
Dataset format (one JSON object per line):
```json
@@ -622,6 +626,38 @@ not shown on the main leaderboard.
---
+## Adding a new scenario type
+
+If you need a scenario name that none of `accuracy / offline / online /
+interactive / sustained / speculative / burst` covers, you can register
+one without forking the dispatch logic:
+
+1. Open `runners/benchmark_runner.py` and add a row to
+ `_SCENARIO_REGISTRY` near the top of the file:
+
+ ```python
+ "your_scenario": ScenarioSpec(
+ name="your_scenario",
+ inference_kind="streaming", # or "offline"
+ needs_streaming=True, # require SUPPORTS_STREAMING?
+ use_async=True, # passed to load_model()
+ merge_key="your_scenario", # None = no-merge (e.g. accuracy)
+ ),
+ ```
+
+2. If the scenario needs special LoadGen behaviour (as `sustained` does),
+ add a branch under "Run benchmark" inside `_run_single_scenario`.
+
+3. List the new scenario name in your suite's
+ `scenarios.{default,extra}` array — the merge order is derived from
+ the registry automatically.
+
+Without a registry entry the base class falls back to a streaming
+inference path with `merge_key = <scenario name>`. Register an entry whenever
+you want the scenario to be treated differently (offline, no merge, etc.).
+
+---
+
## Suite plugin system
Suites with custom orchestration logic (multiple subprocesses, special
@@ -1098,6 +1134,6 @@ python runners/validate_submission.py --dir /tmp/accelmark_test/
## Questions and Support
- **Bug in LoadGen or schema:** Open a GitHub Issue
-- **New suite proposal:** Open a GitHub Issue with the "Request new suite" template
+- **New suite proposal:** Open a GitHub Issue with the [**Propose a new suite**](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=new_suite.md) template
- **New platform support:** Open a PR with a working platform script and at least one verified result
- **Leaderboard question:** Check `leaderboard/generate.py` — it's well-commented
\ No newline at end of file
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 00000000..d904b638
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,71 @@
+AccelMark
+Copyright 2024-2026 Juhao Liang and The AccelMark Contributors
+
+This product includes software developed as part of the AccelMark project
+(https://github.com/JuhaoLiang1997/AccelMark).
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+================================================================================
+Third-party bundled data
+================================================================================
+
+The AccelMark source tree includes a small amount of third-party data so that
+benchmark runs are fully reproducible without network access. Each bundled
+dataset retains its upstream license; the Apache 2.0 license above covers only
+the AccelMark code, schemas, and configuration around it.
+
+--------------------------------------------------------------------------------
+1. datasets/sharegpt_standard_v1/requests.jsonl (500 prompts)
+ datasets/sharegpt_edge_v1/requests.jsonl (500 prompts)
+ datasets/sharegpt_longctx_v1/requests.jsonl (200 prompts)
+--------------------------------------------------------------------------------
+
+ Derived from the ShareGPT GPT-4 conversational dataset curated by:
+
+ shibing624/sharegpt_gpt4
+ https://huggingface.co/datasets/shibing624/sharegpt_gpt4
+ License: CC BY 4.0
+ (https://creativecommons.org/licenses/by/4.0/)
+
+ The upstream corpus was assembled from publicly shared ChatGPT/GPT-4
+ conversations. AccelMark's variants are filtered subsets used as fixed
+ benchmark inputs; none of them is intended to be an authoritative copy.
+
+ Attribution: shibing624/sharegpt_gpt4 contributors, distributed under CC BY 4.0.
+
+ See datasets/<dataset>/README.md for the per-subset filtering criteria and
+ token statistics.
+
+--------------------------------------------------------------------------------
+2. schema/accuracy_subset.jsonl (100 multiple-choice items)
+--------------------------------------------------------------------------------
+
+ A 100-question subset of MMLU (Massive Multitask Language Understanding):
+
+ Hendrycks, D., Burns, C., Basart, S., Zou, A., Mazeika, M., Song, D.,
+ & Steinhardt, J. (2021). "Measuring Massive Multitask Language
+ Understanding." International Conference on Learning Representations.
+ https://arxiv.org/abs/2009.03300
+ https://github.com/hendrycks/test
+
+ License: MIT
+ (https://opensource.org/licenses/MIT)
+
+ AccelMark uses this subset purely as an accuracy gate (model-quality
+ sanity check) — it is NOT a measurement of MMLU performance. The subset
+ is immutable; see CONTRIBUTING.md "A few rules".
+
+================================================================================
+Third-party software dependencies
+================================================================================
+
+AccelMark's Python runtime dependencies (jsonschema, numpy, pyyaml, …) and
+the framework backends invoked by each runner (vLLM, SGLang, mlx-lm,
+vllm-ascend, vllm-rocm, vllm-tpu, vllm-musa, …) retain their own licenses.
+See each runner's requirements.txt for pinned versions; see the upstream
+projects for the corresponding license terms.
diff --git a/README.md b/README.md
index 7fac6641..5fb2f691 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
-
+
@@ -23,6 +23,16 @@
Development
+
+
+
+
+
+ From workload spec to published result — every row on the leaderboard carries its runner hash, environment fingerprint, and accuracy receipt.
+
+
---
## Why AccelMark?
@@ -53,9 +63,11 @@ python run.py --runner nvidia_vllm_47f5d58e --suite suite_A
# 4. Submit your result — open a pull request:
# git checkout -b submit/
-# cp results/your-result.json results/community//result.json
-# git add results/ env_info.json && git commit -m "results: "
+# git add results/community/<submission_id>/ && git commit -m "results: <chip> <suite>"
# gh pr create # or open via the GitHub web UI
+#
+# results/community/<submission_id>/ is the directory auto-created by run.py — it already contains
+# your result.json and env_info.json; no manual file moves are needed.
```
See [CONTRIBUTING.md](CONTRIBUTING.md) for the full guide. If you'd rather skip the PR workflow, [open a submission issue](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=community_submission.md) instead and a bot will draft the PR for you.
@@ -80,6 +92,18 @@ See [suites/README.md](suites/README.md) for full specs, time budgets, SLA defin
---
+## Currently on the leaderboard
+
+
+
+
+
+A snapshot of accelerators that have at least one submission on the leaderboard. Tile size is proportional to submission count; colour denotes vendor. See the [**live leaderboard**](https://juhaoliang1997.github.io/AccelMark) for current rankings, per-suite breakdowns, and the underlying `result.json` files.
+
+---
+
## Supported platforms
Reference runners live under `runners/` (see each folder’s `meta.json`). The table below is **auto-generated** from each runner's `meta.json` — never hand-edited. Add a runner, declare its `suite_support` in `meta.json`, and the matrix updates on its own.
@@ -140,8 +164,8 @@ If you use AccelMark results in research, please cite:
```bibtex
@misc{accelmark2026,
- title = {AccelMark: Open Benchmark Leaderboard for AI Accelerators on LLM Workloads},
- author = {Liang, Juhao and {The AccelMark Contributors}},
+ title = {Beyond NVIDIA! A Multi-Regime Framework for Benchmarking Heterogeneous AI Accelerators},
+ author = {Liang, Juhao and Zhang, Zhiyuan and Li, Siyu and Lin, Zhihang and Yu, Minchen and Zeng, Li and Chen, Zizhong and Sun, Ruoyu and Wang, Benyou},
year = {2026},
url = {https://github.com/JuhaoLiang1997/AccelMark}
}
@@ -151,5 +175,6 @@ If you use AccelMark results in research, please cite:
## License
-MIT — see [LICENSE](LICENSE).
-Submitted benchmark results are contributed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/).
\ No newline at end of file
+Apache 2.0 — see [LICENSE](LICENSE).
+Submitted benchmark results are contributed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/).
+Bundled third-party data (datasets, accuracy subsets) keeps its upstream license — see [NOTICE](NOTICE).
\ No newline at end of file
diff --git a/datasets/README.md b/datasets/README.md
index 737d2318..82c06df4 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -41,3 +41,19 @@ Each line in `requests.jsonl`:
"prompt_type": "conversational"
}
```
+
+## License & attribution
+
+Bundled prompt data keeps its **upstream license**, not AccelMark's
+Apache-2.0. The three ShareGPT-derived datasets shipped here are
+redistributed under **[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)**
+from [shibing624/sharegpt_gpt4](https://huggingface.co/datasets/shibing624/sharegpt_gpt4).
+
+If you add a new dataset, its `README.md` **must** include:
+
+1. The upstream source (URL or HuggingFace ID).
+2. The upstream license (link to the canonical text).
+3. A citation block if the upstream authors request one.
+
+See [`../NOTICE`](../NOTICE) for the full third-party attribution that ships
+with the repository.
diff --git a/datasets/sharegpt_edge_v1/README.md b/datasets/sharegpt_edge_v1/README.md
index 2d53ad7e..0626fd46 100644
--- a/datasets/sharegpt_edge_v1/README.md
+++ b/datasets/sharegpt_edge_v1/README.md
@@ -5,18 +5,30 @@ Short-turn ShareGPT conversational prompts. Used by Suite F (consumer/edge bench
Filtered from `shibing624/sharegpt_gpt4` to retain only short-turn exchanges,
producing a distribution representative of interactive consumer inference workloads.
-| Field | Value |
-|-------------------|----------------------------------|
-| Source | shibing624/sharegpt_gpt4 |
-| Prompts | 500 |
-| Input tokens p50 | ~95 |
-| Input tokens p99 | ~600 |
-| Output tokens p50 | ~150 |
-| Output tokens p99 | ~400 |
-| Type | Conversational, single-turn |
+| Field | Value |
+|-------------------|--------------------------------------------------------------------------------------|
+| Source | [shibing624/sharegpt_gpt4](https://huggingface.co/datasets/shibing624/sharegpt_gpt4) |
+| Prompts | 500 |
+| Input tokens p50 | ~95 |
+| Input tokens p99 | ~600 |
+| Output tokens p50 | ~150 |
+| Output tokens p99 | ~400 |
+| Type | Conversational, single-turn |
## Difference from sharegpt_standard_v1
`sharegpt_standard_v1` (Suites A, B, C, and E) has p50 input ~280 tokens and p50 output ~310 tokens.
`sharegpt_edge_v1` uses shorter prompts to keep benchmark runtime practical on consumer GPUs
-and to reflect the latency-sensitive interactive use cases they are typically deployed for.
\ No newline at end of file
+and to reflect the latency-sensitive interactive use cases they are typically deployed for.
+
+## License & attribution
+
+The prompts are derived from `shibing624/sharegpt_gpt4` and are redistributed
+under the upstream license, **[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)**.
+
+Apache-2.0 (the AccelMark repository license) covers only the AccelMark code,
+schemas, and selection logic — not the prompt text itself. See [`../../NOTICE`](../../NOTICE)
+for the full third-party attribution.
+
+If you use these prompts in research, please cite the upstream dataset and
+this repository.
\ No newline at end of file
diff --git a/datasets/sharegpt_longctx_v1/README.md b/datasets/sharegpt_longctx_v1/README.md
index 72f86980..6d70a1ef 100644
--- a/datasets/sharegpt_longctx_v1/README.md
+++ b/datasets/sharegpt_longctx_v1/README.md
@@ -4,8 +4,23 @@ Long-context prompts for Suite D (~28K-token inputs; `max_model_len` 30,208 in `
| Field | Value |
|---|---|
-| Source | Long-context subset of ShareGPT |
+| Source | [shibing624/sharegpt_gpt4](https://huggingface.co/datasets/shibing624/sharegpt_gpt4) (long-context subset) |
| Prompts | 200 |
| Input tokens p50 | ~28,000 |
| Output tokens p50 | ~256 (suite caps generation) |
| Type | Document QA, long-form input |
+
+## License & attribution
+
+The prompts are derived from the same ShareGPT GPT-4 corpus as
+`sharegpt_standard_v1` and are redistributed under the upstream license,
+**[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)**. Long-context
+items are selected by tokenized input length; no additional editorial
+modification beyond filtering is applied.
+
+Apache-2.0 (the AccelMark repository license) covers only the AccelMark code,
+schemas, and selection logic — not the prompt text itself. See [`../../NOTICE`](../../NOTICE)
+for the full third-party attribution.
+
+If you use these prompts in research, please cite the upstream dataset and
+this repository.
diff --git a/datasets/sharegpt_standard_v1/README.md b/datasets/sharegpt_standard_v1/README.md
index 2f1655a6..f2413ee2 100644
--- a/datasets/sharegpt_standard_v1/README.md
+++ b/datasets/sharegpt_standard_v1/README.md
@@ -4,8 +4,20 @@ Standard ShareGPT conversational prompts. Used by Suite A, B, C, E.
| Field | Value |
|---|---|
-| Source | shibing624/sharegpt_gpt4 |
+| Source | [shibing624/sharegpt_gpt4](https://huggingface.co/datasets/shibing624/sharegpt_gpt4) |
| Prompts | 500 |
| Input tokens p50 | ~280 |
| Output tokens p50 | ~310 |
| Type | Conversational, single-turn |
+
+## License & attribution
+
+The prompts are derived from `shibing624/sharegpt_gpt4` and are redistributed
+under the upstream license, **[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)**.
+
+Apache-2.0 (the AccelMark repository license) covers only the AccelMark code,
+schemas, and selection logic — not the prompt text itself. See [`../../NOTICE`](../../NOTICE)
+for the full third-party attribution.
+
+If you use these prompts in research, please cite the upstream dataset and
+this repository.
diff --git a/docs/assets/chip-cloud.png b/docs/assets/chip-cloud.png
new file mode 100644
index 00000000..e3ccccec
Binary files /dev/null and b/docs/assets/chip-cloud.png differ
diff --git a/docs/assets/framework-overview.png b/docs/assets/framework-overview.png
new file mode 100644
index 00000000..ddd06573
Binary files /dev/null and b/docs/assets/framework-overview.png differ
diff --git a/leaderboard/site/assets/data/suite-meta.js b/leaderboard/site/assets/data/suite-meta.js
new file mode 100644
index 00000000..b0430207
--- /dev/null
+++ b/leaderboard/site/assets/data/suite-meta.js
@@ -0,0 +1,230 @@
+// suite-meta.js — editorial metadata for each suite shown on the leaderboard.
+//
+// This file is pure copy + display rules ("editorial"). The runtime
+// benchmark contract (model_id, dataset, scenario list, …) lives in
+// `suites/<suite_id>/suite.json` on the Python side and is injected into
+// the page as `window.SUITE_SPECS`. Splitting the two keeps editorial
+// edits (taglines, descriptions, primary-metric units) out of the diff
+// when the actual benchmark contract changes — and vice versa.
+//
+// Consumers should keep importing `SUITE_META` from `../js/data.js`; this
+// file is its single source of truth and only exists to keep `data.js`
+// at a manageable size.
+
+export const SUITE_META = {
+ suite_A: {
+ letter: "A",
+ title: "Single-chip throughput",
+ tagline: "How fast can one accelerator serve an 8B model?",
+ description:
+ "The canonical bandwidth-bound regime. 8B Llama on a single accelerator is small enough to fit comfortably in HBM, large enough that decode is memory-bandwidth-bound rather than compute-bound. This is the bread-and-butter serving workload that anchors most other LLM benchmarks, and the suite where vendor marketing numbers usually land.",
+ primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
+ workload: {
+ model: "meta-llama/Meta-Llama-3-8B-Instruct",
+ chips: "1",
+ precision: "BF16",
+ dataset: "sharegpt_standard_v1",
+ inputTokens: "~280",
+ outputTokens: "~310",
+ },
+ scenarios: [
+ { name: "accuracy", isExtra: false,
+ desc: "MMLU subset score against the baseline. Gate for a valid submission." },
+ { name: "offline", isExtra: false,
+ desc: "Max throughput with all requests batched at once.",
+ metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
+ { name: "online", isExtra: false,
+ desc: "Highest QPS that meets the 500 ms p99 TTFT SLA under Poisson arrivals.",
+ metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } },
+ { name: "interactive", isExtra: true,
+ desc: "Single-stream first-token latency. No concurrency.",
+ metric: { key: "interactive_ttft_p99", label: "TTFT p99", direction: "asc", unit: "ms", decimals: 0 } },
+ { name: "sustained", isExtra: true,
+ desc: "30 min fixed-concurrency load. Reports throughput stability and throttle ratio.",
+ metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } },
+ { name: "speculative", isExtra: true,
+ desc: "Offline workload with a 1B draft model loaded. Reports acceptance rate." },
+ { name: "burst", isExtra: true,
+ desc: "TTFT p99 during 5x burst windows versus steady. KV pressure test." },
+ ],
+ },
+ suite_B: {
+ letter: "B",
+ title: "Multi-chip throughput",
+ tagline: "Large-model serving across multiple chips.",
+ description:
+ "70B Llama distributed across multiple accelerators. Two effects compound: the model itself no longer fits on one chip (capacity-bound) and tensor-parallel inference shards KV cache, activations, and all-reduce traffic over the interconnect. Both the framework's TP path and the chip's NVLink / Infinity Fabric / scale-out fabric come under test here.",
+ primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
+ workload: {
+ model: "meta-llama/Meta-Llama-3-70B-Instruct",
+ chips: "flexible (typ. 4 / 8)",
+ precision: "BF16",
+ dataset: "sharegpt_standard_v1",
+ inputTokens: "~280",
+ outputTokens: "~310",
+ },
+ scenarios: [
+ { name: "accuracy", isExtra: false,
+ desc: "MMLU subset score against the 70B baseline." },
+ { name: "offline", isExtra: false,
+ desc: "Aggregate throughput across N chips serving the 70B model.",
+ metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
+ { name: "online", isExtra: false,
+ desc: "Highest QPS that meets the 500 ms p99 TTFT SLA at 70B scale.",
+ metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } },
+ { name: "interactive", isExtra: true,
+ desc: "Single-stream TTFT at 70B. Decode-bound." },
+ { name: "sustained", isExtra: true,
+ desc: "30 min fixed load; concurrency 4 (70B leaves less KV headroom than 8B).",
+ metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } },
+ { name: "burst", isExtra: true,
+ desc: "Burst vs steady TTFT p99 at 70B scale." },
+ ],
+ },
+ suite_C: {
+ letter: "C",
+ title: "Quantization efficiency",
+ tagline: "Quality-adjusted throughput across precision formats.",
+ description:
+ "The bandwidth-to-compute transition. The same 8B model is run at five precision formats (BF16, FP8, W8A8, W8A16, W4A16); quality efficiency multiplies throughput speedup by the accuracy drop so a chip can't trade quality for speed silently. Reveals which chips have working low-precision tensor cores and which fall back to BF16 on the same instruction.",
+ primary: { key: "quant_quality_eff", label: "quality efficiency", direction: "desc", unit: "" },
+ workload: {
+ model: "meta-llama/Llama-3.1-8B-Instruct",
+ chips: "1",
+ precision: "BF16, FP8, W8A8, W8A16, W4A16",
+ dataset: "sharegpt_standard_v1",
+ inputTokens: "~280",
+ outputTokens: "~310",
+ },
+ scenarios: [
+ { name: "accuracy", isExtra: false,
+ desc: "Per-format accuracy gate (each format has its own threshold)." },
+ { name: "offline (×5 formats)", isExtra: false,
+ desc: "Offline throughput at each precision. Quality efficiency = throughput × accuracy.",
+ metric: { key: "quant_quality_eff", label: "quality efficiency", direction: "desc", unit: "" } },
+ { name: "online", isExtra: true,
+ desc: "Online QPS sweep per format. Extra: 5 formats × QPS levels is expensive." },
+ { name: "sustained", isExtra: true,
+ desc: "15 min sustained load per format." },
+ ],
+ },
+ suite_D: {
+ letter: "D",
+ title: "Long-context inference",
+ tagline: "28K-token prefill, compute-bound regime.",
+ description:
+ "Compute-bound prefill. ~28K-token prompts push arithmetic intensity past the roofline knee, so chips with more raw FLOPS pull ahead of bandwidth-rich ones. The output cap (256 tokens) keeps decode short on purpose; this suite isolates the prefill side and is where Suite A's bandwidth-bound rankings begin to invert.",
+ primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
+ workload: {
+ model: "meta-llama/Llama-3.1-8B-Instruct",
+ chips: "1",
+ precision: "BF16; max_model_len 30,208",
+ dataset: "sharegpt_longctx_v1",
+ inputTokens: "~28K",
+ outputTokens: "≤256",
+ },
+ scenarios: [
+ { name: "accuracy", isExtra: false,
+ desc: "MMLU gate against the 8B Llama-3.1 baseline." },
+ { name: "offline", isExtra: false,
+ desc: "Offline throughput at ~28K input tokens. Prefill-bound, tests raw FLOPS.",
+ metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
+ { name: "interactive", isExtra: true,
+ desc: "Long-context TTFT (~11 s per request at 28K). p90 is primary." },
+ { name: "online", isExtra: true,
+ desc: "Sub-QPS levels (0.5 / 1 / 2). Rate-bound at long context." },
+ { name: "sustained", isExtra: true,
+ desc: "30 min sustained at concurrency 8. Throttle ratio is the headline." },
+ { name: "speculative", isExtra: true,
+ desc: "Long-context offline with 1B draft model. Prefill-bound speculative." },
+ ],
+ },
+ suite_E: {
+ letter: "E",
+ title: "Multi-chip scaling efficiency",
+ tagline: "How well does 8B throughput scale to 2 / 4 / 8 chips?",
+ description:
+ "The Amdahl penalty in numbers. The same 8B model runs at 1×, 2×, and (optionally) 4× / 8× chip counts; the headline metric is 2× scaling efficiency = T_2× / (2 · T_1×). Reveals NVLink / Infinity Fabric / PCIe ceilings, and exposes flagships whose per-chip throughput grew faster than the interconnect did.",
+ primary: { key: "scaling_efficiency_2x", label: "2× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 },
+ workload: {
+ model: "meta-llama/Meta-Llama-3-8B-Instruct",
+ chips: "1× / 2× required; 4× / 8× optional",
+ precision: "BF16",
+ dataset: "sharegpt_standard_v1",
+ inputTokens: "~280",
+ outputTokens: "~310",
+ },
+ scenarios: [
+ { name: "offline (1× / 2×)", isExtra: false,
+ desc: "Two-chip scaling efficiency vs single chip. Required for a valid submission.",
+ metric: { key: "scaling_efficiency_2x", label: "2× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 } },
+ { name: "offline (4×)", isExtra: false,
+ desc: "Four-chip scaling efficiency. Optional but commonly reported.",
+ metric: { key: "scaling_efficiency_4x", label: "4× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 } },
+ { name: "offline (8×)", isExtra: false,
+ desc: "Eight-chip scaling. Communication overhead is the binding constraint here." },
+ ],
+ },
+ suite_F: {
+ letter: "F",
+ title: "Edge / consumer hardware",
+ tagline: "Small models on single-GPU edge hardware.",
+ description:
+ "The pure-bandwidth lower bound. Qwen2.5-0.5B with ~95-token prompts strips away residual compute interference and short-circuits prefill, exposing raw HBM headroom and software overhead. Commodity GPUs (RTX 4090, A6000) tend to be most competitive per dollar here, and the suite doubles as a regression check for low-VRAM deployments.",
+ primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
+ workload: {
+ model: "Qwen/Qwen2.5-0.5B-Instruct",
+ chips: "1 (≥4 GB VRAM)",
+ precision: "BF16",
+ dataset: "sharegpt_edge_v1",
+ inputTokens: "~95",
+ outputTokens: "~150",
+ },
+ scenarios: [
+ { name: "accuracy", isExtra: false,
+ desc: "MMLU gate against the 0.5B baseline." },
+ { name: "offline", isExtra: false,
+ desc: "Offline throughput on the edge dataset (~95 tok prompts).",
+ metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
+ { name: "online", isExtra: false,
+ desc: "Max QPS at the standard 500 ms p99 TTFT SLA.",
+ metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } },
+ { name: "interactive", isExtra: false,
+ desc: "Single-stream TTFT on consumer hardware.",
+ metric: { key: "interactive_ttft_p99", label: "TTFT p99", direction: "asc", unit: "ms", decimals: 0 } },
+ { name: "sustained", isExtra: true,
+ desc: "15 min sustained load (shorter than datacenter suites)." },
+ ],
+ },
+ suite_G: {
+ letter: "G",
+ title: "Mixture-of-Experts (MoE)",
+ tagline: "Sparse routing; bandwidth-bound multi-chip serving.",
+ description:
+ "Sparse activation. Mixtral 8×7B activates only 2 of 8 experts per token, which keeps arithmetic intensity below dense 8B inference even at multi-chip scale. Chips with high aggregate HBM bandwidth (HBM3e generation) pay off here; pure-FLOPS advantages from compute-bound suites don't translate.",
+ primary: { key: "sustained_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
+ workload: {
+ model: "mistralai/Mixtral-8x7B-Instruct-v0.1",
+ chips: "≥2 (auto)",
+ precision: "BF16",
+ dataset: "sharegpt_standard_v1",
+ inputTokens: "~280",
+ outputTokens: "~310",
+ },
+ scenarios: [
+ { name: "accuracy", isExtra: false,
+ desc: "MMLU gate against the Mixtral baseline." },
+ { name: "offline", isExtra: false,
+ desc: "Aggregate MoE throughput. Only 2 of 8 experts activate per token.",
+ metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
+ { name: "online", isExtra: false,
+ desc: "Max QPS under the 500 ms p99 TTFT SLA on MoE serving.",
+ metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } },
+ { name: "interactive", isExtra: true,
+ desc: "Single-stream TTFT on MoE inference." },
+ { name: "sustained", isExtra: true,
+ desc: "30 min sustained MoE load. Several chips show thermal onset on this suite.",
+ metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } },
+ ],
+ },
+};
diff --git a/leaderboard/site/assets/js/data.js b/leaderboard/site/assets/js/data.js
index 470af671..8c7bcba2 100644
--- a/leaderboard/site/assets/js/data.js
+++ b/leaderboard/site/assets/js/data.js
@@ -9,232 +9,22 @@
import { groupBy, chipSlug, toTitleCase } from "./utils.js";
-// Each suite has a "primary metric" most relevant to a buyer's question.
-// This drives default sort on the rankings page and the top-3 podium on home.
+// Editorial copy and per-suite display rules live in
+// `../data/suite-meta.js`. We re-export so existing consumers
+// ("import { SUITE_META } from './data.js'") keep working unchanged.
+//
+// Why split? `data.js` is the runtime / view-state hub; `suite-meta.js`
+// is pure editorial content (titles, taglines, descriptions, primary-
+// metric units). Keeping them separate lets copy edits land without
+// touching the data-processing diff, and vice versa.
//
// `primary.scale` multiplies raw value at display (e.g. 0.945 → 94.5 %).
// `primary.decimals` overrides automatic decimal selection.
-// Suite workload constants — fixed per suite definition (suites/README.md).
// `inputTokens` / `outputTokens` are the dataset p50s used at benchmark
// time and are NOT derived from data files; they're part of the suite
// contract and only change with a suite revision.
-export const SUITE_META = {
- suite_A: {
- letter: "A",
- title: "Single-chip throughput",
- tagline: "How fast can one accelerator serve an 8B model?",
- description:
- "The canonical bandwidth-bound regime. 8B Llama on a single accelerator is small enough to fit comfortably in HBM, large enough that decode is memory-bandwidth-bound rather than compute-bound. This is the bread-and-butter serving workload that anchors most other LLM benchmarks, and the suite where vendor marketing numbers usually land.",
- primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
- workload: {
- model: "meta-llama/Meta-Llama-3-8B-Instruct",
- chips: "1",
- precision: "BF16",
- dataset: "sharegpt_standard_v1",
- inputTokens: "~280",
- outputTokens: "~310",
- },
- scenarios: [
- { name: "accuracy", isExtra: false,
- desc: "MMLU subset score against the baseline. Gate for a valid submission." },
- { name: "offline", isExtra: false,
- desc: "Max throughput with all requests batched at once.",
- metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
- { name: "online", isExtra: false,
- desc: "Highest QPS that meets the 500 ms p99 TTFT SLA under Poisson arrivals.",
- metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } },
- { name: "interactive", isExtra: true,
- desc: "Single-stream first-token latency. No concurrency.",
- metric: { key: "interactive_ttft_p99", label: "TTFT p99", direction: "asc", unit: "ms", decimals: 0 } },
- { name: "sustained", isExtra: true,
- desc: "30 min fixed-concurrency load. Reports throughput stability and throttle ratio.",
- metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } },
- { name: "speculative", isExtra: true,
- desc: "Offline workload with a 1B draft model loaded. Reports acceptance rate." },
- { name: "burst", isExtra: true,
- desc: "TTFT p99 during 5x burst windows versus steady. KV pressure test." },
- ],
- },
- suite_B: {
- letter: "B",
- title: "Multi-chip throughput",
- tagline: "Large-model serving across multiple chips.",
- description:
- "70B Llama distributed across multiple accelerators. Two effects compound: the model itself no longer fits on one chip (capacity-bound) and tensor-parallel inference shards KV cache, activations, and all-reduce traffic over the interconnect. Both the framework's TP path and the chip's NVLink / Infinity Fabric / scale-out fabric come under test here.",
- primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
- workload: {
- model: "meta-llama/Meta-Llama-3-70B-Instruct",
- chips: "flexible (typ. 4 / 8)",
- precision: "BF16",
- dataset: "sharegpt_standard_v1",
- inputTokens: "~280",
- outputTokens: "~310",
- },
- scenarios: [
- { name: "accuracy", isExtra: false,
- desc: "MMLU subset score against the 70B baseline." },
- { name: "offline", isExtra: false,
- desc: "Aggregate throughput across N chips serving the 70B model.",
- metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
- { name: "online", isExtra: false,
- desc: "Highest QPS that meets the 500 ms p99 TTFT SLA at 70B scale.",
- metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } },
- { name: "interactive", isExtra: true,
- desc: "Single-stream TTFT at 70B. Decode-bound." },
- { name: "sustained", isExtra: true,
- desc: "30 min fixed load; concurrency 4 (70B leaves less KV headroom than 8B).",
- metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } },
- { name: "burst", isExtra: true,
- desc: "Burst vs steady TTFT p99 at 70B scale." },
- ],
- },
- suite_C: {
- letter: "C",
- title: "Quantization efficiency",
- tagline: "Quality-adjusted throughput across precision formats.",
- description:
- "The bandwidth-to-compute transition. The same 8B model is run at five precision formats (BF16, FP8, W8A8, W8A16, W4A16); quality efficiency multiplies throughput speedup by the accuracy drop so a chip can't trade quality for speed silently. Reveals which chips have working low-precision tensor cores and which fall back to BF16 on the same instruction.",
- primary: { key: "quant_quality_eff", label: "quality efficiency", direction: "desc", unit: "" },
- workload: {
- model: "meta-llama/Llama-3.1-8B-Instruct",
- chips: "1",
- precision: "BF16, FP8, W8A8, W8A16, W4A16",
- dataset: "sharegpt_standard_v1",
- inputTokens: "~280",
- outputTokens: "~310",
- },
- scenarios: [
- { name: "accuracy", isExtra: false,
- desc: "Per-format accuracy gate (each format has its own threshold)." },
- { name: "offline (×5 formats)", isExtra: false,
- desc: "Offline throughput at each precision. Quality efficiency = throughput × accuracy.",
- metric: { key: "quant_quality_eff", label: "quality efficiency", direction: "desc", unit: "" } },
- { name: "online", isExtra: true,
- desc: "Online QPS sweep per format. Extra: 5 formats × QPS levels is expensive." },
- { name: "sustained", isExtra: true,
- desc: "15 min sustained load per format." },
- ],
- },
- suite_D: {
- letter: "D",
- title: "Long-context inference",
- tagline: "28K-token prefill, compute-bound regime.",
- description:
- "Compute-bound prefill. ~28K-token prompts push arithmetic intensity past the roofline knee, so chips with more raw FLOPS pull ahead of bandwidth-rich ones. The output cap (256 tokens) keeps decode short on purpose; this suite isolates the prefill side and is where Suite A's bandwidth-bound rankings begin to invert.",
- primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
- workload: {
- model: "meta-llama/Llama-3.1-8B-Instruct",
- chips: "1",
- precision: "BF16; max_model_len 30,208",
- dataset: "sharegpt_longctx_v1",
- inputTokens: "~28K",
- outputTokens: "≤256",
- },
- scenarios: [
- { name: "accuracy", isExtra: false,
- desc: "MMLU gate against the 8B Llama-3.1 baseline." },
- { name: "offline", isExtra: false,
- desc: "Offline throughput at ~28K input tokens. Prefill-bound, tests raw FLOPS.",
- metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
- { name: "interactive", isExtra: true,
- desc: "Long-context TTFT (~11 s per request at 28K). p90 is primary." },
- { name: "online", isExtra: true,
- desc: "Sub-QPS levels (0.5 / 1 / 2). Rate-bound at long context." },
- { name: "sustained", isExtra: true,
- desc: "30 min sustained at concurrency 8. Throttle ratio is the headline." },
- { name: "speculative", isExtra: true,
- desc: "Long-context offline with 1B draft model. Prefill-bound speculative." },
- ],
- },
- suite_E: {
- letter: "E",
- title: "Multi-chip scaling efficiency",
- tagline: "How well does 8B throughput scale to 2 / 4 / 8 chips?",
- description:
- "The Amdahl penalty in numbers. The same 8B model runs at 1×, 2×, and (optionally) 4× / 8× chip counts; the headline metric is 2× scaling efficiency = T_2× / (2 · T_1×). Reveals NVLink / Infinity Fabric / PCIe ceilings, and exposes flagships whose per-chip throughput grew faster than the interconnect did.",
- primary: { key: "scaling_efficiency_2x", label: "2× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 },
- workload: {
- model: "meta-llama/Meta-Llama-3-8B-Instruct",
- chips: "1× / 2× required; 4× / 8× optional",
- precision: "BF16",
- dataset: "sharegpt_standard_v1",
- inputTokens: "~280",
- outputTokens: "~310",
- },
- scenarios: [
- { name: "offline (1× / 2×)", isExtra: false,
- desc: "Two-chip scaling efficiency vs single chip. Required for a valid submission.",
- metric: { key: "scaling_efficiency_2x", label: "2× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 } },
- { name: "offline (4×)", isExtra: false,
- desc: "Four-chip scaling efficiency. Optional but commonly reported.",
- metric: { key: "scaling_efficiency_4x", label: "4× scaling efficiency", direction: "desc", unit: "%", scale: 100, decimals: 1 } },
- { name: "offline (8×)", isExtra: false,
- desc: "Eight-chip scaling. Communication overhead is the binding constraint here." },
- ],
- },
- suite_F: {
- letter: "F",
- title: "Edge / consumer hardware",
- tagline: "Small models on single-GPU edge hardware.",
- description:
- "The pure-bandwidth lower bound. Qwen2.5-0.5B with ~95-token prompts strips away residual compute interference and short-circuits prefill, exposing raw HBM headroom and software overhead. Commodity GPUs (RTX 4090, A6000) tend to be most competitive per dollar here, and the suite doubles as a regression check for low-VRAM deployments.",
- primary: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
- workload: {
- model: "Qwen/Qwen2.5-0.5B-Instruct",
- chips: "1 (≥4 GB VRAM)",
- precision: "BF16",
- dataset: "sharegpt_edge_v1",
- inputTokens: "~95",
- outputTokens: "~150",
- },
- scenarios: [
- { name: "accuracy", isExtra: false,
- desc: "MMLU gate against the 0.5B baseline." },
- { name: "offline", isExtra: false,
- desc: "Offline throughput on the edge dataset (~95 tok prompts).",
- metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
- { name: "online", isExtra: false,
- desc: "Max QPS at the standard 500 ms p99 TTFT SLA.",
- metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } },
- { name: "interactive", isExtra: false,
- desc: "Single-stream TTFT on consumer hardware.",
- metric: { key: "interactive_ttft_p99", label: "TTFT p99", direction: "asc", unit: "ms", decimals: 0 } },
- { name: "sustained", isExtra: true,
- desc: "15 min sustained load (shorter than datacenter suites)." },
- ],
- },
- suite_G: {
- letter: "G",
- title: "Mixture-of-Experts (MoE)",
- tagline: "Sparse routing; bandwidth-bound multi-chip serving.",
- description:
- "Sparse activation. Mixtral 8×7B activates only 2 of 8 experts per token, which keeps arithmetic intensity below dense 8B inference even at multi-chip scale. Chips with high aggregate HBM bandwidth (HBM3e generation) pay off here; pure-FLOPS advantages from compute-bound suites don't translate.",
- primary: { key: "sustained_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" },
- workload: {
- model: "mistralai/Mixtral-8x7B-Instruct-v0.1",
- chips: "≥2 (auto)",
- precision: "BF16",
- dataset: "sharegpt_standard_v1",
- inputTokens: "~280",
- outputTokens: "~310",
- },
- scenarios: [
- { name: "accuracy", isExtra: false,
- desc: "MMLU gate against the Mixtral baseline." },
- { name: "offline", isExtra: false,
- desc: "Aggregate MoE throughput. Only 2 of 8 experts activate per token.",
- metric: { key: "offline_throughput", label: "tokens/sec", direction: "desc", unit: "tokens/sec" } },
- { name: "online", isExtra: false,
- desc: "Max QPS under the 500 ms p99 TTFT SLA on MoE serving.",
- metric: { key: "online_max_qps", label: "queries/sec", direction: "desc", unit: "queries/sec" } },
- { name: "interactive", isExtra: true,
- desc: "Single-stream TTFT on MoE inference." },
- { name: "sustained", isExtra: true,
- desc: "30 min sustained MoE load. Several chips show thermal onset on this suite.",
- metric: { key: "sustained_throughput", label: "sustained throughput", direction: "desc", unit: "tokens/sec" } },
- ],
- },
-};
+export { SUITE_META } from "../data/suite-meta.js";
+import { SUITE_META } from "../data/suite-meta.js";
// House style: headline-style Title Case for all suite titles so they
// look correct everywhere they surface (home cards, rankings hero,
diff --git a/pyproject.toml b/pyproject.toml
index ea8d476d..bbdcf567 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ name = "accelmark"
version = "0.1.0"
description = "Open benchmark leaderboard for AI accelerators on LLM workloads"
readme = "README.md"
-license = "MIT"
+license = "Apache-2.0"
license-files = ["LICENSE"]
requires-python = ">=3.10"
authors = [
@@ -31,7 +31,7 @@ classifiers = [
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: System :: Benchmark",
- "License :: OSI Approved :: MIT License",
+ "License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
@@ -60,4 +60,20 @@ requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
-include = ["loadgen*"]
+# AccelMark is primarily a clone-and-run repository (see `python run.py`
+# in README.md). `pip install -e .` is supported so contributors can pick
+# up the shared deps and import the helper packages (`loadgen`, the base
+# `runners.benchmark_runner` class, `serve.adapter` / `serve.capacity`,
+# and the `openclaw_skill` entry points) from anywhere — but there is no
+# top-level `accelmark` package to invoke via `python -m`.
+include = [
+ "loadgen*",
+ "runners*",
+ "serve*",
+ "openclaw_skill*",
+]
+exclude = [
+ "tests*",
+ "*.tests",
+ "*.tests.*",
+]
diff --git a/runners/README.md b/runners/README.md
index aaf4d812..c5180111 100644
--- a/runners/README.md
+++ b/runners/README.md
@@ -89,13 +89,15 @@ class MyFrameworkRunner(BenchmarkRunner):
# You almost never need to restrict this below ["bf16", "fp16", "fp32"].
SUPPORTED_PRECISIONS = ["bf16", "fp16", "fp32"]
- # Declare supported quantization formats for Suite C.
- # BF16 is always included. List only formats your framework can load.
- # FP8 requires native FP8 hardware (H100, MI300X).
- SUPPORTED_QUANTIZATIONS = ["fp8", "w8a8", "w8a16", "w4a16"] # H100 full support
- # SUPPORTED_QUANTIZATIONS = ["w8a8", "w8a16", "w4a16"] # A100 (no FP8)
- # SUPPORTED_QUANTIZATIONS = ["w8a8", "w4a16"] # ROCm example
- # SUPPORTED_QUANTIZATIONS = [] # Apple MLX
+ # Declare the framework's quantization backends. Suite C cross-references
+ # each precision_model_map entry's engine_kwargs.quantization against this
+ # list to decide which formats to run on this runner. The strings must
+ # match the engine's own backend names (e.g. vLLM's `quantization=` kwarg),
+ # NOT suite-level precision tags like W8A8/FP8.
+ SUPPORTED_QUANTIZATION_BACKENDS = ["fp8", "compressed-tensors", "gptq_marlin"] # vLLM full
+ # SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors", "gptq_marlin"] # A100 (no FP8)
+ # SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors", "gptq_marlin"] # ROCm
+ # SUPPORTED_QUANTIZATION_BACKENDS = [] # Apple MLX
def load_model(self, model_path: str, parallelism: dict) -> None:
from myframework import Engine
@@ -268,7 +270,7 @@ Override these class attributes in your runner to declare what the framework sup
| `SUPPORTS_ONLINE` | `True` | Set `False` if framework cannot handle concurrent requests |
| `SUPPORTS_MULTI_CHIP` | `True` | Set `False` if no tensor parallelism — tensor_parallel_size from runner config and CLI is ignored; runner always uses 1 chip |
| `SUPPORTED_PRECISIONS` | `["bf16", "fp16", "fp32"]` | Maximum compute precisions on capable hardware. Hardware detection automatically restricts this (V100 → FP16, MI100 → FP16, M1 → FP16). Only restrict below the default if your framework genuinely cannot use a precision regardless of hardware. |
-| `SUPPORTED_QUANTIZATIONS` | `[]` | Quantization formats supported for Suite C. Use uppercase strings: `"FP8"`, `"W8A8"`, `"W8A16"`, `"W4A16"`. BF16 is always supported and does not need to be listed. Empty list means this runner skips all quantized formats in Suite C. |
+| `SUPPORTED_QUANTIZATION_BACKENDS` | `[]` | Framework-level quantization backends Suite C can use, named after the engine's own identifiers (vLLM examples: `"fp8"`, `"compressed-tensors"`, `"gptq_marlin"`, `"awq"`). NOT the suite precision tags (`W8A8`, `FP8`, `W4A16` …). BF16/FP16/FP32 are always allowed and must not be listed. Empty list means this runner skips every quantized entry in Suite C's `precision_model_map`. |
---
diff --git a/runners/benchmark_runner.py b/runners/benchmark_runner.py
index 5b0c2747..747071af 100644
--- a/runners/benchmark_runner.py
+++ b/runners/benchmark_runner.py
@@ -82,12 +82,43 @@ class InferenceRequest:
extra: dict = dataclass_field(default_factory=dict)
-# ── Scenario constants ────────────────────────────────────────────────────────
+# ── Scenario registry ────────────────────────────────────────────────────────
+#
+# Each ScenarioSpec describes how the base class should drive a scenario name
+# at runtime. Adding a new scenario means appending one row to
+# ``_SCENARIO_REGISTRY`` (and, if needed, implementing a new inference method
+# on the runner) — no edits to the if/elif ladders or merge order constants.
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class ScenarioSpec:
+ """Declarative contract for one scenario name."""
+
+ name: str # scenario name; matches its key in _SCENARIO_REGISTRY
+ inference_kind: str # "offline" | "streaming"
+ needs_streaming: bool # abort (print + sys.exit(1)) if SUPPORTS_STREAMING is False
+ use_async: bool # passed to load_model() as use_async
+ merge_key: Optional[str] # key under metrics dict to merge (None = no-merge, e.g. accuracy)
+
+
+_SCENARIO_REGISTRY: "dict[str, ScenarioSpec]" = {
+ "accuracy": ScenarioSpec("accuracy", "offline", False, False, None),
+ "offline": ScenarioSpec("offline", "offline", False, False, "offline"),
+ "online": ScenarioSpec("online", "streaming", True, True, "online"),
+ "interactive": ScenarioSpec("interactive", "streaming", True, True, "interactive"),
+ "sustained": ScenarioSpec("sustained", "streaming", True, True, "sustained"),
+ "speculative": ScenarioSpec("speculative", "offline", False, False, "speculative"),
+ "burst": ScenarioSpec("burst", "streaming", True, True, "burst"),
+ "training": ScenarioSpec("training", "offline", False, False, "training"),
+}
# Canonical order in which scenario metrics are merged into a suite result.
-_MERGE_SCENARIO_KEYS = [
- "offline", "online", "interactive", "sustained", "training",
- "speculative", "burst",
+# Derived from the registry's insertion order, so adding or reordering a
+# scenario only requires editing the registry above.
+_MERGE_SCENARIO_KEYS: list[str] = [
+ spec.merge_key for spec in _SCENARIO_REGISTRY.values() if spec.merge_key
]
# ── Base class ────────────────────────────────────────────────────────────────
@@ -315,13 +346,9 @@ def get_peak_memory_gb(self) -> Optional[float]:
"""
return None
- def format_prompt(self, prompt: str) -> str:
- """
- Apply chat template or other prompt formatting.
- Override if the platform requires specific prompt formatting.
- Default: return prompt unchanged.
- """
- return prompt
+ # ``format_prompt`` is defined further down (it depends on self.tokenizer
+ # which subclasses populate during load_model). Keeping it as a single
+ # source of truth avoids two definitions on the same class.
def get_supported_precisions(
self, chip_name: str, env_info: dict
@@ -933,6 +960,152 @@ def parse_args(self) -> argparse.Namespace:
return args
+ # ── Scenario dispatch helpers ────────────────────────────────────────────
+
+ @classmethod
+ def _scenario_spec(cls, scenario: str) -> ScenarioSpec:
+ """Return the ScenarioSpec for ``scenario``.
+
+ Names not declared in ``_SCENARIO_REGISTRY`` get a synthetic spec
+ (streaming, ``needs_streaming=False``, ``use_async=True``), which
+ preserves the historical default of streaming inference. Its
+ ``merge_key`` is not in ``_MERGE_SCENARIO_KEYS`` (built once from the
+ registry at import), so new scenarios SHOULD be added to the registry.
+ """
+ spec = _SCENARIO_REGISTRY.get(scenario)
+ if spec is not None:
+ return spec
+ return ScenarioSpec(
+ name=scenario,
+ inference_kind="streaming",
+ needs_streaming=False,
+ use_async=True,
+ merge_key=scenario,
+ )
+
+ def _resolve_inference_fn(self, scenario: str):
+ """Pick the runner's inference function for the given scenario name.
+
+ Dispatch rules are derived from the scenario registry:
+ - ``inference_kind == "offline"`` → ``inference_fn_offline``
+ - ``inference_kind == "streaming"`` → ``inference_fn_streaming`` when
+ ``SUPPORTS_STREAMING`` is true; otherwise prints an error and calls
+ ``sys.exit(1)`` if the spec sets ``needs_streaming``.
+ - Streaming spec, no streaming support, ``needs_streaming`` false: a
+ sync wrapper sending one request through ``inference_fn_offline``.
+ """
+ spec = self._scenario_spec(scenario)
+
+ if spec.inference_kind == "offline":
+ return self.inference_fn_offline
+
+ if spec.inference_kind == "streaming":
+ if self.SUPPORTS_STREAMING:
+ return self.inference_fn_streaming
+ if spec.needs_streaming:
+ print(
+ f"Error: scenario '{scenario}' requires "
+ f"SUPPORTS_STREAMING = True."
+ )
+ sys.exit(1)
+ def _sync_wrapper(request: InferenceRequest) -> InferenceResult:
+ results = self.inference_fn_offline([request])
+ return results[0]
+ return _sync_wrapper
+
+ raise ValueError(
+ f"Unknown inference_kind '{spec.inference_kind}' for scenario "
+ f"'{scenario}'. Update _SCENARIO_REGISTRY in benchmark_runner.py."
+ )
+
+ # ── Shared load-context preparation ───────────────────────────────────────
+
+ def _prepare_load_context(self, args, suite: dict, output_dir: Path) -> dict:
+ """
+ Common pre-`load_model` plumbing shared by accuracy and benchmark
+ scenarios. Resolves the precision-aware model id, model path,
+ parallelism, env_info, and configures the precision-related instance
+ variables (``_precision_dtype_override``, ``_precision_engine_kwargs``,
+ ``_effective_precision``). Returns a dict of locally useful values.
+
+ Centralising this avoids the precision_model_map / dtype-override /
+ engine_kwargs glue being copy-pasted between branches.
+ """
+ # For Suite C subprocesses, --precision is set and precision_model_map
+ # holds the actual checkpoint being loaded. Use it for the display
+ # label so the log doesn't show "Loading meta-llama/..." when in fact
+ # loading the FP8/W8A8/... variant.
+ _precision_arg = getattr(args, "precision", None)
+ _precision_model_map = suite.get("precision_model_map", {})
+ _fmt_entry = _precision_model_map.get((_precision_arg or "").upper(), {})
+
+ model_id = _fmt_entry.get("model_id") or suite.get("model_id", "unknown")
+ effective_model_path = self._resolve_model_path(
+ model_id, getattr(args, "model_path", None)
+ )
+
+ if getattr(args, "model_note", None):
+ self._model_note_override = args.model_note
+ if getattr(args, "model_name", None):
+ self._model_name_override = args.model_name
+
+ _par = getattr(self, "_parallelism", {})
+ parallelism = {
+ "tensor_parallel_size": _par.get("tensor_parallel_size", 1),
+ "pipeline_parallel_size": _par.get("pipeline_parallel_size", 1),
+ "expert_parallel_size": _par.get("expert_parallel_size", 1),
+ "data_parallel_size": _par.get("data_parallel_size", 1),
+ }
+
+ # Read env_info.json from task directory. For standalone runs it's in
+ # output_dir; for --scenario all it's in the parent. For deeply nested
+ # subprocess runs it may be two levels up — search up the tree.
+ env_info: dict = {}
+ for _candidate in (output_dir, output_dir.parent, output_dir.parent.parent):
+ _p = _candidate / "env_info.json"
+ if _p.exists():
+ with open(_p) as _f:
+ env_info = json.load(_f)
+ break
+
+ # Resolve precision — explicit --precision (e.g. set by a suite
+ # subprocess) takes priority over hardware-derived selection.
+ if getattr(args, "precision", None):
+ effective_precision = args.precision.upper()
+ else:
+ effective_precision = self._resolve_precision(suite, env_info)
+ self._effective_precision = effective_precision
+
+ # Inject dtype_override and engine_kwargs from precision_model_map
+ # so the runner can apply the correct quantization kernel and dtype.
+ self._precision_dtype_override = _fmt_entry.get("dtype_override")
+ self._precision_engine_kwargs = dict(_fmt_entry.get("engine_kwargs") or {})
+
+ # If the precision_model_map entry declares a quantization
+ # engine_kwarg, the runner will use dtype="auto", which lets vLLM
+ # default the compute dtype to BF16 internally. On pre-Ampere
+ # hardware (V100/T4) that does not support BF16 this silently
+ # produces wrong results — force float16 when no dtype_override was
+ # already set and the hardware can't do BF16.
+ _entry_has_quantization = bool(
+ (_fmt_entry.get("engine_kwargs") or {}).get("quantization")
+ )
+ if (
+ not self._precision_dtype_override
+ and _entry_has_quantization
+ and "BF16" not in self._detect_supported_precisions(env_info)
+ ):
+ self._precision_dtype_override = "float16"
+
+ return {
+ "model_id": model_id,
+ "effective_model_path": effective_model_path,
+ "parallelism": parallelism,
+ "env_info": env_info,
+ "effective_precision": effective_precision,
+ "fmt_entry": _fmt_entry,
+ }
+
# ── Single scenario ───────────────────────────────────────────────────────
def _run_single_scenario(self, args, suite: dict) -> dict:
@@ -956,96 +1129,23 @@ def _run_single_scenario(self, args, suite: dict) -> dict:
output_dir.mkdir(parents=True, exist_ok=True)
self._setup_logging(str(output_dir))
- # Resolve and load model
- # For Suite C subprocesses, --precision is set — use precision_model_map
- # to get the actual checkpoint model_id for display and metadata.
- _precision_arg = getattr(args, "precision", None)
- _precision_model_map = suite.get("precision_model_map", {})
- _fmt_entry = _precision_model_map.get((_precision_arg or "").upper(), {})
- model_id = (
- _fmt_entry.get("model_id")
- or suite.get("model_id", "unknown")
- )
- effective_model_path = self._resolve_model_path(
- model_id, getattr(args, "model_path", None)
- )
- if getattr(args, "model_note", None):
- self._model_note_override = args.model_note
- if getattr(args, "model_name", None):
- self._model_name_override = args.model_name
- _par = getattr(self, "_parallelism", {})
- tp_size = _par.get("tensor_parallel_size", 1)
- pp_size = _par.get("pipeline_parallel_size", 1)
- ep_size = _par.get("expert_parallel_size", 1)
- dp_size = _par.get("data_parallel_size", 1)
-
- # Load env_info for precision resolution (search up to 2 levels)
- _acc_env_info: dict = {}
- for _c in [output_dir, output_dir.parent, output_dir.parent.parent]:
- _p = _c / "env_info.json"
- if _p.exists():
- with open(_p) as _f:
- _acc_env_info = json.load(_f)
- break
-
- if getattr(args, "precision", None):
- effective_precision = args.precision.upper()
- else:
- effective_precision = self._resolve_precision(suite, _acc_env_info)
- self._effective_precision = effective_precision
-
- # Inject dtype_override and engine_kwargs from precision_model_map entry
- # so the runner can apply the correct quantization kernel and dtype.
- self._precision_dtype_override = _fmt_entry.get("dtype_override")
- self._precision_engine_kwargs = dict(_fmt_entry.get("engine_kwargs") or {})
-
- # If the precision_model_map entry declares a quantization engine_kwarg, the
- # runner will use dtype="auto", which lets vLLM default the compute dtype to
- # BF16 internally. On pre-Ampere hardware (V100/T4) that doesn't support BF16
- # this silently produces wrong results. If no dtype_override was already set
- # by the suite entry and the hardware doesn't support BF16, force float16.
- _entry_has_quantization = bool(
- (_fmt_entry.get("engine_kwargs") or {}).get("quantization")
- )
- if (not self._precision_dtype_override
- and _entry_has_quantization
- and "BF16" not in self._detect_supported_precisions(_acc_env_info)):
- self._precision_dtype_override = "float16"
-
- if (args.scenario == "speculative"
- and "speculative_model" not in self._precision_engine_kwargs):
- _draft_id = suite.get("speculative_draft_model_id")
- if _draft_id:
- _saved = (
- getattr(self, "_model_source", None),
- getattr(self, "_model_name_override", None),
- getattr(self, "_model_note_override", None),
- )
- _draft_path = self._resolve_model_path(_draft_id, None)
- (self._model_source,
- self._model_name_override,
- self._model_note_override) = _saved
- self._precision_engine_kwargs["speculative_model"] = _draft_path
- self._precision_engine_kwargs.setdefault(
- "num_speculative_tokens",
- suite.get("speculative_num_tokens", 4),
- )
- self._precision_engine_kwargs.setdefault(
- "speculative_draft_tensor_parallel_size", 1,
- )
+ # Resolve precision-aware model_id, parallelism, env_info, and
+ # configure self._precision_* via the shared helper. Accuracy is
+ # always plain decode, so no speculative-draft injection here.
+ _ctx = self._prepare_load_context(args, suite, output_dir)
+ model_id = _ctx["model_id"]
+ effective_model_path = _ctx["effective_model_path"]
+ parallelism = _ctx["parallelism"]
print(f"Loading {model_id} for accuracy check...")
t_load = time.perf_counter()
self._current_scenario = "accuracy"
self._advance_dist_port()
self.load_model(effective_model_path, {
- "tensor_parallel_size": tp_size,
- "pipeline_parallel_size": pp_size,
- "expert_parallel_size": ep_size,
- "data_parallel_size": dp_size,
- "max_tokens": suite.get("output_tokens_max", 512),
- "max_model_len": suite.get("max_model_len"),
- "use_async": False,
+ **parallelism,
+ "max_tokens": suite.get("output_tokens_max", 512),
+ "max_model_len": suite.get("max_model_len"),
+ "use_async": False,
})
print(f"Model loaded in {round(time.perf_counter() - t_load, 1)}s")
@@ -1072,74 +1172,17 @@ def _run_single_scenario(self, args, suite: dict) -> dict:
# Load submitter profile
profile = self._load_submitter_profile()
- # Resolve model path
- # For Suite C subprocesses, --precision is set and precision_model_map holds
- # the actual checkpoint being loaded. Use it for the display label so the log
- # doesn't show "Loading meta-llama/Llama-3.1-8B-Instruct..." when loading FP8.
- _precision_arg = getattr(args, "precision", None)
- _precision_model_map = suite.get("precision_model_map", {})
- _fmt_entry = _precision_model_map.get((_precision_arg or "").upper(), {})
- model_id = (
- _fmt_entry.get("model_id")
- or suite.get("model_id", "unknown")
- )
- effective_model_path = self._resolve_model_path(
- model_id, getattr(args, "model_path", None)
- )
- if getattr(args, "model_note", None):
- self._model_note_override = args.model_note
- if getattr(args, "model_name", None):
- self._model_name_override = args.model_name
-
- # Read env_info.json from task directory.
- # For standalone runs it's in output_dir; for --scenario all it's in the parent.
- # For deeply nested subprocess runs it may be two levels up — search up the tree.
- env_info = {}
- for _candidate in [output_dir, output_dir.parent, output_dir.parent.parent]:
- _p = _candidate / "env_info.json"
- if _p.exists():
- with open(_p) as f:
- env_info = json.load(f)
- break
-
- # Load model
- _par = getattr(self, "_parallelism", {})
- tp_size = _par.get("tensor_parallel_size", 1)
- pp_size = _par.get("pipeline_parallel_size", 1)
- ep_size = _par.get("expert_parallel_size", 1)
- dp_size = _par.get("data_parallel_size", 1)
-
- print(f"Loading {model_id}...")
- t_load_start = time.perf_counter()
- self._current_scenario = args.scenario
- self._advance_dist_port()
-
- # Resolve precision — handles BF16→FP16 fallback for older hardware.
- # Explicit --precision (e.g. set by a suite subprocess) takes priority.
- if getattr(args, "precision", None):
- effective_precision = args.precision.upper()
- else:
- effective_precision = self._resolve_precision(suite, env_info)
- self._effective_precision = effective_precision
-
- # Inject dtype_override and engine_kwargs from precision_model_map entry
- # so the runner can apply the correct quantization kernel and dtype.
- self._precision_dtype_override = _fmt_entry.get("dtype_override")
- self._precision_engine_kwargs = dict(_fmt_entry.get("engine_kwargs") or {})
-
- # If the precision_model_map entry declares a quantization engine_kwarg, the
- # runner will use dtype="auto", which lets vLLM default the compute dtype to
- # BF16 internally. On pre-Ampere hardware (V100/T4) that doesn't support BF16
- # this silently produces wrong results. If no dtype_override was already set
- # by the suite entry and the hardware doesn't support BF16, force float16.
- _entry_has_quantization = bool(
- (_fmt_entry.get("engine_kwargs") or {}).get("quantization")
- )
- if (not self._precision_dtype_override
- and _entry_has_quantization
- and "BF16" not in self._detect_supported_precisions(env_info)):
- self._precision_dtype_override = "float16"
-
+ # Resolve precision-aware model_id, parallelism, env_info, and
+ # configure self._precision_* via the shared helper.
+ _ctx = self._prepare_load_context(args, suite, output_dir)
+ model_id = _ctx["model_id"]
+ effective_model_path = _ctx["effective_model_path"]
+ parallelism = _ctx["parallelism"]
+ env_info = _ctx["env_info"]
+
+ # Inject speculative-decoding draft model (only relevant in the
+ # ``speculative`` scenario branch — accuracy / offline / online never
+ # need a draft model and the suite contract may not declare one).
if (args.scenario == "speculative"
and "speculative_model" not in self._precision_engine_kwargs):
_draft_id = suite.get("speculative_draft_model_id")
@@ -1162,14 +1205,16 @@ def _run_single_scenario(self, args, suite: dict) -> dict:
"speculative_draft_tensor_parallel_size", 1,
)
+ print(f"Loading {model_id}...")
+ t_load_start = time.perf_counter()
+ self._current_scenario = args.scenario
+ self._advance_dist_port()
+
self.load_model(effective_model_path, {
- "tensor_parallel_size": tp_size,
- "pipeline_parallel_size": pp_size,
- "expert_parallel_size": ep_size,
- "data_parallel_size": dp_size,
- "max_tokens": suite.get("output_tokens_max", 512),
- "max_model_len": suite.get("max_model_len"),
- "use_async": args.scenario not in ("offline", "accuracy", "speculative"),
+ **parallelism,
+ "max_tokens": suite.get("output_tokens_max", 512),
+ "max_model_len": suite.get("max_model_len"),
+ "use_async": self._scenario_spec(args.scenario).use_async,
})
model_load_seconds = round(time.perf_counter() - t_load_start, 1)
print(f"Model loaded in {model_load_seconds}s")
@@ -1204,29 +1249,10 @@ def _run_single_scenario(self, args, suite: dict) -> dict:
chip_count=chip_count,
)
- # Select inference function
- if args.scenario == "offline":
- inference_fn = self.inference_fn_offline
- elif args.scenario == "speculative":
- inference_fn = self.inference_fn_offline
- elif args.scenario == "sustained":
- if not self.SUPPORTS_STREAMING:
- print(f"Error: sustained scenario requires SUPPORTS_STREAMING = True.")
- sys.exit(1)
- inference_fn = self.inference_fn_streaming
- elif args.scenario == "burst":
- if not self.SUPPORTS_STREAMING:
- print(f"Error: burst scenario requires SUPPORTS_STREAMING = True.")
- sys.exit(1)
- inference_fn = self.inference_fn_streaming
- elif self.SUPPORTS_STREAMING:
- inference_fn = self.inference_fn_streaming
- else:
- # Fallback for platforms without streaming
- def _sync_wrapper(request: InferenceRequest) -> InferenceResult:
- results = self.inference_fn_offline([request])
- return results[0]
- inference_fn = _sync_wrapper
+ # Select inference function via the scenario registry. Unknown
+ # scenarios fall through with a sensible default — streaming when
+ # supported, otherwise a sync wrapper around inference_fn_offline.
+ inference_fn = self._resolve_inference_fn(args.scenario)
# Run benchmark
benchmark_start = datetime.now(timezone.utc)
@@ -1458,7 +1484,6 @@ def _run_all_scenarios(self, args, suite: dict) -> None:
else:
print(" --skip-accuracy-gate set -- continuing anyway.\n")
acc_result = None
- acc_result = None
else:
# Subprocess succeeded — read accuracy.json written by the child
with open(acc_json_path) as f:
@@ -1746,33 +1771,19 @@ def _load_accuracy_baseline_for_format(
except Exception:
return None
- def _run_accuracy_scenario(
- self,
- suite: dict,
- output_dir: Path,
- ) -> dict:
+ def _score_accuracy_questions(self, questions: list) -> tuple:
"""
- Run accuracy check as a proper scenario.
- Uses inference_fn_offline() — same model, framework, precision as the benchmark.
-
- Args:
- suite: Parsed suite.json dict
- output_dir: Where to write accuracy.json
+ Run the accuracy question bank through ``inference_fn_offline`` and
+ score the answers.
- Returns:
- Accuracy dict with subset_score, baseline_delta, valid fields.
+ Returns ``(score, correct, total, wrong_examples, scored_outputs)``
+ — shared by both :meth:`_run_accuracy_scenario` and
+ :meth:`_run_accuracy_scenario_for_format` so the inference/scoring
+ path stays identical (only baseline policy differs between callers).
"""
- questions = self._load_accuracy_questions()
-
- print(f"\n{'='*60}")
- print(f" Accuracy Check ({len(questions)} questions)")
- print(f" Framework: {self._get_framework_name()}")
- print(f" Precision: {getattr(self, '_effective_precision', None) or suite.get('precision_required', 'BF16')}")
- print(f"{'='*60}\n")
-
# Build InferenceRequest objects with raw (unformatted) prompts.
- # format_prompt() is called by the runner's inference_fn_offline internally —
- # passing raw prompts here avoids double-formatting.
+ # format_prompt() is called by the runner's inference_fn_offline
+ # internally — passing raw prompts here avoids double-formatting.
accuracy_requests = []
for i, q in enumerate(questions):
raw = (
@@ -1788,7 +1799,6 @@ def _run_accuracy_scenario(
request_id=i,
))
- # Run through inference_fn_offline — same model, framework, precision
t_start = time.perf_counter()
try:
results = self.inference_fn_offline(accuracy_requests)
@@ -1797,10 +1807,9 @@ def _run_accuracy_scenario(
elapsed = round(time.perf_counter() - t_start, 1)
print(f"Completed in {elapsed}s")
- # Score answers
correct = 0
- wrong_examples = []
- scored_outputs = []
+ wrong_examples: list[str] = []
+ scored_outputs: list[dict] = []
for i, result in enumerate(results):
text = (result.output_text or "").strip()
match = re.search(r"\b([ABCD])\b", text.upper())
@@ -1817,15 +1826,58 @@ def _run_accuracy_scenario(
)
scored_outputs.append({
"question_id": questions[i].get("question_id", i),
- "question": questions[i]["question"],
- "choices": questions[i]["choices"],
- "expected": expected,
- "predicted": predicted,
- "correct": is_correct,
- "raw_output": text[:500],
+ "question": questions[i]["question"],
+ "choices": questions[i]["choices"],
+ "expected": expected,
+ "predicted": predicted,
+ "correct": is_correct,
+ "raw_output": text[:500],
})
score = round(correct / len(questions), 4) if questions else 0.0
+ return score, correct, len(questions), wrong_examples, scored_outputs
+
+ @staticmethod
+ def _write_accuracy_artifacts(
+     output_dir: Path, acc: dict, scored_outputs: list
+ ) -> None:
+     """
+     Write the accuracy artifacts for one scenario into ``output_dir``.
+
+     ``acc`` is dumped as indented JSON to ``accuracy.json`` and each entry
+     of ``scored_outputs`` becomes one JSON line in
+     ``accuracy_outputs.jsonl``.
+     """
+     summary_file = output_dir / "accuracy.json"
+     details_file = output_dir / "accuracy_outputs.jsonl"
+
+     with open(summary_file, "w") as fh:
+         json.dump(acc, fh, indent=2)
+     print(f"Saved to: {summary_file}")
+
+     with open(details_file, "w") as fh:
+         fh.writelines(json.dumps(entry) + "\n" for entry in scored_outputs)
+
+ def _run_accuracy_scenario(
+ self,
+ suite: dict,
+ output_dir: Path,
+ ) -> dict:
+ """
+ Run accuracy check as a proper scenario.
+ Uses inference_fn_offline() — same model, framework, precision as the benchmark.
+
+ Args:
+ suite: Parsed suite.json dict
+ output_dir: Where to write accuracy.json
+
+ Returns:
+ Accuracy dict with subset_score, baseline_delta, valid fields.
+ """
+ questions = self._load_accuracy_questions()
+
+ print(f"\n{'='*60}")
+ print(f" Accuracy Check ({len(questions)} questions)")
+ print(f" Framework: {self._get_framework_name()}")
+ print(f" Precision: {getattr(self, '_effective_precision', None) or suite.get('precision_required', 'BF16')}")
+ print(f"{'='*60}\n")
+
+ score, correct, total, wrong_examples, scored_outputs = \
+ self._score_accuracy_questions(questions)
# Compare to baseline — one-sided: score must not drop more than threshold
# below baseline. Scoring ABOVE baseline is always valid.
@@ -1844,7 +1896,7 @@ def _run_accuracy_scenario(
valid = (delta >= -threshold) if delta is not None else True
# Print results
- print(f"Score: {correct}/{len(questions)} = {score:.4f}")
+ print(f"Score: {correct}/{total} = {score:.4f}")
if baseline_score is not None:
sign = "+" if delta >= 0 else ""
print(f"Baseline: {baseline_score:.4f}")
@@ -1859,30 +1911,18 @@ def _run_accuracy_scenario(
f"(threshold: {threshold}) — submission will be flagged")
acc = {
- "subset_score": score,
+ "subset_score": score,
"baseline_delta": delta,
- "valid": valid,
- "framework": self._get_framework_name(),
- "precision": getattr(self, "_effective_precision", None) or suite.get("precision_required", "BF16"),
+ "valid": valid,
+ "framework": self._get_framework_name(),
+ "precision": getattr(self, "_effective_precision", None) or suite.get("precision_required", "BF16"),
"notes": (
f"Integrated accuracy check — used same "
f"{self._get_framework_name()} instance as benchmark."
),
}
- # Save accuracy.json to submission directory
- acc_path = output_dir / "accuracy.json"
- with open(acc_path, "w") as f:
- json.dump(acc, f, indent=2)
- print(f"Saved to: {acc_path}")
-
- # Save per-question outputs (gitignored — for local debugging only)
- outputs_path = output_dir / "accuracy_outputs.jsonl"
- with open(outputs_path, "w") as f:
- for row in scored_outputs:
- f.write(json.dumps(row) + "\n")
- print(f"Per-question outputs saved to: {outputs_path}")
-
+ self._write_accuracy_artifacts(output_dir, acc, scored_outputs)
return acc
def _run_accuracy_scenario_for_format(
@@ -1918,60 +1958,8 @@ def _run_accuracy_scenario_for_format(
print(f" Framework: {self._get_framework_name()}")
print(f"{'='*60}\n")
- # Build InferenceRequest objects with raw (unformatted) prompts.
- # format_prompt() is called by the runner's inference_fn_offline internally.
- accuracy_requests = []
- for i, q in enumerate(questions):
- raw = (
- f"Question: {q['question']}\n"
- f"A) {q['choices'][0]}\n"
- f"B) {q['choices'][1]}\n"
- f"C) {q['choices'][2]}\n"
- f"D) {q['choices'][3]}\n"
- f"Answer:"
- )
- accuracy_requests.append(InferenceRequest(
- prompt=raw,
- request_id=i,
- ))
-
- t_start = time.perf_counter()
- try:
- results = self.inference_fn_offline(accuracy_requests)
- except Exception as e:
- raise RuntimeError(f"Accuracy inference failed: {e}") from e
- elapsed = round(time.perf_counter() - t_start, 1)
- print(f"Completed in {elapsed}s")
-
- # Score answers
- correct = 0
- wrong_examples = []
- scored_outputs = []
- for i, result in enumerate(results):
- text = (result.output_text or "").strip()
- match = re.search(r"\b([ABCD])\b", text.upper())
- predicted = match.group(1) if match else "?"
- expected = questions[i].get("answer", "")
- is_correct = (predicted == expected)
- if is_correct:
- correct += 1
- elif len(wrong_examples) < 3:
- wrong_examples.append(
- f" Q: {questions[i]['question'][:65]}\n"
- f" Expected: {expected}, Got: {predicted} "
- f"(raw: '{text[:20]}')"
- )
- scored_outputs.append({
- "question_id": questions[i].get("question_id", i),
- "question": questions[i]["question"],
- "choices": questions[i]["choices"],
- "expected": expected,
- "predicted": predicted,
- "correct": is_correct,
- "raw_output": text[:500],
- })
-
- score = round(correct / len(questions), 4) if questions else 0.0
+ score, correct, total, wrong_examples, scored_outputs = \
+ self._score_accuracy_questions(questions)
# Per-format baseline and threshold
baseline_score = self._load_accuracy_baseline_for_format(model_id, precision)
@@ -1982,7 +1970,7 @@ def _run_accuracy_scenario_for_format(
# None = baseline not set yet (placeholder) — not a failure
# Print results
- print(f"Score: {correct}/{len(questions)} = {score:.4f}")
+ print(f"Score: {correct}/{total} = {score:.4f}")
if baseline_score is not None:
sign = "+" if delta >= 0 else ""
print(f"Baseline ({precision}): {baseline_score:.4f}")
@@ -2010,18 +1998,7 @@ def _run_accuracy_scenario_for_format(
"notes": f"Suite C per-format accuracy check. Threshold: {threshold}",
}
- # Write accuracy.json
- acc_path = output_dir / "accuracy.json"
- with open(acc_path, "w") as f:
- json.dump(acc, f, indent=2)
- print(f"Saved to: {acc_path}")
-
- # Write per-question outputs (gitignored)
- outputs_path = output_dir / "accuracy_outputs.jsonl"
- with open(outputs_path, "w") as f:
- for row in scored_outputs:
- f.write(json.dumps(row) + "\n")
-
+ self._write_accuracy_artifacts(output_dir, acc, scored_outputs)
return acc
# ── GPU memory release ────────────────────────────────────────────────────
@@ -2111,28 +2088,10 @@ def _build_result_json(
ep_size = _par.get("expert_parallel_size", 1)
dp_size = _par.get("data_parallel_size", 1)
- # For Suite C subprocesses, --precision is set and precision_model_map holds
- # the actual quantized checkpoint. Use it so each per-format result.json records
- # the real model_id/revision (e.g. RedHatAI/...-FP8), not the suite-level model_id.
- _result_precision = (
- getattr(self, "_effective_precision", None)
- or getattr(args, "precision", None)
- )
- _pm_entry = suite.get("precision_model_map", {}).get(
- (_result_precision or "").upper(), {}
- )
- _result_model_id = (
- _pm_entry.get("model_id")
- or suite.get("model_id", "unknown")
- )
- _result_model_revision = (
- _pm_entry.get("model_revision")
- or suite.get("model_revision", "unknown")
- )
-
- # For Suite C subprocesses, --precision is set and precision_model_map holds
- # the actual quantized checkpoint. Use it so each per-format result.json records
- # the real model_id/revision (e.g. RedHatAI/...-FP8), not the suite-level model_id.
+ # For Suite C subprocesses, --precision is set and precision_model_map
+ # holds the actual quantized checkpoint. Use it so each per-format
+ # result.json records the real model_id/revision (e.g.
+ # RedHatAI/...-FP8), not the suite-level model_id.
_result_precision = (
getattr(self, "_effective_precision", None)
or getattr(args, "precision", None)
@@ -2196,7 +2155,7 @@ def _build_result_json(
"subset_score": None,
"baseline_delta": None,
"valid": False,
- "notes": "Run --scenario accuracy to check model accuracy.",
+ "notes": "Run --scenario accuracy to check model accuracy.",
},
"meta": {
"submitted_by": profile.get("submitted_by", ""),
diff --git a/runners/template/runner.py b/runners/template/runner.py
index 2b502d32..7e797770 100644
--- a/runners/template/runner.py
+++ b/runners/template/runner.py
@@ -65,11 +65,24 @@ class TemplateRunner(BenchmarkRunner):
BenchmarkRunner auto-detects hardware limits and intersects with this list.
"""
- SUPPORTED_QUANTIZATIONS = []
+ SUPPORTED_QUANTIZATION_BACKENDS = []
"""
- Quantization formats for Suite C. List any of: "fp8", "w8a8", "w8a16", "w4a16"
- BF16 is always supported — do not list it here.
- Empty list = this runner skips all quantized formats in Suite C.
+ Framework-level quantization backends supported by this runner. The
+ values are passed directly to the engine (e.g. vLLM's `quantization=`
+ kwarg), so the names mirror the engine's vocabulary — NOT the suite-level
+ precision tags (W8A8, FP8, W4A16, …).
+
+ Suite C cross-references each precision_model_map entry's
+ engine_kwargs.quantization against this list to decide which formats to
+ run on this runner. Adding a new quantized format becomes a pure suite
+ edit — no runner change is needed if the backend is already supported.
+
+ Examples (vLLM names):
+ SUPPORTED_QUANTIZATION_BACKENDS = ["fp8", "compressed-tensors", "gptq_marlin"]
+ SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors", "gptq_marlin"]
+ SUPPORTED_QUANTIZATION_BACKENDS = [] # BF16/FP16/FP32 only
+
+ BF16/FP16/FP32 are always allowed — do not list them here.
"""
# ── Initializer ───────────────────────────────────────────────────────────
diff --git a/runners/validate_suites.py b/runners/validate_suites.py
new file mode 100644
index 00000000..a7b60c79
--- /dev/null
+++ b/runners/validate_suites.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+"""
+Validate suite folders under suites/.
+
+Checks per folder:
+ - suite.json exists and parses as JSON
+ - suite.json validates against schema/suite.schema.json
+ - suite.suite_id matches the folder name
+ - suite.dataset resolves to datasets/<dataset>/requests.jsonl
+
+Usage:
+ # Validate every suite
+ python runners/validate_suites.py
+
+ # Validate a specific suite folder (name or path)
+ python runners/validate_suites.py --dir suite_A
+ python runners/validate_suites.py --dir suites/suite_A
+ python runners/validate_suites.py --dir /abs/path/to/suite_A
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+try:
+ import jsonschema
+ HAS_JSONSCHEMA = True
+except ImportError:
+ HAS_JSONSCHEMA = False
+ print("Warning: jsonschema not installed — schema validation skipped")
+
+# Parent of the directory that contains this script.
+REPO_ROOT = Path(__file__).resolve().parent.parent
+# Directory scanned for suite folders.
+SUITES_DIR = REPO_ROOT / "suites"
+# JSON Schema that each suite.json is validated against.
+SCHEMA_PATH = REPO_ROOT / "schema" / "suite.schema.json"
+# Root under which a suite's "dataset" value must name a folder.
+DATASETS_DIR = REPO_ROOT / "datasets"
+
+# Files / folders that live flat under suites/ — not suite folders
+_NON_SUITE_NAMES = {"README.md", "__pycache__", ".DS_Store"}
+
+
+def _load_schema() -> dict | None:
+    """
+    Load the suite JSON Schema from SCHEMA_PATH.
+
+    Returns None when jsonschema could not be imported (HAS_JSONSCHEMA is
+    False), so callers skip schema checks. Exits the process with status 1
+    when jsonschema is available but the schema file is missing.
+    """
+    if HAS_JSONSCHEMA:
+        if SCHEMA_PATH.exists():
+            return json.loads(SCHEMA_PATH.read_text())
+        print(f"Error: schema not found at {SCHEMA_PATH}")
+        sys.exit(1)
+    return None
+
+
+def _iter_suite_folders() -> list[Path]:
+    """
+    Return the suite directories directly under SUITES_DIR, sorted.
+
+    Non-directories, names listed in _NON_SUITE_NAMES, and dot-prefixed
+    entries are left out. An absent SUITES_DIR yields an empty list.
+    """
+    if not SUITES_DIR.exists():
+        return []
+    return [
+        child
+        for child in sorted(SUITES_DIR.iterdir())
+        if child.is_dir()
+        and child.name not in _NON_SUITE_NAMES
+        and not child.name.startswith(".")
+    ]
+
+
+def _resolve_target(target: str) -> Path:
+    """
+    Turn the --dir argument into a path.
+
+    Absolute paths are returned unchanged. Otherwise the value is tried as
+    an entry under SUITES_DIR (so "suite_A" works) and, if that does not
+    exist, taken relative to REPO_ROOT (so "suites/suite_A" works).
+    """
+    candidate = Path(target)
+    if candidate.is_absolute():
+        return candidate
+    under_suites = SUITES_DIR / target
+    return under_suites if under_suites.exists() else REPO_ROOT / target
+
+
+def validate_suite(folder: Path, schema: dict | None) -> list[str]:
+    """
+    Validate one suite folder and return human-readable problems.
+
+    Checks, in order: suite.json exists, parses as a JSON object, declares a
+    suite_id equal to the folder name, passes the JSON Schema (when one is
+    supplied), and names a dataset that has a requests.jsonl under
+    DATASETS_DIR.
+
+    Args:
+        folder: Suite directory to check; its name is the expected suite_id.
+        schema: Parsed suite schema, or None to skip schema validation.
+
+    Returns:
+        A list of error messages; empty when no problem was found.
+    """
+    errors: list[str] = []
+    name = folder.name
+    suite_json = folder / "suite.json"
+
+    if not suite_json.exists():
+        errors.append(f"missing suite.json at {suite_json}")
+        return errors
+
+    try:
+        # JSON text is UTF-8 by spec; do not depend on the platform locale.
+        data = json.loads(suite_json.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
+        errors.append(f"suite.json is not valid JSON: {exc}")
+        return errors
+
+    # Every later check calls data.get(); a top-level list/string/number
+    # would otherwise crash with AttributeError instead of being reported.
+    if not isinstance(data, dict):
+        errors.append(
+            f"suite.json must contain a JSON object at the top level, "
+            f"found {type(data).__name__}."
+        )
+        return errors
+
+    declared_id = data.get("suite_id")
+    if declared_id != name:
+        errors.append(
+            f"suite_id mismatch: folder is '{name}' but suite.suite_id is "
+            f"'{declared_id}'."
+        )
+
+    if schema is not None:
+        validator = jsonschema.Draft7Validator(schema)
+        for err in validator.iter_errors(data):
+            # An empty path means the violation is on the document itself
+            # (e.g. a missing required key); label it so the message is not
+            # rendered as "schema: : ...".
+            path = ".".join(str(p) for p in err.absolute_path) or "<root>"
+            errors.append(f"schema: {path}: {err.message}")
+
+    dataset = data.get("dataset")
+    # A non-string dataset is a schema violation (reported above when a
+    # schema is loaded); skip the lookup so the path join cannot raise
+    # TypeError.
+    if isinstance(dataset, str) and dataset:
+        dataset_path = DATASETS_DIR / dataset / "requests.jsonl"
+        if not dataset_path.exists():
+            errors.append(
+                f"dataset '{dataset}' referenced by suite.json does not exist "
+                f"at {dataset_path}. Add the dataset to datasets/ or fix the "
+                f"'dataset' field."
+            )
+
+    return errors
+
+
+def _print_result(folder: Path, errors: list[str]) -> None:
+    """Print an OK line for a clean suite, or FAIL followed by each problem."""
+    if not errors:
+        print(f"OK {folder.name}")
+        return
+    print(f"FAIL {folder.name}")
+    for problem in errors:
+        print(f" - {problem}")
+
+
+def main() -> int:
+    """
+    Command-line entry point.
+
+    Validates either the folder given with --dir or every suite folder, and
+    prints one result per folder plus a summary.
+
+    Returns:
+        0 when everything is valid (or no suite folders exist), 1 when any
+        problem was found, 2 when --dir is not an existing directory.
+    """
+    cli = argparse.ArgumentParser(
+        description="Validate suite folders under suites/."
+    )
+    cli.add_argument(
+        "--dir",
+        default=None,
+        help="Validate a single suite folder (name, relative, or absolute path).",
+    )
+    args = cli.parse_args()
+
+    schema = _load_schema()
+
+    if args.dir:
+        target = _resolve_target(args.dir)
+        if not (target.exists() and target.is_dir()):
+            print(f"Error: '{args.dir}' is not an existing directory.")
+            return 2
+        folders = [target]
+    else:
+        folders = _iter_suite_folders()
+        if not folders:
+            print("No suite folders found under suites/.")
+            return 0
+
+    problem_count = 0
+    for folder in folders:
+        found = validate_suite(folder, schema)
+        _print_result(folder, found)
+        problem_count += len(found)
+
+    print()
+    if problem_count == 0:
+        print(f"All {len(folders)} suite folder(s) valid.")
+        return 0
+    print(f"Found {problem_count} problem(s) across {len(folders)} suite folder(s).")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/schema/accuracy_subset.README.md b/schema/accuracy_subset.README.md
new file mode 100644
index 00000000..4705f0c2
--- /dev/null
+++ b/schema/accuracy_subset.README.md
@@ -0,0 +1,58 @@
+# `accuracy_subset.jsonl` — accuracy gate question bank
+
+100 multiple-choice items drawn from
+[MMLU](https://github.com/hendrycks/test) (Massive Multitask Language
+Understanding). Every benchmark run executes this subset against the loaded
+model as a "model-quality sanity check" before measuring throughput or
+latency. The subset is **immutable** — see `CONTRIBUTING.md` "A few rules"
+and `benchmark_runner.py::_run_accuracy_scenario`.
+
+## File format
+
+One JSON object per line:
+
+```json
+{
+ "question_id": "mmlu_0096",
+ "subject": "machine_learning",
+ "question": "Which of the following statements about Naive Bayes is incorrect?",
+ "choices": ["...", "...", "...", "..."],
+ "answer": "B"
+}
+```
+
+| Field | Notes |
+|---------------|---------------------------------------------------|
+| `question_id` | Stable identifier (`mmlu_<NNNN>`, e.g. `mmlu_0096`) — never reused |
+| `subject` | MMLU subject tag (e.g. `machine_learning`) |
+| `question` | Plain-text prompt |
+| `choices` | List of exactly 4 strings |
+| `answer` | Letter in `{"A", "B", "C", "D"}` |
+
+## How AccelMark uses it
+
+- Loaded by `runners/benchmark_runner.py` (see `_run_accuracy_scenario`).
+- Scored as `correct / total`; compared against per-suite baselines in
+ [`accuracy_baselines.json`](accuracy_baselines.json).
+- A failed gate aborts the benchmark unless the user passes
+ `--skip-accuracy-gate` (the resulting submission is permanently flagged).
+
+This is **not** a measurement of MMLU performance — the subset is too small.
+It exists only to catch grossly broken model weights / quantization configs
+before runtime measurements waste hours of compute.
+
+## License & attribution
+
+The questions are a 100-item subset of MMLU:
+
+> Hendrycks, D., Burns, C., Basart, S., Zou, A., Mazeika, M., Song, D., &
+> Steinhardt, J. (2021). **Measuring Massive Multitask Language
+> Understanding.** *International Conference on Learning Representations.*
+> arXiv:[2009.03300](https://arxiv.org/abs/2009.03300)
+> Source: <https://github.com/hendrycks/test>
+
+MMLU is distributed under the **MIT License**. AccelMark redistributes
+this subset under the same license; the AccelMark Apache-2.0 license
+covers only the surrounding evaluation code, not the question content.
+
+See [`../NOTICE`](../NOTICE) for the full third-party attribution.
diff --git a/schema/suite.schema.json b/schema/suite.schema.json
new file mode 100644
index 00000000..1367fe0d
--- /dev/null
+++ b/schema/suite.schema.json
@@ -0,0 +1,215 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema",
+ "title": "AccelMark Suite",
+ "description": "Contract for suites/<suite_id>/suite.json. Validates the fields BenchmarkRunner and the leaderboard generator depend on. Inline notes (keys prefixed with '_') are intentionally allowed.",
+ "type": "object",
+
+ "required": [
+ "suite_id",
+ "description",
+ "model_id",
+ "model_revision",
+ "dataset",
+ "scenarios",
+ "precision_required",
+ "allowed_precisions",
+ "max_model_len",
+ "output_tokens_max",
+ "concurrency_levels",
+ "num_runs",
+ "warmup_runs",
+ "request_count"
+ ],
+
+ "additionalProperties": true,
+
+ "properties": {
+ "suite_id": {
+ "type": "string",
+ "pattern": "^suite_[A-Z][A-Za-z0-9_]*$",
+ "description": "Folder name under suites/. Must match the directory name."
+ },
+ "description": {
+ "type": "string",
+ "minLength": 1
+ },
+ "model_id": {
+ "type": "string",
+ "minLength": 1,
+ "description": "Canonical model identifier (typically a HuggingFace repo id)."
+ },
+ "model_revision": {
+ "type": "string",
+ "minLength": 1,
+ "description": "Pinned model revision (commit SHA or tag) — never 'main'."
+ },
+ "dataset": {
+ "type": "string",
+ "pattern": "^[a-z0-9][a-z0-9_]*_v[0-9]+$",
+ "description": "Dataset folder name under datasets/. Must exist as datasets/<dataset>/requests.jsonl."
+ },
+
+ "scenarios": {
+ "type": "object",
+ "required": ["default", "extra"],
+ "additionalProperties": false,
+ "properties": {
+ "default": {
+ "type": "array",
+ "items": { "$ref": "#/definitions/scenarioName" },
+ "uniqueItems": true,
+ "minItems": 1,
+ "description": "Scenarios executed when `--scenario default` (or no --scenario) is passed."
+ },
+ "extra": {
+ "type": "array",
+ "items": { "$ref": "#/definitions/scenarioName" },
+ "uniqueItems": true,
+ "description": "Opt-in scenarios runnable with --scenario all or --scenario <name>."
+ }
+ }
+ },
+
+ "precision_required": { "$ref": "#/definitions/precisionTag" },
+ "allowed_precisions": {
+ "type": "array",
+ "items": { "$ref": "#/definitions/precisionTag" },
+ "uniqueItems": true,
+ "minItems": 1
+ },
+
+ "max_model_len": { "type": "integer", "minimum": 128 },
+ "output_tokens_max": { "type": "integer", "minimum": 1 },
+
+ "concurrency_levels": {
+ "type": "array",
+ "items": { "type": "integer", "minimum": 1 },
+ "minItems": 1,
+ "uniqueItems": true
+ },
+
+ "num_runs": { "type": "integer", "minimum": 1 },
+ "warmup_runs": { "type": "integer", "minimum": 0 },
+ "warmup_minutes": { "type": "number", "minimum": 0 },
+
+ "request_count": { "type": "integer", "minimum": 1 },
+
+ "request_distribution": {
+ "type": "object",
+ "additionalProperties": true,
+ "properties": {
+ "input_tokens_p25": { "type": "number", "minimum": 0 },
+ "input_tokens_p50": { "type": "number", "minimum": 0 },
+ "input_tokens_p75": { "type": "number", "minimum": 0 },
+ "input_tokens_p99": { "type": "number", "minimum": 0 },
+ "output_tokens_p50": { "type": "number", "minimum": 0 },
+ "output_tokens_p99": { "type": "number", "minimum": 0 },
+ "source": { "type": "string" }
+ }
+ },
+
+ "online_qps_levels": {
+ "type": ["array", "null"],
+ "items": { "type": "number", "exclusiveMinimum": 0 },
+ "minItems": 1,
+ "uniqueItems": true
+ },
+ "online_sla_ttft_ms": { "type": ["integer", "null"], "minimum": 1 },
+ "online_sla_ttft_ms_relaxed": { "type": ["integer", "null"], "minimum": 1 },
+ "online_request_count": { "type": ["integer", "null"], "minimum": 1 },
+ "online_warmup_runs": { "type": "integer", "minimum": 0 },
+
+ "interactive_request_count": { "type": ["integer", "null"], "minimum": 1 },
+ "interactive_warmup_runs": { "type": "integer", "minimum": 0 },
+
+ "sustained_concurrency": { "type": "integer", "minimum": 1 },
+ "duration_minutes": { "type": "number", "minimum": 0 },
+ "sample_interval_seconds": { "type": "number", "minimum": 0 },
+
+ "accuracy_threshold_delta": { "type": "number" },
+
+ "required_chips": {
+ "oneOf": [
+ { "type": "integer", "minimum": 1 },
+ { "type": "string", "enum": ["auto"] }
+ ],
+ "description": "Either an explicit chip count or the literal string 'auto'."
+ },
+
+ "chip_counts_required": {
+ "type": "array",
+ "items": { "type": "integer", "minimum": 1 },
+ "uniqueItems": true
+ },
+ "chip_counts_optional": {
+ "type": "array",
+ "items": { "type": "integer", "minimum": 1 },
+ "uniqueItems": true
+ },
+ "chip_counts_all": {
+ "type": "array",
+ "items": { "type": "integer", "minimum": 1 },
+ "uniqueItems": true
+ },
+
+ "speculative_draft_model_id": { "type": "string", "minLength": 1 },
+ "speculative_draft_model_revision": { "type": "string", "minLength": 1 },
+ "speculative_num_tokens": { "type": "integer", "minimum": 1 },
+
+ "burst_steady_qps": { "type": "number", "exclusiveMinimum": 0 },
+ "burst_peak_qps": { "type": "number", "exclusiveMinimum": 0 },
+ "burst_duration_seconds": { "type": "number", "exclusiveMinimum": 0 },
+ "burst_interval_seconds": { "type": "number", "exclusiveMinimum": 0 },
+
+ "precision_model_map": {
+ "type": "object",
+ "description": "Suite C only — maps each precision tag to a fixed quantized checkpoint.",
+ "patternProperties": {
+ "^[A-Z][A-Z0-9]*$": {
+ "type": "object",
+ "required": ["model_id", "model_revision"],
+ "additionalProperties": true,
+ "properties": {
+ "model_id": { "type": "string", "minLength": 1 },
+ "model_revision": { "type": "string", "minLength": 1 },
+ "dtype_override": { "type": "string" },
+ "engine_kwargs": { "type": "object" }
+ }
+ }
+ }
+ },
+ "precision_levels": {
+ "type": "array",
+ "items": { "$ref": "#/definitions/precisionTag" },
+ "uniqueItems": true,
+ "description": "Suite C only — ordered list of precisions to evaluate."
+ },
+ "accuracy_thresholds": {
+ "type": "object",
+ "description": "Suite C only — per-format accuracy delta thresholds.",
+ "patternProperties": {
+ "^[A-Z][A-Z0-9]*$": { "type": "number" }
+ }
+ }
+ },
+
+ "definitions": {
+ "scenarioName": {
+ "type": "string",
+ "enum": [
+ "accuracy",
+ "offline",
+ "online",
+ "interactive",
+ "sustained",
+ "speculative",
+ "burst"
+ ]
+ },
+ "precisionTag": {
+ "type": "string",
+ "pattern": "^[A-Z][A-Z0-9]*$",
+ "description": "Uppercase precision tag (BF16, FP16, FP32, FP8, W8A8, W8A16, W4A16, …)."
+ }
+ }
+}
diff --git a/serve/server.py b/serve/server.py
index 59d85036..a8482c7b 100644
--- a/serve/server.py
+++ b/serve/server.py
@@ -24,7 +24,6 @@
from contextlib import asynccontextmanager
from typing import Optional, Union
-import uvicorn
from fastapi import Depends, FastAPI, Header, HTTPException, Request, status
from fastapi.responses import StreamingResponse
@@ -397,6 +396,10 @@ def start_server(
logger.info("=" * 60)
# ── Launch uvicorn ─────────────────────────────────────────────────────
+ # Imported lazily so importing `serve.server` (e.g. from tests, or to
+ # build the ASGI `app` for an external runner) does not require uvicorn.
+ import uvicorn
+
uvicorn.run(
app,
host=host,
diff --git a/serve/tests/mock_runner.py b/serve/tests/mock_runner.py
index 9d2c42c5..c2d7d9b1 100644
--- a/serve/tests/mock_runner.py
+++ b/serve/tests/mock_runner.py
@@ -73,10 +73,14 @@ async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceRe
)
async def inference_fn_token_stream(self, request: InferenceRequest):
- """Yield response word by word to simulate token streaming."""
- for word in self._response_text.split():
- await asyncio.sleep(0.001)
- yield word + " "
+ """
+ Per RunnerProtocol, true token streaming is optional. MockRunner
+ declares "not supported" by raising NotImplementedError so the
+ serve layer exercises its single-chunk fallback path. Use
+ TokenStreamingMockRunner below to test the true-streaming path.
+ """
+ raise NotImplementedError("MockRunner does not implement true token streaming")
+ yield # pragma: no cover - keeps this an async generator for the protocol shape
def format_prompt(self, prompt: str) -> str:
return prompt # pass through unchanged
@@ -93,4 +97,22 @@ def _compute_implementation_id(self) -> Optional[str]:
class NoStreamingMockRunner(MockRunner):
"""Mock runner that declares SUPPORTS_STREAMING = False."""
- SUPPORTS_STREAMING = False
\ No newline at end of file
+ SUPPORTS_STREAMING = False
+
+
+class TokenStreamingMockRunner(MockRunner):
+ """
+ Mock runner that *does* implement true token streaming — yields the
+ response text word by word with a small async delay. Used by tests
+ that exercise the multi-chunk SSE path in serve/server.py.
+
+ A single space precedes each word *after* the first, so concatenating
+ every delta reproduces the response text with each whitespace run
+ collapsed to one space (no leading or trailing space) — matching how
+ real tokenizers stream BPE / SentencePiece pieces.
+ """
+
+ async def inference_fn_token_stream(self, request: InferenceRequest):
+ for i, word in enumerate(self._response_text.split()):
+ await asyncio.sleep(0.001)
+ yield (" " + word) if i else word
\ No newline at end of file
diff --git a/suites/README.md b/suites/README.md
index e3cdbfb6..aedb5652 100644
--- a/suites/README.md
+++ b/suites/README.md
@@ -309,8 +309,10 @@ not model version differences.
Each format runs against the same 100 prompts with concurrency levels
`[1, 4, 16, 64]` from `suite_C/suite.json` (not the same sweep as Suite A’s
-`[8, 32, 128]`). Format availability depends on the runner's `SUPPORTED_QUANTIZATIONS`
-declaration — unsupported formats are skipped automatically.
+`[8, 32, 128]`). Format availability depends on the runner's
+`SUPPORTED_QUANTIZATION_BACKENDS` declaration — a format is skipped
+automatically when its entry's `engine_kwargs.quantization` value is not in
+the runner's backend list.
### Metrics
@@ -341,13 +343,15 @@ On H100, FP8 would show ~1.5-1.8× speedup.
### Runner requirements
-Declare which formats your runner supports:
+Declare which quantization backends your runner's framework supports. The
+strings are the engine's own backend identifiers (vLLM names shown), NOT
+suite precision tags such as W8A8/FP8/W4A16:
```python
# In your runner class:
-SUPPORTED_QUANTIZATIONS = ["fp8", "w8a8", "w8a16", "w4a16"] # H100
-SUPPORTED_QUANTIZATIONS = ["w8a8", "w8a16", "w4a16"] # A100 (no native FP8)
-SUPPORTED_QUANTIZATIONS = [] # BF16 only
+SUPPORTED_QUANTIZATION_BACKENDS = ["fp8", "compressed-tensors", "gptq_marlin"] # vLLM full
+SUPPORTED_QUANTIZATION_BACKENDS = ["compressed-tensors", "gptq_marlin"] # No native FP8
+SUPPORTED_QUANTIZATION_BACKENDS = [] # BF16 only
```
Each format's checkpoint must be available locally. Add to
@@ -712,7 +716,7 @@ submissions.
## Adding a new suite
-1. Open a GitHub Issue using the "Request new suite" template
+1. Open a GitHub Issue using the [**Propose a new suite**](https://github.com/JuhaoLiang1997/AccelMark/issues/new?template=new_suite.md) template
2. Specify: model, chip count, scenarios, and rationale
3. Discuss the proposal in the issue thread — interested contributors weigh in
4. Create `suites/suite_X/suite.json` referencing a shared dataset
diff --git a/suites/suite_C/suite.json b/suites/suite_C/suite.json
index 8adbb45c..fa14a3c5 100644
--- a/suites/suite_C/suite.json
+++ b/suites/suite_C/suite.json
@@ -47,7 +47,7 @@
},
"precision_levels": ["BF16", "FP16", "FP8", "W8A8", "W8A16", "W4A16"],
- "_precision_levels_note": "FP16 runs on all hardware including pre-Ampere. FP8 requires Ampere+ and is skipped automatically on FP16-only runners via SUPPORTED_QUANTIZATIONS.",
+ "_precision_levels_note": "FP16 runs on all hardware including pre-Ampere. FP8 requires Ampere+ and is skipped automatically on FP16-only runners via SUPPORTED_QUANTIZATION_BACKENDS.",
"accuracy_thresholds": {
"BF16": 0.03,