FreedomIntelligence · JuhaoLiang1997 · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
@@ -520,7 +520,8 @@ Copy the closest existing suite and modify. Required fields:
   "online_sla_ttft_ms": 500,
   "num_runs": 3,
   "warmup_runs": 1,
-  "online_warmup_runs": 0,
+  "online_warmup_requests": 10,
+  "burst_warmup_requests": 10,
   "interactive_warmup_runs": 0,
   "accuracy_threshold_delta": 0.1,
   "request_count": 200,
@@ -911,6 +912,71 @@ class InferenceResult:
 | speculative | Offline throughput with draft model (same path as offline, engine uses speculative decoding) | `throughput_tokens_per_sec`; optional `task.runtime_metrics.acceptance_rate` if runner overrides `get_runtime_metrics()` |
 | burst | Two-state bursty load: alternates steady QPS and burst QPS windows | `burst_degradation_ratio` (burst_ttft_p99 / steady_ttft_p99); `sla_met_during_burst` |
 
+### Warmup contract
+
+Cold engines inflate the first few timed requests by hundreds of ms (JIT
+compile, CUDA-graph allocation, KV cache priming). Each scenario therefore
+runs, or excludes from analysis, a configurable warmup prefix:
+
+| Scenario | Suite key | Default | Unit |
+|---|---|---|---|
+| offline / speculative / interactive | `warmup_runs` / `interactive_warmup_runs` | `1` / `0` | full passes |
+| online | `online_warmup_requests` | `10` | dummy requests fired before QPS sweep |
+| burst | `burst_warmup_requests` | `10` | dummy requests fired before first cycle |
+| sustained | `warmup_minutes` | `2` | minutes of samples excluded from analysis |
+
+Warmup-time exceptions are logged and swallowed — they never abort the
+timed phase.
+
+### Reliability metrics
+
+Each scenario emits an inter-run reliability block alongside its primary
+metrics so submitters can prove their results are reproducible without
+shipping `samples.jsonl`. Shape:
+
+```json
+{
+  "n":         3,
+  "mean":      1234.5,
+  "std":         21.3,
+  "cv_pct":      1.7,
+  "stability": "stable",
+  "runs": [1230.1, 1255.2, 1218.2]
+}
+```
+
+`stability` thresholds: `cv_pct ≤ 3 → stable ✓`, `≤ 8 → noisy ⚠`,
+otherwise `high-variance`. Calibrated from the May-2026 backfill — see
+the comment above `_STABILITY_THRESHOLD_*` in `loadgen/loadgen.py` for the
+empirical distribution that informed the choice. Tunable centrally there.
+
+**`high-variance` is informational, not a verdict.** High CV means the
+hardware × workload combo carries irreducible jitter (thermal throttle on
+consumer cards, HCCL noise on 16-chip Ascend topologies, acceptance-rate
+fluctuation on speculative decoding) — it is **not** a sign the
+submission is broken. The frontend reflects this: high-variance pills
+use an orange tone with no error glyph; only stable and noisy pills
+carry an icon (✓ and ⚠ respectively).
+
+If you submit a result that lands as high-variance, you do not need to
+re-run. The badge is for downstream readers picking hardware for
+latency-sensitive workloads — they can use the CV % to size their
+safety margins, while peak-throughput shoppers can largely ignore it.
+
+| Scenario | Field path | Reliability source |
+|---|---|---|
+| offline | `metrics.offline.results_by_concurrency[i].throughput_tokens_per_sec_reliability` | per-run throughput across `num_runs` |
+| online | `metrics.online.results_by_qps[i].ttft_ms_p99_reliability` | per-run TTFT p99 across `num_runs` |
+| interactive | `metrics.interactive.ttft_ms_p99_reliability` | per-run TTFT p99 across `num_runs` |
+| sustained | `metrics.sustained.throughput_post_warmup_reliability` | per-interval throughput (post-warmup) |
+| burst | `metrics.burst.recovery_time_seconds` (+ `_per_cycle`) | not a reliability block: seconds until rolling p99 returns to ≤ 1.5× steady baseline |
+
+Backfilling these for existing results is done by
+`tools/backfill_distribution_stats.py`, which reads each result's local
+`samples.jsonl` and writes the summary stats in place. Offline reliability
+cannot be backfilled because per-run throughput was never recorded in
+`samples.jsonl` historically — it stays `{}` for old offline results.
+
 ---
 
 ## Schema and Validation

diff --git a/leaderboard/generate.py b/leaderboard/generate.py
@@ -6,6 +6,8 @@
     python leaderboard/generate.py
 """
 
+from __future__ import annotations
+
 import hashlib
 import json
 import re
@@ -238,6 +240,11 @@ def extract_detail(result: dict) -> dict:
         "meta_model_load_sec":   meta.get("model_load_seconds"),
         "meta_start_time":       meta.get("benchmark_start_time"),
         "meta_notes":            meta.get("notes"),
+        # Vendor-specific environment fields collected by platforms/<vendor>.py
+        # (e.g. ROCm-SMI link health, NVML clock telemetry). The modal flattens
+        # this dict and shows only non-null entries — different vendors record
+        # different keys by design and no UI tries to unify them.
+        "env_vendor_details":    env.get("vendor_details") or {},
     }
 
 
@@ -297,13 +304,18 @@ def _concurrency_labels(rows):
     def _online_block():
         online   = metrics.get("online", {})
         qps_rows = online.get("results_by_qps", [])
+        # Per-QPS reliability blocks. Emitted as an array parallel to
+        # `labels` so the frontend can pair a badge with each QPS row by
+        # index instead of looking it up in a separate object.
         return {
             "labels":        [str(r.get("target_qps", "")) for r in qps_rows],
             "ttft_p50":      [r.get("ttft_ms_p50") for r in qps_rows],
             "ttft_p90":      [r.get("ttft_ms_p90") for r in qps_rows],
             "tpot_p50":      [r.get("tpot_ms_p50") for r in qps_rows],
             "sla_met":       [r.get("sla_met")      for r in qps_rows],
             "max_valid_qps": online.get("max_valid_qps"),
+            "ttft_ms_p99_reliability":
+                [r.get("ttft_ms_p99_reliability") or {} for r in qps_rows],
         }
 
     def _interactive_block():
@@ -315,6 +327,7 @@ def _interactive_block():
             "tpot_p50": iv.get("tpot_ms_p50"),
             "tpot_p90": iv.get("tpot_ms_p90"),
             "tpot_p99": iv.get("tpot_ms_p99"),
+            "ttft_ms_p99_reliability": iv.get("ttft_ms_p99_reliability") or {},
         }
 
     def _sustained_block():
@@ -334,6 +347,8 @@ def _sustained_block():
             "throttle_ratio":        s.get("throttle_ratio"),
             "throttle_onset_minute": s.get("throttle_onset_minute"),
             "ttft_p99_drift_ms":     s.get("ttft_p99_drift_ms"),
+            "throughput_post_warmup_reliability":
+                s.get("throughput_post_warmup_reliability") or {},
             "samples":               samples,
         }
 
@@ -352,6 +367,9 @@ def _burst_block():
             "burst_requests_total":         b.get("burst_requests_total"),
             "sla_met_during_burst":         b.get("sla_met_during_burst"),
             "burst_degradation_ratio":      b.get("burst_degradation_ratio"),
+            "recovery_time_seconds":        b.get("recovery_time_seconds"),
+            "recovery_time_seconds_per_cycle":
+                b.get("recovery_time_seconds_per_cycle") or [],
             "results_by_cycle":             b.get("results_by_cycle"),
         }
 
@@ -370,6 +388,11 @@ def _speculative_block():
             "mean_accepted_tokens": rm.get("mean_accepted_tokens"),
         }
 
+    # Per-concurrency-level offline reliability blocks. Parallel array to
+    # `throughput` and `memory_gb` so the frontend can join by row index.
+    def _offline_reliability(rows):
+        return [r.get("throughput_tokens_per_sec_reliability") or {} for r in rows]
+
     if suite == "suite_A":
         rows = _offline_rows()
         return {
@@ -378,6 +401,7 @@ def _speculative_block():
                 "labels":     _concurrency_labels(rows),
                 "throughput": [r.get("throughput_tokens_per_sec") for r in rows],
                 "memory_gb":  [r.get("peak_memory_gb")            for r in rows],
+                "throughput_reliability": _offline_reliability(rows),
             },
             "online":      _online_block(),
             "interactive": _interactive_block(),
@@ -395,6 +419,7 @@ def _speculative_block():
                 "throughput":          [r.get("throughput_tokens_per_sec")          for r in rows],
                 "throughput_per_chip": [r.get("throughput_tokens_per_sec_per_chip") for r in rows],
                 "memory_gb":           [r.get("peak_memory_gb")                     for r in rows],
+                "throughput_reliability": _offline_reliability(rows),
             },
             "online":    _online_block(),
             "sustained": _sustained_block(),
@@ -409,6 +434,7 @@ def _speculative_block():
                 "labels":     _concurrency_labels(rows),
                 "throughput": [r.get("throughput_tokens_per_sec") for r in rows],
                 "memory_gb":  [r.get("peak_memory_gb")            for r in rows],
+                "throughput_reliability": _offline_reliability(rows),
             },
             "interactive": _interactive_block(),
             "sustained":   _sustained_block(),
@@ -514,6 +540,7 @@ def _speculative_block():
                 "labels":     _concurrency_labels(rows),
                 "throughput": [r.get("throughput_tokens_per_sec") for r in rows],
                 "memory_gb":  [r.get("peak_memory_gb")            for r in rows],
+                "throughput_reliability": _offline_reliability(rows),
             },
             "online":      _online_block(),
             "interactive": _interactive_block(),
@@ -530,6 +557,7 @@ def _speculative_block():
                 "labels":     _concurrency_labels(rows),
                 "throughput": [r.get("throughput_tokens_per_sec") for r in rows],
                 "memory_gb":  [r.get("peak_memory_gb")            for r in rows],
+                "throughput_reliability": _offline_reliability(rows),
             },
             "online":      _online_block(),
             "interactive": _interactive_block(),

diff --git a/leaderboard/site/assets/css/modal.css b/leaderboard/site/assets/css/modal.css
@@ -127,6 +127,79 @@
   margin-right: 0.3rem;
 }
 
+/* Inter-run reliability pill that lives in the modal subtitle. Stable /
+ * noisy track --good / --warn so the existing palette controls dark-mode
+ * behaviour; high-variance is a fixed orange, not --bad. Border and fill are
+ * low-opacity mixes so the badge does not compete with the callouts above. */
+.modal-reliab-pill {
+  display: inline-flex;
+  align-items: center;
+  gap: 0.25rem;
+  padding: 0.1rem 0.5rem;
+  border-radius: 999px;
+  font-size: 0.7rem;
+  font-weight: 600;
+  letter-spacing: 0.01em;
+  border: 1px solid color-mix(in srgb, currentColor 35%, transparent);
+  background: color-mix(in srgb, currentColor 10%, transparent);
+  /* Pill is rendered as a <button> so it can be clicked / focused; strip
+   * the default button chrome and signal "this is hoverable for help"
+   * via the cursor + the ⓘ glyph inside. */
+  font-family: inherit;
+  cursor: help;
+  transition: filter 0.15s ease, transform 0.15s ease;
+}
+.modal-reliab-pill:hover  { filter: brightness(1.05); transform: translateY(-1px); }
+.modal-reliab-pill:active { transform: translateY(0); }
+.modal-reliab-pill:focus-visible {
+  outline: 2px solid color-mix(in srgb, currentColor 55%, transparent);
+  outline-offset: 2px;
+}
+.modal-reliab-help {
+  opacity: 0.65;
+  font-weight: 500;
+  font-size: 0.78em;
+}
+.modal-reliab-pill.stable         { color: var(--good, #2da44e); }
+.modal-reliab-pill.noisy          { color: var(--warn, #d29922); }
+/* High-variance uses an orange tone (not pure red) on purpose: red would
+ * read as an error glyph, but a high-CV submission may be a perfectly
+ * correct measurement of a legitimately jittery hardware × workload pair
+ * (e.g. consumer cards under sustained load). Orange says "look at this"
+ * without saying "this is broken". */
+.modal-reliab-pill.high_variance  { color: #d97706; }
+.modal-reliab-pill.unknown        { color: var(--fg-faint); }
+
+/* Inline help paragraph rendered under a detail-section title. Currently
+ * only the Reliability section uses this, so the styling stays modest —
+ * we don't want it competing with the data rows below. */
+.detail-section-help {
+  margin: 0 0 0.55rem 0;
+  font-size: 0.78rem;
+  line-height: 1.5;
+  color: var(--fg-muted, #6b7280);
+}
+.detail-section-help .reliab-legend {
+  display: inline-block;
+  padding: 0 0.3rem;
+  border-radius: 4px;
+  font-weight: 600;
+  font-size: 0.92em;
+  background: color-mix(in srgb, currentColor 12%, transparent);
+}
+.detail-section-help .reliab-legend.stable        { color: var(--good, #2da44e); }
+.detail-section-help .reliab-legend.noisy         { color: var(--warn, #d29922); }
+.detail-section-help .reliab-legend.high_variance { color: #d97706; }
+
+/* Brief highlight flash when a click-jump from the subtitle pill lands on
+ * a detail section, so the user notices the scroll target. The animation
+ * is deliberately gentle (1.4 s fade-out via JS-toggled class). */
+.detail-section.detail-section-flash {
+  background: color-mix(in srgb, var(--accent, #2563eb) 8%, transparent);
+  border-radius: 6px;
+  transition: background-color 0.4s ease;
+}
+
 .modal-close {
   background: transparent;
   border: 1px solid var(--border-soft);