SemiAnalysisAI · edwingao28 · Jun 22, 2026 · Jun 22, 2026
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -260,6 +260,8 @@ jobs:
                 fi
                 echo "Extracted: gpus=$gpus, prefill_gpus=$prefill_gpus, decode_gpus=$decode_gpus"
                 RESULT_FILENAME=${result_file%.json} IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py
+                agg_file="agg_${result_file%.json}.json"
+                python3 utils/validate_agg_result.py "$agg_file"
               fi
             fi
           done
@@ -279,6 +281,11 @@ jobs:
           path: multinode_server_logs.tar.gz
           if-no-files-found: ignore
 
+      - name: Validate agentic aggregated result  # same `if` as the upload step below, so a failed check blocks that upload
+        if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
+        run: |
+          python3 utils/validate_agg_result.py "${RESULT_FILENAME}.json"
+
       - name: Upload agentic aggregated result
         if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -218,15 +218,25 @@ jobs:
         run: |
           python3 utils/process_result.py
 
+      - name: Validate agg result  # checks the agg_<RESULT_FILENAME>.json that "Upload result" publishes next
+        if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }}
+        run: |
+          python3 utils/validate_agg_result.py "agg_${RESULT_FILENAME}.json"
+
       - name: Upload result
         if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: bmk_${{ env.RESULT_FILENAME }}
           path: agg_${{ env.RESULT_FILENAME }}.json
 
+      - name: Validate agentic aggregated result  # same `if` as the agentic upload step that follows
+        if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
+        run: |
+          python3 utils/validate_agg_result.py "${RESULT_FILENAME}.json"
+
       - name: Upload agentic aggregated result
-        if: ${{ always() && inputs.scenario-type == 'agentic-coding' }}
+        if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: bmk_agentic_${{ env.RESULT_FILENAME }}

diff --git a/.github/workflows/test-process-result.yml b/.github/workflows/test-process-result.yml
@@ -5,6 +5,12 @@ on:
     paths:
       - 'utils/process_result.py'
       - 'utils/test_process_result.py'
+      - 'utils/validate_agg_result.py'
+      - 'utils/test_validate_agg_result.py'
+      - 'benchmarks/benchmark_lib.sh'
+      - '.github/workflows/benchmark-tmpl.yml'
+      - '.github/workflows/benchmark-multinode-tmpl.yml'
+      - '.github/workflows/test-process-result.yml'
 
 permissions:
   contents: read
@@ -33,4 +39,4 @@ jobs:
       - name: Run pytest
         run: |
           cd utils
-          pytest test_process_result.py -v
+          pytest test_process_result.py test_validate_agg_result.py -v
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -373,6 +373,7 @@ run_benchmark_serving() {
         --save-result
         --num-warmups "$((2 * max_concurrency))" \
         --percentile-metrics 'ttft,tpot,itl,e2el'
+        --metric-percentiles '75,90,95,99,99.9'
         --result-dir "$result_dir"
         --result-filename "$result_filename.json"
     )

diff --git a/utils/test_process_result.py b/utils/test_process_result.py
@@ -6,11 +6,15 @@
 """
 import pytest
 import json
+import re
 import subprocess
 import sys
 from pathlib import Path
 
 SCRIPT_PATH = Path(__file__).parent / "process_result.py"
+BENCHMARK_LIB_PATH = Path(__file__).resolve().parents[1].joinpath(
+    "benchmarks", "benchmark_lib.sh"
+)
 
 
 # =============================================================================
@@ -102,6 +106,20 @@ def run_script(tmp_path, env, benchmark_result, result_filename="benchmark_resul
     )
 
 
+# =============================================================================
+# Test benchmark command contract
+# =============================================================================
+
+def test_benchmark_serving_requests_summary_percentiles():
+    """benchmark_lib.sh must pass --metric-percentiles with exactly 75,90,95,99,99.9, in that order."""
+    lib_source = BENCHMARK_LIB_PATH.read_text(encoding="utf-8")
+    flag_pattern = re.compile(r"--metric-percentiles\s+['\"]([^'\"]+)['\"]")
+    found = flag_pattern.search(lib_source)
+
+    assert found, "run_benchmark_serving must pass --metric-percentiles"
+    assert found.group(1).split(",") == ["75", "90", "95", "99", "99.9"]
+
+
 # =============================================================================
 # Test get_required_env_vars function
 # =============================================================================

diff --git a/utils/test_validate_agg_result.py b/utils/test_validate_agg_result.py
@@ -0,0 +1,179 @@
+"""Tests for validate_agg_result.py, covering both fixed-seq and agentic agg schemas."""
+import json
+import math
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+from validate_agg_result import (
+    validate,
+    check_identity,
+    check_numeric_finite,
+    check_throughput,
+    check_percentile_families,
+    check_monotonicity,
+)
+
+SCRIPT = Path(__file__).parent / "validate_agg_result.py"
+
+
+def fixed_seq_agg():
+    """Build a fixed-seq agg fixture: ttft/tpot/itl/e2el rise and intvty falls across p75..p99.9."""
+    agg = {
+        "hw": "b200", "framework": "sglang", "precision": "fp8",
+        "model": "test/model", "infmax_model_prefix": "tm",
+        "conc": 8, "isl": 1024, "osl": 1024, "is_multinode": False,
+        "tp": 8, "ep": 1, "dp_attention": "false",
+        "tput_per_gpu": 1000.0, "output_tput_per_gpu": 800.0, "input_tput_per_gpu": 200.0,
+        "mean_tpot": 0.01, "mean_intvty": 100.0,
+    }
+    # The 1-based rank is reused as the latency value, so each latency family increases.
+    for rank, label in enumerate(("75", "90", "95", "99", "99.9"), start=1):
+        latency = float(rank)
+        for family in ("ttft", "tpot", "itl", "e2el"):
+            agg[f"p{label}_{family}"] = latency
+        # 1000 / (rank + 9) shrinks as rank grows: 100.0 at p75 down to ~71.4 at p99.9.
+        agg[f"p{label}_intvty"] = 1000.0 / (rank + 9)
+    return agg
+
+
+def agentic_agg():
+    """Build an agentic agg fixture: no isl/osl, all families (intvty included) rise over p75/p90/p95."""
+    agg = {
+        "hw": "b200", "framework": "sglang", "precision": "fp8",
+        "model": "test/model", "infmax_model_prefix": "tm",
+        "conc": 8, "is_multinode": False,
+        "tp": 8, "ep": 1, "dp_attention": "false",
+        "scenario_type": "agentic-coding",
+        "tput_per_gpu": 1000.0, "output_tput_per_gpu": 800.0, "input_tput_per_gpu": 200.0,
+        "mean_tpot": 0.01, "mean_intvty": 100.0,
+        "theoretical_cache_hit_rate": None,
+    }
+    # The 1-based rank is the latency value; intvty is ten times that, so it rises too.
+    for rank, pct in enumerate((75, 90, 95), start=1):
+        latency = float(rank)
+        for family in ("ttft", "tpot", "itl", "e2el"):
+            agg[f"p{pct}_{family}"] = latency
+        agg[f"p{pct}_intvty"] = latency * 10
+    return agg
+
+
+def test_fixed_seq_valid_passes():  # baseline: the unmodified fixed-seq fixture yields no errors
+    assert validate(fixed_seq_agg()) == []
+
+
+def test_agentic_valid_passes():  # baseline: the unmodified agentic fixture yields no errors
+    assert validate(agentic_agg()) == []
+
+
+def test_agentic_intvty_must_increase():
+    """For agentic aggs, a p95 intvty below p75 must be reported as breaking non-decreasing order."""
+    agg = {**agentic_agg(), "p95_intvty": agentic_agg()["p75_intvty"] - 1.0}
+    assert any("intvty" in msg and "non-decreasing" in msg for msg in check_monotonicity(agg))
+
+
+def test_fixed_seq_intvty_must_decrease():
+    """For fixed-seq aggs, a p90 intvty above p75 must be reported as breaking non-increasing order."""
+    agg = {**fixed_seq_agg(), "p90_intvty": fixed_seq_agg()["p75_intvty"] + 100.0}
+    assert any("intvty" in msg and "non-increasing" in msg for msg in check_monotonicity(agg))
+
+
+def test_missing_sibling_percentile_fails():
+    """Dropping p95_e2el must produce an error that mentions both e2el and 95."""
+    agg = {key: val for key, val in fixed_seq_agg().items() if key != "p95_e2el"}
+    assert any("e2el" in msg and "95" in msg for msg in check_percentile_families(agg))
+
+
+def test_intvty_must_mirror_tpot():
+    """p99_tpot without p99_intvty must produce an error containing the phrase below."""
+    agg = fixed_seq_agg()
+    agg.pop("p99_intvty")
+    expected = "p99_tpot present but p99_intvty missing"
+    errors = check_percentile_families(agg)
+    assert any(expected in msg for msg in errors)
+
+
+def test_latency_monotonicity_fails():
+    """A p90 ttft below p75 must be reported as breaking non-decreasing order."""
+    agg = {**fixed_seq_agg(), "p90_ttft": fixed_seq_agg()["p75_ttft"] - 1.0}
+    assert any("ttft" in msg and "non-decreasing" in msg for msg in check_monotonicity(agg))
+
+
+def test_negative_percentile_value_fails():
+    """A negative percentile value must produce a 'non-negative' error."""
+    agg = {**fixed_seq_agg(), "p90_ttft": -1.0}
+    assert any("non-negative" in msg for msg in check_monotonicity(agg))
+
+
+def test_malformed_percentile_key_flagged():
+    """An extra p150_tpot key must be flagged as malformed."""
+    agg = {**fixed_seq_agg(), "p150_tpot": 1.0}
+    assert any("malformed" in msg for msg in check_percentile_families(agg))
+
+
+def test_throughput_positive_required():
+    """A tput_per_gpu of zero must be reported by check_throughput."""
+    agg = {**fixed_seq_agg(), "tput_per_gpu": 0.0}
+    assert any("tput_per_gpu" in msg for msg in check_throughput(agg))
+
+
+def test_throughput_sum_is_not_asserted():
+    """Input plus output throughput need not add up to tput_per_gpu."""
+    agg = {**fixed_seq_agg(), "input_tput_per_gpu": 123.0}  # 123 + 800 != 1000
+    assert check_throughput(agg) == []
+
+
+def test_nan_field_fails():
+    """A NaN mean_tpot must produce a 'finite' error from check_numeric_finite."""
+    agg = {**fixed_seq_agg(), "mean_tpot": math.nan}
+    assert any("finite" in msg for msg in check_numeric_finite(agg))
+
+
+def test_missing_identity_fails():
+    """An empty hw string must be reported by check_identity."""
+    agg = {**fixed_seq_agg(), "hw": ""}
+    assert any("hw" in msg for msg in check_identity(agg))
+
+
+def test_fixed_seq_requires_isl_osl():
+    """A fixed-seq agg with isl removed must be reported by check_identity."""
+    agg = {key: val for key, val in fixed_seq_agg().items() if key != "isl"}
+    assert any("isl" in msg for msg in check_identity(agg))
+
+
+def test_agentic_does_not_require_isl_osl():  # no identity error may mention isl or osl
+    assert not any("isl" in msg or "osl" in msg for msg in check_identity(agentic_agg()))
+
+
+def test_multinode_decode_fields_may_be_zero():
+    """check_identity accepts a multinode agg whose decode tp/ep/num_workers are all 0."""
+    agg = fixed_seq_agg()
+    agg["is_multinode"] = True
+    # Prefill-side fields are set to 4, decode-side fields to 0.
+    agg.update(dict.fromkeys(("prefill_tp", "prefill_ep", "prefill_num_workers"), 4))
+    agg.update(dict.fromkeys(("decode_tp", "decode_ep", "decode_num_workers"), 0))
+    agg["prefill_dp_attention"] = "true"
+    agg["decode_dp_attention"] = "true"
+    assert check_identity(agg) == []
+
+
+def _run_cli(tmp_path, payload):
+    """Write payload to tmp_path/agg.json as UTF-8, run the validator script on it, return the CompletedProcess."""
+    (tmp_path / "agg.json").write_text(payload, encoding="utf-8")  # explicit encoding: no locale dependence
+    return subprocess.run([sys.executable, str(SCRIPT), str(tmp_path / "agg.json")], capture_output=True)
+
+
+def test_cli_accepts_valid(tmp_path):  # exit status 0 for the unmodified fixed-seq fixture
+    assert _run_cli(tmp_path, json.dumps(fixed_seq_agg())).returncode == 0
+
+
+def test_cli_rejects_non_dict_json(tmp_path):  # a top-level JSON array must exit with status 1
+    assert _run_cli(tmp_path, "[]").returncode == 1
+
+
+def test_cli_rejects_invalid_agg(tmp_path):
+    """The CLI must exit with status 1 when tput_per_gpu is negative."""
+    bad_agg = {**fixed_seq_agg(), "tput_per_gpu": -1.0}
+    assert _run_cli(tmp_path, json.dumps(bad_agg)).returncode == 1