From 2574e4ff779c6f061ae3632708f778a3a04c829d Mon Sep 17 00:00:00 2001
From: Wenyao Gao <wgao11@u.rochester.edu>
Date: Sun, 21 Jun 2026 21:44:15 -0700
Subject: [PATCH 1/2] feat: add data-derived validate_agg_result.py invariant
 guard for agg JSONs

---
 utils/test_validate_agg_result.py | 179 ++++++++++++++++++++
 utils/validate_agg_result.py      | 263 ++++++++++++++++++++++++++++++
 2 files changed, 442 insertions(+)
 create mode 100644 utils/test_validate_agg_result.py
 create mode 100644 utils/validate_agg_result.py

diff --git a/utils/test_validate_agg_result.py b/utils/test_validate_agg_result.py
new file mode 100644
index 000000000..7b35f9f9e
--- /dev/null
+++ b/utils/test_validate_agg_result.py
@@ -0,0 +1,179 @@
+"""Tests for validate_agg_result.py, covering both fixed-seq and agentic agg schemas."""
+import json
+import math
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+from validate_agg_result import (
+    validate,
+    check_identity,
+    check_numeric_finite,
+    check_throughput,
+    check_percentile_families,
+    check_monotonicity,
+)
+
+SCRIPT = Path(__file__).parent / "validate_agg_result.py"
+
+
+def fixed_seq_agg():
+    """Valid fixed-seq agg: all latency families + intvty at p75..p99.9 (intvty decreasing)."""
+    data = {
+        "hw": "b200", "framework": "sglang", "precision": "fp8",
+        "model": "test/model", "infmax_model_prefix": "tm",
+        "conc": 8, "isl": 1024, "osl": 1024, "is_multinode": False,
+        "tp": 8, "ep": 1, "dp_attention": "false",
+        "tput_per_gpu": 1000.0, "output_tput_per_gpu": 800.0, "input_tput_per_gpu": 200.0,
+        "mean_tpot": 0.01, "mean_intvty": 100.0,
+    }
+    for i, p in enumerate((75, 90, 95, 99, 99.9), start=1):
+        k = str(int(p)) if p == int(p) else str(p)
+        data[f"p{k}_ttft"] = float(i)
+        data[f"p{k}_tpot"] = float(i)
+        data[f"p{k}_itl"] = float(i)
+        data[f"p{k}_e2el"] = float(i)
+        data[f"p{k}_intvty"] = 1000.0 / (i + 9)
+    return data
+
+
+def agentic_agg():
+    """Valid agentic agg: same families at p75/p90/p95, no isl/osl, intvty increasing."""
+    data = {
+        "hw": "b200", "framework": "sglang", "precision": "fp8",
+        "model": "test/model", "infmax_model_prefix": "tm",
+        "conc": 8, "is_multinode": False,
+        "tp": 8, "ep": 1, "dp_attention": "false",
+        "scenario_type": "agentic-coding",
+        "tput_per_gpu": 1000.0, "output_tput_per_gpu": 800.0, "input_tput_per_gpu": 200.0,
+        "mean_tpot": 0.01, "mean_intvty": 100.0,
+        "theoretical_cache_hit_rate": None,
+    }
+    for i, p in enumerate((75, 90, 95), start=1):
+        data[f"p{p}_ttft"] = float(i)
+        data[f"p{p}_tpot"] = float(i)
+        data[f"p{p}_itl"] = float(i)
+        data[f"p{p}_e2el"] = float(i)
+        data[f"p{p}_intvty"] = float(i) * 10
+    return data
+
+
+def test_fixed_seq_valid_passes():
+    assert validate(fixed_seq_agg()) == []
+
+
+def test_agentic_valid_passes():
+    assert validate(agentic_agg()) == []
+
+
+def test_agentic_intvty_must_increase():
+    data = agentic_agg()
+    data["p95_intvty"] = data["p75_intvty"] - 1.0
+    assert any("intvty" in e and "non-decreasing" in e for e in check_monotonicity(data))
+
+
+def test_fixed_seq_intvty_must_decrease():
+    data = fixed_seq_agg()
+    data["p90_intvty"] = data["p75_intvty"] + 100.0
+    assert any("intvty" in e and "non-increasing" in e for e in check_monotonicity(data))
+
+
+def test_missing_sibling_percentile_fails():
+    data = fixed_seq_agg()
+    del data["p95_e2el"]
+    assert any("e2el" in e and "95" in e for e in check_percentile_families(data))
+
+
+def test_intvty_must_mirror_tpot():
+    data = fixed_seq_agg()
+    del data["p99_intvty"]
+    assert any(
+        "p99_tpot present but p99_intvty missing" in e
+        for e in check_percentile_families(data)
+    )
+
+
+def test_latency_monotonicity_fails():
+    data = fixed_seq_agg()
+    data["p90_ttft"] = data["p75_ttft"] - 1.0
+    assert any("ttft" in e and "non-decreasing" in e for e in check_monotonicity(data))
+
+
+def test_negative_percentile_value_fails():
+    data = fixed_seq_agg()
+    data["p90_ttft"] = -1.0
+    assert any("non-negative" in e for e in check_monotonicity(data))
+
+
+def test_malformed_percentile_key_flagged():
+    data = fixed_seq_agg()
+    data["p150_tpot"] = 1.0
+    assert any("malformed" in e for e in check_percentile_families(data))
+
+
+def test_throughput_positive_required():
+    data = fixed_seq_agg()
+    data["tput_per_gpu"] = 0.0
+    assert any("tput_per_gpu" in e for e in check_throughput(data))
+
+
+def test_throughput_sum_is_not_asserted():
+    data = fixed_seq_agg()
+    data["input_tput_per_gpu"] = 123.0  # input+output need not equal total
+    assert check_throughput(data) == []
+
+
+def test_nan_field_fails():
+    data = fixed_seq_agg()
+    data["mean_tpot"] = math.nan
+    assert any("finite" in e for e in check_numeric_finite(data))
+
+
+def test_missing_identity_fails():
+    data = fixed_seq_agg()
+    data["hw"] = ""
+    assert any("hw" in e for e in check_identity(data))
+
+
+def test_fixed_seq_requires_isl_osl():
+    data = fixed_seq_agg()
+    del data["isl"]
+    assert any("isl" in e for e in check_identity(data))
+
+
+def test_agentic_does_not_require_isl_osl():
+    assert all("isl" not in e and "osl" not in e for e in check_identity(agentic_agg()))
+
+
+def test_multinode_decode_fields_may_be_zero():
+    data = fixed_seq_agg()
+    data["is_multinode"] = True
+    for k in ("prefill_tp", "prefill_ep", "prefill_num_workers"):
+        data[k] = 4
+    for k in ("decode_tp", "decode_ep", "decode_num_workers"):
+        data[k] = 0
+    data["prefill_dp_attention"] = "true"
+    data["decode_dp_attention"] = "true"
+    assert check_identity(data) == []
+
+
+def _run_cli(tmp_path, payload):
+    path = tmp_path / "agg.json"
+    path.write_text(payload)
+    return subprocess.run([sys.executable, str(SCRIPT), str(path)], capture_output=True)
+
+
+def test_cli_accepts_valid(tmp_path):
+    assert _run_cli(tmp_path, json.dumps(fixed_seq_agg())).returncode == 0
+
+
+def test_cli_rejects_non_dict_json(tmp_path):
+    assert _run_cli(tmp_path, "[]").returncode == 1
+
+
+def test_cli_rejects_invalid_agg(tmp_path):
+    data = fixed_seq_agg()
+    data["tput_per_gpu"] = -1.0
+    assert _run_cli(tmp_path, json.dumps(data)).returncode == 1
diff --git a/utils/validate_agg_result.py b/utils/validate_agg_result.py
new file mode 100644
index 000000000..d06e49211
--- /dev/null
+++ b/utils/validate_agg_result.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+"""Validate an aggregate benchmark result JSON before artifact upload.
+
+Checks structural and physical invariants for both fixed-seq and agentic results,
+deriving the expected keys from the data rather than assuming a fixed percentile set.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import re
+import sys
+from pathlib import Path
+from typing import Any, Iterable
+
+LATENCY_FAMILIES = ("ttft", "tpot", "itl", "e2el")
+INTERACTIVITY_FAMILY = "intvty"
+THROUGHPUT_KEYS = ("tput_per_gpu", "output_tput_per_gpu", "input_tput_per_gpu")
+AGENTIC_SCENARIO = "agentic-coding"
+
+# Prefix-style percentile keys: p75_tpot, p99.9_intvty.
+_PCTL_KEY = re.compile(r"^p(\d+(?:\.\d+)?)_(.+)$")
+
+
+def is_number(value: Any) -> bool:
+    """Return True for an int or float; bool is rejected although it subclasses int."""
+    return isinstance(value, (int, float)) and not isinstance(value, bool)
+
+
+def is_positive_int(value: Any) -> bool:
+    """Return True only for an int strictly greater than zero; bool and float are rejected."""
+    return isinstance(value, int) and not isinstance(value, bool) and value > 0
+
+
+def is_non_negative_int(value: Any) -> bool:
+    """Return True only for an int that is zero or greater; bool and float are rejected."""
+    return isinstance(value, int) and not isinstance(value, bool) and value >= 0
+
+
+def fmt_pctl(rank: float) -> str:
+    """Format a rank as it appears in keys; whole ranks lose '.0': 90.0 -> '90', 99.9 -> '99.9'."""
+    return str(int(rank)) if rank == int(rank) else str(rank)
+
+
+def percentiles_present(data: dict[str, Any], family: str) -> dict[float, str]:
+    """Map rank -> key for each p<rank>_<family> key in data; ranks outside 0-100 are skipped."""
+    found: dict[float, str] = {}
+    for key in data:
+        match = _PCTL_KEY.match(key)
+        if match and match.group(2) == family:
+            rank = float(match.group(1))
+            if 0 <= rank <= 100:
+                found[rank] = key  # keys parsing to the same rank (p90, p90.0) overwrite each other
+    return found
+
+
+def malformed_percentile_keys(data: dict[str, Any], families: Iterable[str]) -> list[str]:
+    """Return, sorted, the p<rank>_<family> keys of the given families whose rank is not in 0-100."""
+    bad: list[str] = []
+    family_set = set(families)
+    for key in data:
+        match = _PCTL_KEY.match(key)
+        if match and match.group(2) in family_set:
+            rank = float(match.group(1))
+            if not 0 <= rank <= 100:  # _PCTL_KEY matches digits only, so this means rank > 100
+                bad.append(key)
+    return sorted(bad)
+
+
+def check_identity(data: dict[str, Any]) -> list[str]:
+    """Return errors for malformed identity strings, run dimensions, or topology metadata."""
+    errors: list[str] = []
+    for key in ("hw", "framework", "precision", "model", "infmax_model_prefix"):
+        value = data.get(key)
+        if not isinstance(value, str) or not value.strip():
+            errors.append(f"{key} must be a non-empty string")
+    if not is_positive_int(data.get("conc")):
+        errors.append("conc must be a positive integer")
+    # Agentic runs are variable-length and carry no isl/osl.
+    if data.get("scenario_type") != AGENTIC_SCENARIO:
+        for key in ("isl", "osl"):
+            if not is_positive_int(data.get(key)):
+                errors.append(f"{key} must be a positive integer")
+    is_multinode = data.get("is_multinode")
+    if not isinstance(is_multinode, bool):
+        errors.append("is_multinode must be present as a bool")
+        return errors  # without a bool there is no way to pick which topology fields to check
+    if is_multinode:
+        for key in ("prefill_tp", "prefill_ep", "prefill_num_workers"):
+            if not is_positive_int(data.get(key)):
+                errors.append(f"{key} must be a positive integer for multinode topology")
+        for key in ("decode_tp", "decode_ep", "decode_num_workers"):  # zero is accepted here
+            if not is_non_negative_int(data.get(key)):
+                errors.append(f"{key} must be a non-negative integer for multinode topology")
+        for key in ("prefill_dp_attention", "decode_dp_attention"):
+            if key not in data:  # presence only; the value is not inspected
+                errors.append(f"{key} is required for multinode topology")
+    else:
+        for key in ("tp", "ep"):
+            if not is_positive_int(data.get(key)):
+                errors.append(f"{key} must be a positive integer for single-node topology")
+        if "dp_attention" not in data:  # presence only; the value is not inspected
+            errors.append("dp_attention is required for single-node topology")
+    return errors
+
+
+def numeric_paths(value: Any, path: str = "") -> Iterable[tuple[str, float]]:
+    """Yield (path, number) for each int/float leaf of nested dicts/lists; bools are skipped."""
+    if isinstance(value, bool):  # tested first because bool is a subclass of int
+        return
+    if isinstance(value, (int, float)):
+        yield path, value
+        return
+    if isinstance(value, dict):
+        for key, child in value.items():
+            child_path = f"{path}.{key}" if path else str(key)  # dotted path, e.g. a.b
+            yield from numeric_paths(child, child_path)
+    elif isinstance(value, list):
+        for index, child in enumerate(value):
+            yield from numeric_paths(child, f"{path}[{index}]")  # indexed path, e.g. a[0]
+
+
+def check_numeric_finite(data: dict[str, Any]) -> list[str]:
+    """Return '<path> must be finite' for every float leaf that is NaN or +/-Infinity."""
+    # Ints are always finite, and math.isfinite raises OverflowError on ints beyond float range.
+    return [
+        f"{path} must be finite" for path, value in numeric_paths(data)
+        if isinstance(value, float) and not math.isfinite(value)
+    ]
+
+
+def check_throughput(data: dict[str, Any]) -> list[str]:
+    """Return an error per throughput key that is missing, non-numeric, non-finite, or <= 0."""
+    errors: list[str] = []
+    for key in THROUGHPUT_KEYS:
+        value = data.get(key)
+        if not is_number(value) or not 0 < value < math.inf:  # NaN compares False; no int overflow
+            errors.append(f"{key} must be a positive finite number")
+    return errors
+
+
+def check_percentile_families(data: dict[str, Any]) -> list[str]:
+    """Return errors for out-of-range percentile keys, for latency families whose rank sets
+    differ, and for ranks tpot and intvty do not share. A family with no keys is not required."""
+    errors: list[str] = []
+    for key in malformed_percentile_keys(data, (*LATENCY_FAMILIES, INTERACTIVITY_FAMILY)):
+        errors.append(f"{key} is a malformed percentile key")
+
+    present = {family: percentiles_present(data, family) for family in LATENCY_FAMILIES}
+    with_pctls = {family: ranks for family, ranks in present.items() if ranks}  # skip absent families
+    if with_pctls:
+        union = set().union(*(set(ranks) for ranks in with_pctls.values()))  # every reported rank
+        for family, ranks in with_pctls.items():
+            for missing in sorted(union - set(ranks)):
+                errors.append(
+                    f"metric '{family}' missing percentile p{fmt_pctl(missing)} "
+                    "that other metrics report"
+                )
+
+    tpot_ranks = set(present["tpot"])
+    intvty_ranks = set(percentiles_present(data, INTERACTIVITY_FAMILY))
+    for missing in sorted(tpot_ranks - intvty_ranks):
+        errors.append(
+            f"p{fmt_pctl(missing)}_tpot present but p{fmt_pctl(missing)}_intvty missing"
+        )
+    for extra in sorted(intvty_ranks - tpot_ranks):
+        errors.append(
+            f"p{fmt_pctl(extra)}_intvty present but p{fmt_pctl(extra)}_tpot missing"
+        )
+    return errors
+
+
+def _monotonic(data: dict[str, Any], family: str, increasing: bool) -> list[str]:
+    """List errors for one family's percentiles: each must be finite, >= 0, and monotonic in rank."""
+    entries = percentiles_present(data, family)
+    errors: list[str] = []
+    prev_key, prev_val = "", None
+    for rank in sorted(entries):
+        key = entries[rank]
+        value = data[key]
+        # Only floats can be NaN/inf; math.isfinite raises OverflowError on ints beyond float range.
+        if not is_number(value) or (isinstance(value, float) and not math.isfinite(value)):
+            errors.append(f"{key} must be a finite percentile value")
+            prev_val = None  # restart the comparison chain after an unusable value
+            continue
+        if value < 0:
+            errors.append(f"{key} must be non-negative")
+        if prev_val is not None:
+            if increasing and value < prev_val:
+                errors.append(
+                    f"{family} percentiles must be non-decreasing: "
+                    f"{prev_key}={prev_val} > {key}={value}"
+                )
+            if not increasing and value > prev_val:
+                errors.append(
+                    f"{family} percentiles must be non-increasing: "
+                    f"{prev_key}={prev_val} < {key}={value}"
+                )
+        prev_key, prev_val = key, value
+    return errors
+
+
+def check_monotonicity(data: dict[str, Any]) -> list[str]:
+    """Return monotonicity errors. Latency percentiles must be non-decreasing in P. Interactivity
+    is 1000/tpot for fixed-seq (non-increasing in P) but a measured percentile of 1/itl for
+    agentic (non-decreasing)."""
+    errors: list[str] = []
+    for family in LATENCY_FAMILIES:
+        errors += _monotonic(data, family, increasing=True)
+    agentic = data.get("scenario_type") == AGENTIC_SCENARIO
+    errors += _monotonic(data, INTERACTIVITY_FAMILY, increasing=agentic)  # direction follows scenario
+    return errors
+
+
+def validate(data: dict[str, Any]) -> list[str]:
+    """Run every check in order and return the combined error messages; [] means no errors."""
+    errors: list[str] = []
+    errors += check_identity(data)
+    errors += check_numeric_finite(data)  # may overlap with the per-field finite checks below
+    errors += check_throughput(data)
+    errors += check_percentile_families(data)
+    errors += check_monotonicity(data)
+    return errors
+
+
+def load_json(path: Path) -> Any:
+    """Parse the UTF-8 file at path as JSON and return the decoded value (any JSON type)."""
+    with open(path, encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def main() -> int:
+    """CLI: validate one agg JSON; return 0 if valid, else print messages to stderr and return 1."""
+    parser = argparse.ArgumentParser(
+        description="Validate an InferenceX aggregate result JSON."
+    )
+    parser.add_argument("agg_json", type=Path)
+    args = parser.parse_args()
+
+    try:
+        data = load_json(args.agg_json)
+    except (OSError, ValueError) as exc:  # ValueError covers JSONDecodeError and UnicodeDecodeError
+        print(f"failed to load JSON: {exc}", file=sys.stderr)
+        return 1
+
+    if not isinstance(data, dict):  # validate() indexes by key, so only an object is acceptable
+        print("agg JSON must be an object", file=sys.stderr)
+        return 1
+
+    errors = validate(data)
+    if errors:
+        print(f"Agg result validation failed for {args.agg_json}:", file=sys.stderr)
+        for error in errors:
+            print(f"  - {error}", file=sys.stderr)
+        return 1
+
+    print(f"Agg result validated: {args.agg_json}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From b470e3c0a05c7e82bdde8b5581991168dd6a8ad1 Mon Sep 17 00:00:00 2001
From: Wenyao Gao <wgao11@u.rochester.edu>
Date: Sun, 21 Jun 2026 21:44:15 -0700
Subject: [PATCH 2/2] ci: validate agg results pre-upload
 (fixed-seq/multinode/agentic); request p75/p95 percentiles

---
 .github/workflows/benchmark-multinode-tmpl.yml |  7 +++++++
 .github/workflows/benchmark-tmpl.yml           | 12 +++++++++++-
 .github/workflows/test-process-result.yml      |  8 +++++++-
 benchmarks/benchmark_lib.sh                    |  1 +
 utils/test_process_result.py                   | 18 ++++++++++++++++++
 5 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index e58cff478..b05d34cf9 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -260,6 +260,8 @@ jobs:
                 fi
                 echo "Extracted: gpus=$gpus, prefill_gpus=$prefill_gpus, decode_gpus=$decode_gpus"
                 RESULT_FILENAME=${result_file%.json} IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py
+                agg_file="agg_${result_file%.json}.json"
+                python3 utils/validate_agg_result.py "$agg_file"
               fi
             fi
           done
@@ -279,6 +281,11 @@ jobs:
           path: multinode_server_logs.tar.gz
           if-no-files-found: ignore
 
+      - name: Validate agentic aggregated result
+        if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
+        run: |
+          python3 utils/validate_agg_result.py "${RESULT_FILENAME}.json"
+
       - name: Upload agentic aggregated result
         if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index a57e89725..34de7fe2a 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -218,6 +218,11 @@ jobs:
         run: |
           python3 utils/process_result.py
 
+      - name: Validate agg result
+        if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }}
+        run: |
+          python3 utils/validate_agg_result.py "agg_${RESULT_FILENAME}.json"
+
       - name: Upload result
         if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
@@ -225,8 +230,13 @@ jobs:
           name: bmk_${{ env.RESULT_FILENAME }}
           path: agg_${{ env.RESULT_FILENAME }}.json
 
+      - name: Validate agentic aggregated result
+        if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
+        run: |
+          python3 utils/validate_agg_result.py "${RESULT_FILENAME}.json"
+
       - name: Upload agentic aggregated result
-        if: ${{ always() && inputs.scenario-type == 'agentic-coding' }}
+        if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: bmk_agentic_${{ env.RESULT_FILENAME }}
diff --git a/.github/workflows/test-process-result.yml b/.github/workflows/test-process-result.yml
index d6967775e..e8050433e 100644
--- a/.github/workflows/test-process-result.yml
+++ b/.github/workflows/test-process-result.yml
@@ -5,6 +5,12 @@ on:
     paths:
       - 'utils/process_result.py'
       - 'utils/test_process_result.py'
+      - 'utils/validate_agg_result.py'
+      - 'utils/test_validate_agg_result.py'
+      - 'benchmarks/benchmark_lib.sh'
+      - '.github/workflows/benchmark-tmpl.yml'
+      - '.github/workflows/benchmark-multinode-tmpl.yml'
+      - '.github/workflows/test-process-result.yml'
 
 permissions:
   contents: read
@@ -33,4 +39,4 @@ jobs:
       - name: Run pytest
         run: |
           cd utils
-          pytest test_process_result.py -v
+          pytest test_process_result.py test_validate_agg_result.py -v
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 95e063a3d..774154f39 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -373,6 +373,7 @@ run_benchmark_serving() {
         --save-result
         --num-warmups "$((2 * max_concurrency))" \
         --percentile-metrics 'ttft,tpot,itl,e2el'
+        --metric-percentiles '75,90,95,99,99.9'
         --result-dir "$result_dir"
         --result-filename "$result_filename.json"
     )
diff --git a/utils/test_process_result.py b/utils/test_process_result.py
index 4037689ea..f93a70396 100644
--- a/utils/test_process_result.py
+++ b/utils/test_process_result.py
@@ -6,11 +6,15 @@
 """
 import pytest
 import json
+import re
 import subprocess
 import sys
 from pathlib import Path
 
 SCRIPT_PATH = Path(__file__).parent / "process_result.py"
+BENCHMARK_LIB_PATH = (
+    Path(__file__).resolve().parents[1] / "benchmarks" / "benchmark_lib.sh"
+)
 
 
 # =============================================================================
@@ -102,6 +106,20 @@ def run_script(tmp_path, env, benchmark_result, result_filename="benchmark_resul
     )
 
 
+# =============================================================================
+# Test benchmark command contract
+# =============================================================================
+
+def test_benchmark_serving_requests_summary_percentiles():
+    """Benchmark production must request every percentile summarize.py reads."""
+    script = BENCHMARK_LIB_PATH.read_text(encoding="utf-8")
+
+    match = re.search(r"--metric-percentiles\s+['\"]([^'\"]+)['\"]", script)
+
+    assert match, "run_benchmark_serving must pass --metric-percentiles"
+    assert match.group(1).split(",") == ["75", "90", "95", "99", "99.9"]
+
+
 # =============================================================================
 # Test get_required_env_vars function
 # =============================================================================