From 2574e4ff779c6f061ae3632708f778a3a04c829d Mon Sep 17 00:00:00 2001 From: Wenyao Gao Date: Sun, 21 Jun 2026 21:44:15 -0700 Subject: [PATCH 1/2] feat: add data-derived validate_agg_result.py invariant guard for agg JSONs --- utils/test_validate_agg_result.py | 179 ++++++++++++++++++++ utils/validate_agg_result.py | 263 ++++++++++++++++++++++++++++++ 2 files changed, 442 insertions(+) create mode 100644 utils/test_validate_agg_result.py create mode 100644 utils/validate_agg_result.py diff --git a/utils/test_validate_agg_result.py b/utils/test_validate_agg_result.py new file mode 100644 index 000000000..7b35f9f9e --- /dev/null +++ b/utils/test_validate_agg_result.py @@ -0,0 +1,179 @@ +"""Tests for validate_agg_result.py, covering both fixed-seq and agentic agg schemas.""" +import json +import math +import subprocess +import sys +from pathlib import Path + +import pytest + +from validate_agg_result import ( + validate, + check_identity, + check_numeric_finite, + check_throughput, + check_percentile_families, + check_monotonicity, +) + +SCRIPT = Path(__file__).parent / "validate_agg_result.py" + + +def fixed_seq_agg(): + """Valid fixed-seq agg: all latency families + intvty at p75..p99.9 (intvty decreasing).""" + data = { + "hw": "b200", "framework": "sglang", "precision": "fp8", + "model": "test/model", "infmax_model_prefix": "tm", + "conc": 8, "isl": 1024, "osl": 1024, "is_multinode": False, + "tp": 8, "ep": 1, "dp_attention": "false", + "tput_per_gpu": 1000.0, "output_tput_per_gpu": 800.0, "input_tput_per_gpu": 200.0, + "mean_tpot": 0.01, "mean_intvty": 100.0, + } + for i, p in enumerate((75, 90, 95, 99, 99.9), start=1): + k = str(int(p)) if p == int(p) else str(p) + data[f"p{k}_ttft"] = float(i) + data[f"p{k}_tpot"] = float(i) + data[f"p{k}_itl"] = float(i) + data[f"p{k}_e2el"] = float(i) + data[f"p{k}_intvty"] = 1000.0 / (i + 9) + return data + + +def agentic_agg(): + """Valid agentic agg: same families at p75/p90/p95, no isl/osl, intvty increasing.""" + data = 
{ + "hw": "b200", "framework": "sglang", "precision": "fp8", + "model": "test/model", "infmax_model_prefix": "tm", + "conc": 8, "is_multinode": False, + "tp": 8, "ep": 1, "dp_attention": "false", + "scenario_type": "agentic-coding", + "tput_per_gpu": 1000.0, "output_tput_per_gpu": 800.0, "input_tput_per_gpu": 200.0, + "mean_tpot": 0.01, "mean_intvty": 100.0, + "theoretical_cache_hit_rate": None, + } + for i, p in enumerate((75, 90, 95), start=1): + data[f"p{p}_ttft"] = float(i) + data[f"p{p}_tpot"] = float(i) + data[f"p{p}_itl"] = float(i) + data[f"p{p}_e2el"] = float(i) + data[f"p{p}_intvty"] = float(i) * 10 + return data + + +def test_fixed_seq_valid_passes(): + assert validate(fixed_seq_agg()) == [] + + +def test_agentic_valid_passes(): + assert validate(agentic_agg()) == [] + + +def test_agentic_intvty_must_increase(): + data = agentic_agg() + data["p95_intvty"] = data["p75_intvty"] - 1.0 + assert any("intvty" in e and "non-decreasing" in e for e in check_monotonicity(data)) + + +def test_fixed_seq_intvty_must_decrease(): + data = fixed_seq_agg() + data["p90_intvty"] = data["p75_intvty"] + 100.0 + assert any("intvty" in e and "non-increasing" in e for e in check_monotonicity(data)) + + +def test_missing_sibling_percentile_fails(): + data = fixed_seq_agg() + del data["p95_e2el"] + assert any("e2el" in e and "95" in e for e in check_percentile_families(data)) + + +def test_intvty_must_mirror_tpot(): + data = fixed_seq_agg() + del data["p99_intvty"] + assert any( + "p99_tpot present but p99_intvty missing" in e + for e in check_percentile_families(data) + ) + + +def test_latency_monotonicity_fails(): + data = fixed_seq_agg() + data["p90_ttft"] = data["p75_ttft"] - 1.0 + assert any("ttft" in e and "non-decreasing" in e for e in check_monotonicity(data)) + + +def test_negative_percentile_value_fails(): + data = fixed_seq_agg() + data["p90_ttft"] = -1.0 + assert any("non-negative" in e for e in check_monotonicity(data)) + + +def 
test_malformed_percentile_key_flagged(): + data = fixed_seq_agg() + data["p150_tpot"] = 1.0 + assert any("malformed" in e for e in check_percentile_families(data)) + + +def test_throughput_positive_required(): + data = fixed_seq_agg() + data["tput_per_gpu"] = 0.0 + assert any("tput_per_gpu" in e for e in check_throughput(data)) + + +def test_throughput_sum_is_not_asserted(): + data = fixed_seq_agg() + data["input_tput_per_gpu"] = 123.0 # input+output need not equal total + assert check_throughput(data) == [] + + +def test_nan_field_fails(): + data = fixed_seq_agg() + data["mean_tpot"] = math.nan + assert any("finite" in e for e in check_numeric_finite(data)) + + +def test_missing_identity_fails(): + data = fixed_seq_agg() + data["hw"] = "" + assert any("hw" in e for e in check_identity(data)) + + +def test_fixed_seq_requires_isl_osl(): + data = fixed_seq_agg() + del data["isl"] + assert any("isl" in e for e in check_identity(data)) + + +def test_agentic_does_not_require_isl_osl(): + assert all("isl" not in e and "osl" not in e for e in check_identity(agentic_agg())) + + +def test_multinode_decode_fields_may_be_zero(): + data = fixed_seq_agg() + data["is_multinode"] = True + for k in ("prefill_tp", "prefill_ep", "prefill_num_workers"): + data[k] = 4 + for k in ("decode_tp", "decode_ep", "decode_num_workers"): + data[k] = 0 + data["prefill_dp_attention"] = "true" + data["decode_dp_attention"] = "true" + assert check_identity(data) == [] + + +def _run_cli(tmp_path, payload): + path = tmp_path / "agg.json" + path.write_text(payload) + return subprocess.run([sys.executable, str(SCRIPT), str(path)], capture_output=True) + + +def test_cli_accepts_valid(tmp_path): + assert _run_cli(tmp_path, json.dumps(fixed_seq_agg())).returncode == 0 + + +def test_cli_rejects_non_dict_json(tmp_path): + assert _run_cli(tmp_path, "[]").returncode == 1 + + +def test_cli_rejects_invalid_agg(tmp_path): + data = fixed_seq_agg() + data["tput_per_gpu"] = -1.0 + assert _run_cli(tmp_path, 
json.dumps(data)).returncode == 1 diff --git a/utils/validate_agg_result.py b/utils/validate_agg_result.py new file mode 100644 index 000000000..d06e49211 --- /dev/null +++ b/utils/validate_agg_result.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 +"""Validate an aggregate benchmark result JSON before artifact upload. + +Checks structural and physical invariants for both fixed-seq and agentic results, +deriving the expected keys from the data rather than assuming a fixed percentile set. +""" +from __future__ import annotations + +import argparse +import json +import math +import re +import sys +from pathlib import Path +from typing import Any, Iterable + +LATENCY_FAMILIES = ("ttft", "tpot", "itl", "e2el") +INTERACTIVITY_FAMILY = "intvty" +THROUGHPUT_KEYS = ("tput_per_gpu", "output_tput_per_gpu", "input_tput_per_gpu") +AGENTIC_SCENARIO = "agentic-coding" + +# Prefix-style percentile keys: p75_tpot, p99.9_intvty. +_PCTL_KEY = re.compile(r"^p(\d+(?:\.\d+)?)_(.+)$") + + +def is_number(value: Any) -> bool: + """Real int/float, excluding bool.""" + return isinstance(value, (int, float)) and not isinstance(value, bool) + + +def is_positive_int(value: Any) -> bool: + """Integer greater than zero (bools excluded).""" + return isinstance(value, int) and not isinstance(value, bool) and value > 0 + + +def is_non_negative_int(value: Any) -> bool: + """Integer at least zero (bools excluded).""" + return isinstance(value, int) and not isinstance(value, bool) and value >= 0 + + +def fmt_pctl(rank: float) -> str: + """Render a percentile as the keys do: 90.0 -> '90', 99.9 -> '99.9'.""" + return str(int(rank)) if rank == int(rank) else str(rank) + + +def percentiles_present(data: dict[str, Any], family: str) -> dict[float, str]: + """Map percentile rank -> key for every p{rank}_{family} key in data.""" + found: dict[float, str] = {} + for key in data: + match = _PCTL_KEY.match(key) + if match and match.group(2) == family: + rank = float(match.group(1)) + if 0 <= rank <= 100: + found[rank] = key +
return found + + +def malformed_percentile_keys(data: dict[str, Any], families: Iterable[str]) -> list[str]: + """Keys shaped like p{rank}_{family} whose rank is outside the 0-100 range.""" + bad: list[str] = [] + family_set = set(families) + for key in data: + match = _PCTL_KEY.match(key) + if match and match.group(2) in family_set: + rank = float(match.group(1)) + if not 0 <= rank <= 100: + bad.append(key) + return sorted(bad) + + +def check_identity(data: dict[str, Any]) -> list[str]: + """Identity strings, run dimensions, and topology metadata are well-formed.""" + errors: list[str] = [] + for key in ("hw", "framework", "precision", "model", "infmax_model_prefix"): + value = data.get(key) + if not isinstance(value, str) or not value.strip(): + errors.append(f"{key} must be a non-empty string") + if not is_positive_int(data.get("conc")): + errors.append("conc must be a positive integer") + # Agentic runs are variable-length and carry no isl/osl. + if data.get("scenario_type") != AGENTIC_SCENARIO: + for key in ("isl", "osl"): + if not is_positive_int(data.get(key)): + errors.append(f"{key} must be a positive integer") + is_multinode = data.get("is_multinode") + if not isinstance(is_multinode, bool): + errors.append("is_multinode must be present as a bool") + return errors + if is_multinode: + for key in ("prefill_tp", "prefill_ep", "prefill_num_workers"): + if not is_positive_int(data.get(key)): + errors.append(f"{key} must be a positive integer for multinode topology") + for key in ("decode_tp", "decode_ep", "decode_num_workers"): + if not is_non_negative_int(data.get(key)): + errors.append(f"{key} must be a non-negative integer for multinode topology") + for key in ("prefill_dp_attention", "decode_dp_attention"): + if key not in data: + errors.append(f"{key} is required for multinode topology") + else: + for key in ("tp", "ep"): + if not is_positive_int(data.get(key)): + errors.append(f"{key} must be a positive integer for single-node topology") + if "dp_attention" not in
data: + errors.append("dp_attention is required for single-node topology") + return errors + + +def numeric_paths(value: Any, path: str = "") -> Iterable[tuple[str, float]]: + """Yield (path, number) for every numeric leaf in nested JSON-like data.""" + if isinstance(value, bool): + return + if isinstance(value, (int, float)): + yield path, value + return + if isinstance(value, dict): + for key, child in value.items(): + child_path = f"{path}.{key}" if path else str(key) + yield from numeric_paths(child, child_path) + elif isinstance(value, list): + for index, child in enumerate(value): + yield from numeric_paths(child, f"{path}[{index}]") + + +def check_numeric_finite(data: dict[str, Any]) -> list[str]: + """No numeric field may be NaN or +/-Infinity.""" + return [ + f"{path} must be finite" + for path, value in numeric_paths(data) + if not math.isfinite(value) + ] + + +def check_throughput(data: dict[str, Any]) -> list[str]: + """Per-GPU throughput fields are present, finite, and positive.""" + errors: list[str] = [] + for key in THROUGHPUT_KEYS: + value = data.get(key) + if not is_number(value) or not math.isfinite(value) or value <= 0: + errors.append(f"{key} must be a positive finite number") + return errors + + +def check_percentile_families(data: dict[str, Any]) -> list[str]: + """Families that report percentiles must report the same ranks, and intvty mirrors + tpot key-for-key. 
A family a run does not emit at all is not required.""" + errors: list[str] = [] + for key in malformed_percentile_keys(data, (*LATENCY_FAMILIES, INTERACTIVITY_FAMILY)): + errors.append(f"{key} is a malformed percentile key") + + present = {family: percentiles_present(data, family) for family in LATENCY_FAMILIES} + with_pctls = {family: ranks for family, ranks in present.items() if ranks} + if with_pctls: + union = set().union(*(set(ranks) for ranks in with_pctls.values())) + for family, ranks in with_pctls.items(): + for missing in sorted(union - set(ranks)): + errors.append( + f"metric '{family}' missing percentile p{fmt_pctl(missing)} " + "that other metrics report" + ) + + tpot_ranks = set(present["tpot"]) + intvty_ranks = set(percentiles_present(data, INTERACTIVITY_FAMILY)) + for missing in sorted(tpot_ranks - intvty_ranks): + errors.append( + f"p{fmt_pctl(missing)}_tpot present but p{fmt_pctl(missing)}_intvty missing" + ) + for extra in sorted(intvty_ranks - tpot_ranks): + errors.append( + f"p{fmt_pctl(extra)}_intvty present but p{fmt_pctl(extra)}_tpot missing" + ) + return errors + + +def _monotonic(data: dict[str, Any], family: str, increasing: bool) -> list[str]: + """One family's percentile values are non-negative, finite, and monotonic in rank.""" + entries = percentiles_present(data, family) + errors: list[str] = [] + prev_key = "" + prev_val: float | None = None + for rank in sorted(entries): + key = entries[rank] + value = data[key] + if not is_number(value) or not math.isfinite(value): + errors.append(f"{key} must be a finite percentile value") + prev_val = None + continue + if value < 0: + errors.append(f"{key} must be non-negative") + if prev_val is not None: + if increasing and value < prev_val: + errors.append( + f"{family} percentiles must be non-decreasing: " + f"{prev_key}={prev_val} > {key}={value}" + ) + if not increasing and value > prev_val: + errors.append( + f"{family} percentiles must be non-increasing: " + f"{prev_key}={prev_val} < 
{key}={value}" + ) + prev_key, prev_val = key, value + return errors + + +def check_monotonicity(data: dict[str, Any]) -> list[str]: + """Latency percentiles are non-decreasing in P. Interactivity is 1000/tpot for + fixed-seq (non-increasing in P) but a measured percentile of 1/itl for agentic + (non-decreasing).""" + errors: list[str] = [] + for family in LATENCY_FAMILIES: + errors += _monotonic(data, family, increasing=True) + agentic = data.get("scenario_type") == AGENTIC_SCENARIO + errors += _monotonic(data, INTERACTIVITY_FAMILY, increasing=agentic) + return errors + + +def validate(data: dict[str, Any]) -> list[str]: + """Return all validation errors for one aggregate result.""" + errors: list[str] = [] + errors += check_identity(data) + errors += check_numeric_finite(data) + errors += check_throughput(data) + errors += check_percentile_families(data) + errors += check_monotonicity(data) + return errors + + +def load_json(path: Path) -> Any: + """Load a JSON file.""" + with open(path, encoding="utf-8") as handle: + return json.load(handle) + + +def main() -> int: + """CLI: validate one aggregate result JSON; exit 1 with messages on failure.""" + parser = argparse.ArgumentParser( + description="Validate an InferenceX aggregate result JSON." 
+ ) + parser.add_argument("agg_json", type=Path) + args = parser.parse_args() + + try: + data = load_json(args.agg_json) + except (OSError, json.JSONDecodeError) as exc: + print(f"failed to load JSON: {exc}", file=sys.stderr) + return 1 + + if not isinstance(data, dict): + print("agg JSON must be an object", file=sys.stderr) + return 1 + + errors = validate(data) + if errors: + print(f"Agg result validation failed for {args.agg_json}:", file=sys.stderr) + for error in errors: + print(f" - {error}", file=sys.stderr) + return 1 + + print(f"Agg result validated: {args.agg_json}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From b470e3c0a05c7e82bdde8b5581991168dd6a8ad1 Mon Sep 17 00:00:00 2001 From: Wenyao Gao Date: Sun, 21 Jun 2026 21:44:15 -0700 Subject: [PATCH 2/2] ci: validate agg results pre-upload (fixed-seq/multinode/agentic); request p75/p95 percentiles --- .github/workflows/benchmark-multinode-tmpl.yml | 7 +++++++ .github/workflows/benchmark-tmpl.yml | 12 +++++++++++- .github/workflows/test-process-result.yml | 8 +++++++- benchmarks/benchmark_lib.sh | 1 + utils/test_process_result.py | 18 ++++++++++++++++++ 5 files changed, 44 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index e58cff478..b05d34cf9 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -260,6 +260,8 @@ jobs: fi echo "Extracted: gpus=$gpus, prefill_gpus=$prefill_gpus, decode_gpus=$decode_gpus" RESULT_FILENAME=${result_file%.json} IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py + agg_file="agg_${result_file%.json}.json" + python3 utils/validate_agg_result.py "$agg_file" fi fi done @@ -279,6 +281,11 @@ jobs: path: multinode_server_logs.tar.gz if-no-files-found: ignore + - name: Validate agentic aggregated result + if: ${{ !inputs.eval-only && inputs.scenario-type 
== 'agentic-coding' }} + run: | + python3 utils/validate_agg_result.py "${RESULT_FILENAME}.json" + - name: Upload agentic aggregated result if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }} uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index a57e89725..34de7fe2a 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -218,6 +218,11 @@ jobs: run: | python3 utils/process_result.py + - name: Validate agg result + if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }} + run: | + python3 utils/validate_agg_result.py "agg_${RESULT_FILENAME}.json" + - name: Upload result if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }} uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -225,8 +230,13 @@ jobs: name: bmk_${{ env.RESULT_FILENAME }} path: agg_${{ env.RESULT_FILENAME }}.json + - name: Validate agentic aggregated result + if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }} + run: | + python3 utils/validate_agg_result.py "${RESULT_FILENAME}.json" + - name: Upload agentic aggregated result - if: ${{ always() && inputs.scenario-type == 'agentic-coding' }} + if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }} uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: bmk_agentic_${{ env.RESULT_FILENAME }} diff --git a/.github/workflows/test-process-result.yml b/.github/workflows/test-process-result.yml index d6967775e..e8050433e 100644 --- a/.github/workflows/test-process-result.yml +++ b/.github/workflows/test-process-result.yml @@ -5,6 +5,12 @@ on: paths: - 'utils/process_result.py' - 'utils/test_process_result.py' + - 'utils/validate_agg_result.py' + - 'utils/test_validate_agg_result.py' + - 'benchmarks/benchmark_lib.sh' + - 
'.github/workflows/benchmark-tmpl.yml' + - '.github/workflows/benchmark-multinode-tmpl.yml' + - '.github/workflows/test-process-result.yml' permissions: contents: read @@ -33,4 +39,4 @@ jobs: - name: Run pytest run: | cd utils - pytest test_process_result.py -v + pytest test_process_result.py test_validate_agg_result.py -v diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 95e063a3d..774154f39 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -373,6 +373,7 @@ run_benchmark_serving() { --save-result --num-warmups "$((2 * max_concurrency))" \ --percentile-metrics 'ttft,tpot,itl,e2el' + --metric-percentiles '75,90,95,99,99.9' --result-dir "$result_dir" --result-filename "$result_filename.json" ) diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 4037689ea..f93a70396 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -6,11 +6,15 @@ """ import pytest import json +import re import subprocess import sys from pathlib import Path SCRIPT_PATH = Path(__file__).parent / "process_result.py" +BENCHMARK_LIB_PATH = ( + Path(__file__).resolve().parents[1] / "benchmarks" / "benchmark_lib.sh" +) # ============================================================================= @@ -102,6 +106,20 @@ def run_script(tmp_path, env, benchmark_result, result_filename="benchmark_resul ) +# ============================================================================= +# Test benchmark command contract +# ============================================================================= + +def test_benchmark_serving_requests_summary_percentiles(): + """Benchmark production must request every percentile summarize.py reads.""" + script = BENCHMARK_LIB_PATH.read_text(encoding="utf-8") + + match = re.search(r"--metric-percentiles\s+['\"]([^'\"]+)['\"]", script) + + assert match, "run_benchmark_serving must pass --metric-percentiles" + assert match.group(1).split(",") == ["75", "90", "95", "99", 
"99.9"] + + # ============================================================================= # Test get_required_env_vars function # =============================================================================