Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/benchmark-multinode-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,8 @@ jobs:
fi
echo "Extracted: gpus=$gpus, prefill_gpus=$prefill_gpus, decode_gpus=$decode_gpus"
RESULT_FILENAME=${result_file%.json} IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py
agg_file="agg_${result_file%.json}.json"
python3 utils/validate_agg_result.py "$agg_file"
fi
fi
done
Expand All @@ -279,6 +281,11 @@ jobs:
path: multinode_server_logs.tar.gz
if-no-files-found: ignore

- name: Validate agentic aggregated result
if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
run: |
python3 utils/validate_agg_result.py "${RESULT_FILENAME}.json"

- name: Upload agentic aggregated result
if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
Expand Down
12 changes: 11 additions & 1 deletion .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -218,15 +218,25 @@ jobs:
run: |
python3 utils/process_result.py

- name: Validate agg result
if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }}
run: |
python3 utils/validate_agg_result.py "agg_${RESULT_FILENAME}.json"

- name: Upload result
if: ${{ !inputs.eval-only && inputs.scenario-type != 'agentic-coding' }}
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: bmk_${{ env.RESULT_FILENAME }}
path: agg_${{ env.RESULT_FILENAME }}.json

- name: Validate agentic aggregated result
if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
run: |
python3 utils/validate_agg_result.py "${RESULT_FILENAME}.json"

- name: Upload agentic aggregated result
if: ${{ always() && inputs.scenario-type == 'agentic-coding' }}
if: ${{ !inputs.eval-only && inputs.scenario-type == 'agentic-coding' }}
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: bmk_agentic_${{ env.RESULT_FILENAME }}
Expand Down
8 changes: 7 additions & 1 deletion .github/workflows/test-process-result.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ on:
paths:
- 'utils/process_result.py'
- 'utils/test_process_result.py'
- 'utils/validate_agg_result.py'
- 'utils/test_validate_agg_result.py'
- 'benchmarks/benchmark_lib.sh'
- '.github/workflows/benchmark-tmpl.yml'
- '.github/workflows/benchmark-multinode-tmpl.yml'
- '.github/workflows/test-process-result.yml'

permissions:
contents: read
Expand Down Expand Up @@ -33,4 +39,4 @@ jobs:
- name: Run pytest
run: |
cd utils
pytest test_process_result.py -v
pytest test_process_result.py test_validate_agg_result.py -v
1 change: 1 addition & 0 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ run_benchmark_serving() {
--save-result
--num-warmups "$((2 * max_concurrency))" \
--percentile-metrics 'ttft,tpot,itl,e2el'
--metric-percentiles '75,90,95,99,99.9'
--result-dir "$result_dir"
--result-filename "$result_filename.json"
)
Expand Down
18 changes: 18 additions & 0 deletions utils/test_process_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,15 @@
"""
import pytest
import json
import re
import subprocess
import sys
from pathlib import Path

# Path to process_result.py, which sits in the same directory as this test module.
SCRIPT_PATH = Path(__file__).parent / "process_result.py"
# Path to benchmarks/benchmark_lib.sh, resolved from the parent of this
# module's directory (parents[1] of the resolved file path).
BENCHMARK_LIB_PATH = (
Path(__file__).resolve().parents[1] / "benchmarks" / "benchmark_lib.sh"
)


# =============================================================================
Expand Down Expand Up @@ -102,6 +106,20 @@ def run_script(tmp_path, env, benchmark_result, result_filename="benchmark_resul
)


# =============================================================================
# Test benchmark command contract
# =============================================================================

def test_benchmark_serving_requests_summary_percentiles():
    """Pin the percentile list that benchmark_lib.sh passes via --metric-percentiles.

    Reads the shell library as text, locates the first quoted
    ``--metric-percentiles`` argument, and checks that its comma-separated
    value is exactly 75, 90, 95, 99 and 99.9, in that order.
    """
    flag_pattern = r"--metric-percentiles\s+['\"]([^'\"]+)['\"]"
    lib_source = BENCHMARK_LIB_PATH.read_text(encoding="utf-8")
    found = re.search(flag_pattern, lib_source)

    assert found, "run_benchmark_serving must pass --metric-percentiles"
    requested = found.group(1).split(",")
    assert requested == ["75", "90", "95", "99", "99.9"]


# =============================================================================
# Test get_required_env_vars function
# =============================================================================
Expand Down
179 changes: 179 additions & 0 deletions utils/test_validate_agg_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""Tests for validate_agg_result.py, covering both fixed-seq and agentic agg schemas."""
import json
import math
import subprocess
import sys
from pathlib import Path

import pytest

from validate_agg_result import (
validate,
check_identity,
check_numeric_finite,
check_throughput,
check_percentile_families,
check_monotonicity,
)

# Path to validate_agg_result.py next to this module; the CLI tests run it as a subprocess.
SCRIPT = Path(__file__).parent / "validate_agg_result.py"


def fixed_seq_agg():
    """Build a fresh fixed-sequence aggregate payload used as a valid baseline.

    The dict carries identity fields (including ``isl``/``osl``), throughput
    and mean fields, and the ttft/tpot/itl/e2el/intvty families at p75, p90,
    p95, p99 and p99.9. Latency values rise with the percentile (1.0 .. 5.0)
    while ``intvty`` falls (1000 / (rank + 9)).

    Returns:
        A new dict on every call, so tests can mutate it freely.
    """
    payload = {
        "hw": "b200",
        "framework": "sglang",
        "precision": "fp8",
        "model": "test/model",
        "infmax_model_prefix": "tm",
        "conc": 8,
        "isl": 1024,
        "osl": 1024,
        "is_multinode": False,
        "tp": 8,
        "ep": 1,
        "dp_attention": "false",
        "tput_per_gpu": 1000.0,
        "output_tput_per_gpu": 800.0,
        "input_tput_per_gpu": 200.0,
        "mean_tpot": 0.01,
        "mean_intvty": 100.0,
    }
    percentile_labels = ("75", "90", "95", "99", "99.9")
    latency_families = ("ttft", "tpot", "itl", "e2el")
    for rank, label in enumerate(percentile_labels, start=1):
        # Keep the per-percentile key order: latency families first, intvty last.
        for family in latency_families:
            payload[f"p{label}_{family}"] = float(rank)
        payload[f"p{label}_intvty"] = 1000.0 / (rank + 9)
    return payload


def agentic_agg():
    """Build a fresh agentic-coding aggregate payload used as a valid baseline.

    Compared with the fixed-sequence fixture there is no ``isl``/``osl``,
    ``scenario_type`` is ``"agentic-coding"``, ``theoretical_cache_hit_rate``
    is ``None``, and only p75/p90/p95 are present. Both the latency families
    and ``intvty`` rise with the percentile here.

    Returns:
        A new dict on every call, so tests can mutate it freely.
    """
    payload = {
        "hw": "b200",
        "framework": "sglang",
        "precision": "fp8",
        "model": "test/model",
        "infmax_model_prefix": "tm",
        "conc": 8,
        "is_multinode": False,
        "tp": 8,
        "ep": 1,
        "dp_attention": "false",
        "scenario_type": "agentic-coding",
        "tput_per_gpu": 1000.0,
        "output_tput_per_gpu": 800.0,
        "input_tput_per_gpu": 200.0,
        "mean_tpot": 0.01,
        "mean_intvty": 100.0,
        "theoretical_cache_hit_rate": None,
    }
    latency_families = ("ttft", "tpot", "itl", "e2el")
    for rank, pct in enumerate((75, 90, 95), start=1):
        level = float(rank)
        # Keep the per-percentile key order: latency families first, intvty last.
        for family in latency_families:
            payload[f"p{pct}_{family}"] = level
        payload[f"p{pct}_intvty"] = level * 10
    return payload


def test_fixed_seq_valid_passes():
    """The baseline fixed-seq payload yields an empty error list from validate()."""
    errors = validate(fixed_seq_agg())
    assert errors == []


def test_agentic_valid_passes():
    """The baseline agentic payload yields an empty error list from validate()."""
    errors = validate(agentic_agg())
    assert errors == []


def test_agentic_intvty_must_increase():
    """Dropping p95_intvty below p75_intvty in an agentic payload is reported.

    Expects at least one monotonicity error mentioning both "intvty" and
    "non-decreasing".
    """
    payload = agentic_agg()
    payload["p95_intvty"] = payload["p75_intvty"] - 1.0
    errors = check_monotonicity(payload)
    assert any("intvty" in msg and "non-decreasing" in msg for msg in errors)


def test_fixed_seq_intvty_must_decrease():
    """Raising p90_intvty above p75_intvty in a fixed-seq payload is reported.

    Expects at least one monotonicity error mentioning both "intvty" and
    "non-increasing".
    """
    payload = fixed_seq_agg()
    payload["p90_intvty"] = payload["p75_intvty"] + 100.0
    errors = check_monotonicity(payload)
    assert any("intvty" in msg and "non-increasing" in msg for msg in errors)


def test_missing_sibling_percentile_fails():
    """Removing p95_e2el while other families keep p95 is reported.

    Expects at least one percentile-family error mentioning both "e2el" and "95".
    """
    payload = fixed_seq_agg()
    payload.pop("p95_e2el")
    errors = check_percentile_families(payload)
    assert any("e2el" in msg and "95" in msg for msg in errors)


def test_intvty_must_mirror_tpot():
    """Removing p99_intvty while p99_tpot remains is reported with a specific message."""
    expected_fragment = "p99_tpot present but p99_intvty missing"
    payload = fixed_seq_agg()
    payload.pop("p99_intvty")
    errors = check_percentile_families(payload)
    assert any(expected_fragment in msg for msg in errors)


def test_latency_monotonicity_fails():
    """Setting p90_ttft below p75_ttft is reported.

    Expects at least one monotonicity error mentioning both "ttft" and
    "non-decreasing".
    """
    payload = fixed_seq_agg()
    payload["p90_ttft"] = payload["p75_ttft"] - 1.0
    errors = check_monotonicity(payload)
    assert any("ttft" in msg and "non-decreasing" in msg for msg in errors)


def test_negative_percentile_value_fails():
    """A negative p90_ttft produces an error mentioning "non-negative"."""
    payload = fixed_seq_agg()
    payload["p90_ttft"] = -1.0
    errors = check_monotonicity(payload)
    assert any("non-negative" in msg for msg in errors)


def test_malformed_percentile_key_flagged():
    """Adding a p150_tpot key produces an error mentioning "malformed"."""
    payload = fixed_seq_agg()
    payload["p150_tpot"] = 1.0
    errors = check_percentile_families(payload)
    assert any("malformed" in msg for msg in errors)


def test_throughput_positive_required():
    """A tput_per_gpu of zero produces a throughput error naming that field."""
    payload = fixed_seq_agg()
    payload["tput_per_gpu"] = 0.0
    errors = check_throughput(payload)
    assert any("tput_per_gpu" in msg for msg in errors)


def test_throughput_sum_is_not_asserted():
    """Input and output throughput need not add up to the total.

    Changing input_tput_per_gpu so the sum no longer matches tput_per_gpu
    must still produce no throughput errors.
    """
    payload = fixed_seq_agg()
    payload["input_tput_per_gpu"] = 123.0
    errors = check_throughput(payload)
    assert errors == []


def test_nan_field_fails():
    """A NaN mean_tpot produces an error mentioning "finite"."""
    payload = fixed_seq_agg()
    payload["mean_tpot"] = math.nan
    errors = check_numeric_finite(payload)
    assert any("finite" in msg for msg in errors)


def test_missing_identity_fails():
    """An empty-string hw produces an identity error naming "hw"."""
    payload = fixed_seq_agg()
    payload["hw"] = ""
    errors = check_identity(payload)
    assert any("hw" in msg for msg in errors)


def test_fixed_seq_requires_isl_osl():
    """Removing isl from a fixed-seq payload produces an identity error naming "isl"."""
    payload = fixed_seq_agg()
    payload.pop("isl")
    errors = check_identity(payload)
    assert any("isl" in msg for msg in errors)


def test_agentic_does_not_require_isl_osl():
    """No identity error for the agentic payload mentions "isl" or "osl"."""
    errors = check_identity(agentic_agg())
    assert not any("isl" in msg or "osl" in msg for msg in errors)


def test_multinode_decode_fields_may_be_zero():
    """A multinode payload with every decode_* count set to 0 passes identity checks.

    The prefill tp/ep/num_workers fields are set to 4, the decode ones to 0,
    and both dp_attention flags to "true"; check_identity must return no errors.
    """
    prefill_keys = ("prefill_tp", "prefill_ep", "prefill_num_workers")
    decode_keys = ("decode_tp", "decode_ep", "decode_num_workers")

    payload = fixed_seq_agg()
    payload["is_multinode"] = True
    payload.update(dict.fromkeys(prefill_keys, 4))
    payload.update(dict.fromkeys(decode_keys, 0))
    payload["prefill_dp_attention"] = "true"
    payload["decode_dp_attention"] = "true"

    errors = check_identity(payload)
    assert errors == []


def _run_cli(tmp_path, payload):
    """Write *payload* to ``agg.json`` under *tmp_path* and run the validator on it.

    Args:
        tmp_path: Directory (a ``pathlib.Path``) in which to create ``agg.json``.
        payload: Raw text to store in the file; it need not be valid JSON.

    Returns:
        The ``subprocess.CompletedProcess`` from running ``SCRIPT`` with the
        current interpreter, with stdout and stderr captured.
    """
    agg_path = tmp_path / "agg.json"
    # Pin the encoding so the bytes on disk do not depend on the runner's locale.
    agg_path.write_text(payload, encoding="utf-8")
    command = [sys.executable, str(SCRIPT), str(agg_path)]
    return subprocess.run(command, capture_output=True)


def test_cli_accepts_valid(tmp_path):
    """The CLI exits with status 0 for the baseline fixed-seq payload."""
    serialized = json.dumps(fixed_seq_agg())
    completed = _run_cli(tmp_path, serialized)
    assert completed.returncode == 0


def test_cli_rejects_non_dict_json(tmp_path):
    """The CLI exits with status 1 when the file holds a JSON array, not an object."""
    completed = _run_cli(tmp_path, "[]")
    assert completed.returncode == 1


def test_cli_rejects_invalid_agg(tmp_path):
    """The CLI exits with status 1 when tput_per_gpu is negative."""
    payload = fixed_seq_agg()
    payload["tput_per_gpu"] = -1.0
    completed = _run_cli(tmp_path, json.dumps(payload))
    assert completed.returncode == 1
Loading