From 2854c96b5730e63bed8360ff2a10e58ac648d65f Mon Sep 17 00:00:00 2001 From: Liang Juhao Date: Fri, 15 May 2026 11:09:09 +0800 Subject: [PATCH 1/5] =?UTF-8?q?feat:=20add=201Cat-vLLM=20runner=20for=20Te?= =?UTF-8?q?sla=20V100=20=E2=80=94=20nvidia=5Fonecat=5Fvllm=5Fa43d1bcf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the AccelMark runner for the 1Cat-vLLM community fork that re-enables AWQ 4-bit inference on Volta (SM70) Tesla V100 via lmdeploy TurboMind kernels and the FLASH_ATTN_V100 attention backend. What is included: * runners/nvidia_onecat_vllm_a43d1bcf/ — runner.py, meta.json (with hardware_label="NVIDIA V100 (SM70)" and suite_support self-declaration), requirements.txt, README.md * configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example The README platforms matrix updates automatically — the hardware label is taken from meta.hardware_label rather than the catalogue default, so the V100-specific row is rendered correctly without touching schema/platforms.json or any shared file. Capability flags: * SUPPORTED_PRECISIONS drops BF16 (V100 has no native BF16 datapath). * SUPPORTED_QUANTIZATION_BACKENDS lists only AWQ — the fork's headline contribution; FP8 KV cache and other formats are intentionally not exposed by default. * Auto-injects attention_backend=FLASH_ATTN_V100 unless the user overrides it. * Suite F (Qwen2.5-0.5B-Instruct on a consumer/edge GPU) is marked unsupported — 1Cat-vLLM targets dense + MoE on 4 x V100, not edge inference. Initial commit, not yet validated end-to-end on hardware; all applicable suites are marked "pending". 
Co-authored-by: Cursor --- README.md | 1 + ...r_nvidia_onecat_vllm_a43d1bcf.yaml.example | 71 +++ runners/nvidia_onecat_vllm_a43d1bcf/README.md | 150 +++++ runners/nvidia_onecat_vllm_a43d1bcf/meta.json | 21 + .../requirements.txt | 58 ++ runners/nvidia_onecat_vllm_a43d1bcf/runner.py | 517 ++++++++++++++++++ 6 files changed, 818 insertions(+) create mode 100644 configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example create mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/README.md create mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/meta.json create mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt create mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/runner.py diff --git a/README.md b/README.md index 3007966..1c4acbd 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:| | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | +| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_a43d1bcf` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ | | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — | diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example new file mode 100644 index 0000000..d39949e --- /dev/null +++ b/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example @@ -0,0 +1,71 @@ +# AccelMark runner config — nvidia_onecat_vllm_a43d1bcf (1Cat-vLLM on V100) +# +# Copy this file to runner_nvidia_onecat_vllm_a43d1bcf.yaml (remove the +# .example suffix) and adjust for your hardware. The actual .yaml is +# gitignored. 
+# +# These settings adapt the runner to your hardware environment. They are +# recorded in result.json task.extra_config but are NOT part of the +# benchmark identity (not hashed into run_id). +# +# Merge priority: CLI flags > suite-specific > global defaults > runner defaults + +# ── Global defaults (apply to all suites) ───────────────────────────────────── + +# Tensor parallel size — number of V100 cards (1Cat-vLLM 1.0.0 public ref: +# 4 x V100 32 GB). For 256K context the recommended TP is 4 on 32 GB cards. +tensor_parallel_size: 4 + +# Disable Volta CUDA-graph capture. Set true if you hit Triton sm>=80 errors. +# 1Cat-vLLM normally captures graphs without --enforce-eager — leave false +# unless you've observed startup hangs on the first request. +enforce_eager: false + +# 1Cat 1.0.0 public default is 1 (low-concurrency stable serving). +# Bump to 4 only for the MTP + prefix-cache profile. +max_num_seqs: 1 + +# 1Cat 1.0.0 public default. Reduce to 0.85 if you see OOM at engine init. +gpu_memory_utilization: 0.88 + +# Pass-through kwargs forwarded to vLLM LLM() / AsyncEngineArgs(). 1Cat-vLLM +# recognises `attention_backend` and a few extra knobs (kv_cache_auto_trim_ratio, +# compilation_config, speculative_config). Keys must use the underscore spelling; keys the installed EngineArgs does not +# define are dropped by the runner with a warning before engine init, so this is safe across vLLM 0.7.x / 1Cat-1.0.0. +# +# Defaults left commented out below — the runner auto-injects +# `attention_backend: FLASH_ATTN_V100` if you don't set it here or via the +# VLLM_ATTENTION_BACKEND environment variable. 
+# +# engine_kwargs: +# attention_backend: FLASH_ATTN_V100 # auto-injected by the runner +# kv_cache_auto_trim_ratio: 0.0 # disables 1Cat KV auto-trim (MTP profile) +# mamba_cache_mode: align # required for Qwen3.6 hybrid Mamba +# compilation_config: +# cudagraph_mode: full_and_piecewise +# cudagraph_capture_sizes: [1, 2, 4, 8] + +# ── Suite-specific overrides ─────────────────────────────────────────────────── + +suites: + suite_D: + # Long-context — keep max_num_seqs low and reserve more memory. + max_num_seqs: 1 + gpu_memory_utilization: 0.85 + + suite_C: + # Quantization suite — AWQ is the primary 1Cat target. + # max_model_len: 12288 reproduces the 1Cat 1.0.0 internal speed harness. + max_num_seqs: 1 + + # MTP profile for Qwen3.6-27B-AWQ — copy this and uncomment to use. + # suite_A: + # max_num_seqs: 4 + # engine_kwargs: + # enable_prefix_caching: true + # speculative_config: + # method: mtp + # num_speculative_tokens: 4 + # compilation_config: + # cudagraph_mode: full_and_piecewise + # cudagraph_capture_sizes: [1, 2, 4, 8, 9, 18] diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/README.md b/runners/nvidia_onecat_vllm_a43d1bcf/README.md new file mode 100644 index 0000000..eb9f5c0 --- /dev/null +++ b/runners/nvidia_onecat_vllm_a43d1bcf/README.md @@ -0,0 +1,150 @@ +# nvidia_onecat_vllm_a43d1bcf — 1Cat-vLLM Runner (Tesla V100 / SM70) + +AccelMark runner for **Tesla V100 / V100S only**, using +[1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) — the community vLLM fork +that re-enables modern AWQ 4-bit serving and FlashAttention on Volta GPUs +(SM70). + +> **Hardware scope:** This runner is intentionally narrow. On Ampere +> (A100/A800/A10/L4/4090/etc.) or newer, use the upstream +> `nvidia_vllm_*` runner — 1Cat-vLLM's kernels are tuned for SM70 and +> provide no benefit on later architectures. + +> **Status:** Committed without an end-to-end validation run yet. 
The runner +> code is a thin specialisation of the upstream NVIDIA vLLM runner (only +> capability flags + attention-backend default differ), so existing test +> coverage of the parent runner applies. Plan to add a reference +> `Tesla V100-SXM2-32GBx4 suite_B` result once a target box is available. + +## Why 1Cat-vLLM exists + +| Pain on stock vLLM + V100 | 1Cat-vLLM's fix | +|---|---| +| AWQ kernels require SM75+ | Integrated lmdeploy TurboMind WMMA kernels for SM70 | +| FlashAttention 2/3 require Ampere+ | Custom `FLASH_ATTN_V100` Volta backend | +| Qwen3.5 / Qwen3.6 dense + MoE not loadable | Model configs and runtime fixes shipped in fork | +| Long-context paged-prefill stability | SM70-specific MLA/GDN runtime fixes | +| FP8 KV cache | `fp8_e5m2` (experimental) on V100 FA path | + +For full release notes see + RELEASE_NOTES_1.0.0.md. + +## Defaults this runner injects + +| Knob | Default | Where set | Why | +|---|---|---|---| +| `attention_backend` | `FLASH_ATTN_V100` | `load_model()` if not already specified | 1Cat-vLLM's recommended V100 path | +| `SUPPORTED_PRECISIONS` | `["fp16", "fp32"]` | class attribute | V100 has no BF16 | +| `SUPPORTED_QUANTIZATION_BACKENDS` | `["awq"]` | class attribute | 1Cat's headline kernel; other formats not validated on this stack | +| `max_num_seqs` | `1` | runner config default | 1Cat 1.0.0 public default — 256K context on V100 | +| `gpu_memory_utilization` | `0.88` | runner config default | 1Cat 1.0.0 public default | + +To opt into the MTP + prefix-cache profile (Qwen3.6-27B-AWQ), bump +`max_num_seqs` to `4` and pass `speculative_config` via the runner config +`engine_kwargs` — see the example config file. + +## Supported suites + +| Suite | Recommendation | +|-------|---------------| +| Suite A — Llama-3-8B 1× | Runs, but vanilla `nvidia_vllm_47f5d58e --enforce-eager` already covers this. Use 1Cat only if you want the FA-V100 attention path. 
| +| Suite B — Llama-3-70B multi-chip | **Primary target.** Recommended `--tensor-parallel-size 4`. | +| Suite C — Quantization | Restricted to AWQ — this is where 1Cat shines. | +| Suite D — Long context (~28K) | **Primary target.** `FLASH_ATTN_V100` is the only V100-friendly long-context path. | +| Suite E — Scaling | Same considerations as Suite B; useful for measuring how 1Cat's multi-GPU tensor-parallel path scales. | +| Suite F — Qwen2.5-0.5B edge | Unsupported by this runner (marked `unsupported` in `meta.json`) — the model fits trivially on V100; use the upstream runner. | +| Suite G — MoE | Sweet spot — `Qwen3.6-35B-A3B-AWQ`, `Qwen3.5-122B-A10B-AWQ` are exactly the validated MoE models in 1Cat 1.0.0. | + +## Prerequisites + +```bash +# 1. CUDA 12.8 toolkit + matching driver (570.x recommended) +# https://developer.nvidia.com/cuda-12-8-0-download-archive + +# 2. Python 3.12 (1Cat 1.0.0 ships cp312 wheels only) +conda create -y -n 1cat-vllm-1.0.0 python=3.12 +conda activate 1cat-vllm-1.0.0 + +# 3. Install the 1Cat-vLLM wheels +pip install --prefer-binary --no-cache-dir \ + --extra-index-url https://download.pytorch.org/whl/cu128 \ + "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \ + "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl" + +# 4. Install AccelMark extras +pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt +``` + +## Smoke test the install + +```bash +python - <<'PY' +import torch, vllm +print("torch:", torch.__version__, " vllm:", vllm.__version__) +try: + import flash_attn_v100_cuda # SM70 FA kernels + print("flash_attn_v100: ok") +except Exception as e: + print("flash_attn_v100: MISSING ->", e) +PY +``` + +The `flash_attn_v100_cuda` extension (from the `flash_attn_v100` wheel) MUST be importable — if it isn't, you accidentally +installed plain vLLM from PyPI; reinstall from the 1Cat release wheels above. 
+ +## Basic usage + +```bash +# Suite D (long-context) on 4 x V100 32 GB +python run.py --runner nvidia_onecat_vllm_a43d1bcf \ + --suite suite_D \ + --tensor-parallel-size 4 + +# Suite C with AWQ (Qwen3.5-27B-AWQ as the validation model) +python run.py --runner nvidia_onecat_vllm_a43d1bcf \ + --suite suite_C \ + --tensor-parallel-size 4 \ + --model-path /data/models/Qwen3.5-27B-AWQ + +# Override attention backend (rare — for benchmarking vs Triton fallback): +# first set engine_kwargs.attention_backend in your runner config, then run +python run.py --runner nvidia_onecat_vllm_a43d1bcf \ + --suite suite_B \ + --tensor-parallel-size 4 +``` + +## Runner config + +Copy the example: + +```bash +cp configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example \ + configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml +``` + +Key defaults differ from the upstream NVIDIA runner: + +| Field | 1Cat default | Upstream default | Notes | +|-------|--------------|------------------|-------| +| `max_num_seqs` | 1 | 512 | 256K context demands a very tight KV cache budget | +| `gpu_memory_utilization` | 0.88 | 0.90 | Matches 1Cat 1.0.0 public reference | +| `engine_kwargs.attention_backend` | `FLASH_ATTN_V100` (auto) | — | Auto-injected unless overridden | + +## Known gaps (pre-smoke-test) + +- The Volta CUDA-graph capture path needs validation under + `--scenario sustained`. If startup hangs on the first request, set + `enforce_eager: true` in your runner config. +- The accuracy gate uses the suite's stock prompts — on AWQ checkpoints + the gate threshold may be too tight; the suite spec already allows + per-format thresholds (Suite C) so this is mostly relevant on Suite A/D. +- MTP / speculative profiles are documented in 1Cat 1.0.0 but not + exercised here yet; flat speculative keys in `_precision_engine_kwargs` + are still forwarded as `speculative_config` by `benchmark_runner.py`, + the same as for the upstream runner. + +## Requirements + +See `requirements.txt`. 
The heavy dependencies (`torch`, `flash_attn_v100`, +`vllm` fork) MUST come from the 1Cat-vLLM release wheels — do not install +upstream `vllm` from PyPI. diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/meta.json b/runners/nvidia_onecat_vllm_a43d1bcf/meta.json new file mode 100644 index 0000000..b86b000 --- /dev/null +++ b/runners/nvidia_onecat_vllm_a43d1bcf/meta.json @@ -0,0 +1,21 @@ +{ + "id": "nvidia_onecat_vllm_a43d1bcf", + "platform": "nvidia", + "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA", + "framework": "1Cat-vLLM", + "submitted_by": "JuhaoLiang1997", + "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — the community vLLM fork that re-enables AWQ 4-bit inference on Volta via lmdeploy TurboMind kernels and the FLASH_ATTN_V100 attention backend. Targets Qwen3.5 / Qwen3.6 dense + MoE on 4 x V100 32 GB. Use the upstream nvidia_vllm runner on Ampere or newer.", + "supersedes_chain": [], + "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless the user overrides it. SUPPORTED_PRECISIONS drops BF16 (V100 has no native BF16 datapath). SUPPORTED_QUANTIZATION_BACKENDS lists only AWQ — the fork's headline contribution; FP8 KV cache and other formats are intentionally not exposed by default. Initial commit, not yet validated end-to-end on hardware.", + "created": "2026-05-15", + "hardware_label": "NVIDIA V100 (SM70)", + "suite_support": { + "A": "pending", + "B": "pending", + "C": "pending", + "D": "pending", + "E": "pending", + "F": "unsupported", + "G": "pending" + } +} diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt b/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt new file mode 100644 index 0000000..01e687d --- /dev/null +++ b/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt @@ -0,0 +1,58 @@ +# AccelMark -- 1Cat-vLLM (SM70 / V100) runner dependencies +# +# 1Cat-vLLM is a community fork of vLLM tuned for Tesla V100. 
It ships two +# wheels that must be installed together: +# - flash_attn_v100 (Volta-optimised FlashAttention kernels) +# - vllm (patched fork, exposes the FLASH_ATTN_V100 backend +# and AWQ 4-bit kernels for SM70) +# +# Both wheels are published as GitHub release assets at: +# https://github.com/1CatAI/1Cat-vLLM/releases +# +# Reference validated stack (1Cat-vLLM 1.0.0): +# OS: Ubuntu 24.04 +# Python: 3.12 +# CUDA: 12.8 +# PyTorch: 2.9.1+cu128 +# Driver: 570.211.01 +# GPU: 4 x Tesla V100 32 GB +# +# Installation: +# # 1. Install CUDA 12.8 toolkit and matching driver +# # 2. Create a fresh Python 3.12 environment +# # 3. Install the two 1Cat-vLLM wheels from the release page: +# pip install --prefer-binary --no-cache-dir \ +# --extra-index-url https://download.pytorch.org/whl/cu128 \ +# "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \ +# "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl" +# +# # 4. Then install the AccelMark extras below: +# pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt +# +# Note: do NOT add `torch==2.x` here — the matching torch wheel is pulled in +# by the 1Cat-vLLM wheel install command above. Listing torch here would +# fight with the cu128 extra-index-url. + +# Transformers stack — compatible with 1Cat-vLLM 1.0.0 (vllm fork 1.0.0, +# based on upstream vLLM 0.7.x line). Versions match the upstream +# nvidia_vllm_47f5d58e runner so we know they're consistent. 
+transformers==4.57.6 +tokenizers==0.22.2 +huggingface-hub==0.35.0 +accelerate==1.10.1 +safetensors==0.6.2 + +# AccelMark dependencies +numpy==1.26.4 +jsonschema==4.25.1 +psutil==7.1.0 +tqdm==4.67.1 + +# NVIDIA monitoring (for power and GPU stats — same as upstream NVIDIA runner) +nvidia-ml-py==13.580.82 + +# Async support +aiohttp==3.12.15 + +# Config file parsing +PyYAML==6.0.2 diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/runner.py b/runners/nvidia_onecat_vllm_a43d1bcf/runner.py new file mode 100644 index 0000000..56002bb --- /dev/null +++ b/runners/nvidia_onecat_vllm_a43d1bcf/runner.py @@ -0,0 +1,517 @@ +""" +AccelMark — NVIDIA 1Cat-vLLM (SM70 / V100) benchmark script. + +Implements BenchmarkRunner for `1Cat-vLLM`, the community fork of vLLM that +re-enables modern model serving on Tesla V100 / SM70 hardware. The fork +preserves the standard vLLM Python API (``LLM``, ``AsyncLLMEngine``, +``SamplingParams``) but ships several SM70-specific pieces: + + - the ``FLASH_ATTN_V100`` attention backend (FlashAttention re-implemented + for Volta) — set as this runner's default + - AWQ 4-bit kernels patched in from lmdeploy TurboMind for SM70 + - validated paths for Qwen3.5 / Qwen3.6 dense + MoE on 4 x V100 32 GB + - FP8 KV cache (``fp8_e5m2``) as an experimental option + +Because the Python entry points are identical to upstream vLLM, this runner +is structurally a clone of ``nvidia_vllm_*`` with three runtime overrides: + + 1. ``SUPPORTED_PRECISIONS`` drops BF16 (V100 cannot do BF16 in hardware). + 2. ``SUPPORTED_QUANTIZATION_BACKENDS`` advertises only ``awq`` — the fork's + headline feature; other quantizations are unproven on this stack. + 3. ``load_model()`` auto-injects ``attention_backend="FLASH_ATTN_V100"`` + into the engine kwargs unless the user has explicitly set one. + +Reference build: + Python 3.12 · CUDA 12.8 · torch 2.9.1+cu128 · 1Cat-vLLM 1.0.0 wheels + Validated on 4 x Tesla PG503 / V100 32 GB. 
+ +All orchestration logic still lives in runners/benchmark_runner.py. +""" + +import asyncio +import sys +import time +from pathlib import Path +from typing import Optional + +# Add repo root to path +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(_REPO_ROOT)) + +import torch +from vllm import LLM, AsyncLLMEngine, SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from transformers import AutoTokenizer + +from runners.benchmark_runner import BenchmarkRunner, InferenceRequest +from loadgen.types import InferenceResult + + + +# Suppress per-request vLLM logs by default +import logging +logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING) +logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING) + + +class OneCatVLLMRunner(BenchmarkRunner): + """ + AccelMark benchmark runner using 1Cat-vLLM (SM70 / V100 fork) on NVIDIA. + + This runner is intended **specifically for Tesla V100 / V100S (SM70)**. + Other NVIDIA GPUs should use the upstream ``nvidia_vllm_*`` runner — + 1Cat-vLLM's kernels are tuned for Volta and provide no advantage on + Ampere or newer hardware. + """ + + SUPPORTS_STREAMING = True + SUPPORTS_BATCHING = True + SUPPORTS_ONLINE = True + SUPPORTS_MULTI_CHIP = True + + # V100 has no BF16 datapath — drop BF16 entirely so the suite picks + # FP16 as the effective precision without an amber warning. FP32 is left + # in for completeness but is essentially never used in inference. + SUPPORTED_PRECISIONS = ["fp16", "fp32"] + + # 1Cat-vLLM's flagship contribution is AWQ 4-bit on SM70 (via lmdeploy + # TurboMind kernels). Other quantization backends are not validated on + # this stack — keep the surface conservative and let users opt-in by + # subclassing if they want to try FP8 KV cache (``fp8_e5m2`` only). 
+ SUPPORTED_QUANTIZATION_BACKENDS = ["awq"] + + def __init__(self): + self.llm: LLM = None + self.engine: AsyncLLMEngine = None + self.tokenizer: AutoTokenizer = None + self.sampling_params: SamplingParams = None + self._loop: asyncio.AbstractEventLoop = None + + def _get_chip_count(self) -> int: + """Return the number of available CUDA GPUs.""" + try: + import torch + n = torch.cuda.device_count() + return n if n > 0 else 1 + except Exception: + return 1 + + def _get_framework_name(self) -> str: + return "1Cat-vLLM" + + def _get_framework_version(self) -> str: + """Report vllm.__version__ plus the flash_attn_v100 wheel version. + + 1Cat-vLLM ships two coupled wheels (`vllm` patched fork + `flash_attn_v100`) + and the FA-V100 wheel is the bit that actually changes attention + performance on Volta. Recording both makes the result reproducible. + """ + core = "unknown" + try: + import vllm + core = vllm.__version__ + except Exception: + pass + + fa_v100 = None + try: + from importlib.metadata import version as _pkg_version + fa_v100 = _pkg_version("flash_attn_v100") + except Exception: + try: + import flash_attn_v100_cuda # type: ignore # noqa: F401 + fa_v100 = "installed" + except Exception: + fa_v100 = None + + if fa_v100: + return f"{core}+flash_attn_v100-{fa_v100}" + return core + + def load_model(self, model_path: str, parallelism: dict) -> None: + """Load model — sync LLM for offline/accuracy, async engine for streaming.""" + tp_size = parallelism["tensor_parallel_size"] + pp_size = parallelism["pipeline_parallel_size"] + ep_size = parallelism.get("expert_parallel_size", 1) + assert pp_size <= 1, "Pipeline parallelism is not supported in OneCatVLLMRunner" + + max_tokens = parallelism["max_tokens"] + max_model_len = parallelism["max_model_len"] + use_async = parallelism["use_async"] + enforce_eager = getattr(self, "_enforce_eager", False) + + cfg = getattr(self, "_runner_config", {}) + # 1Cat-vLLM public defaults for 4 x V100 32 GB: + # max_num_seqs = 1 
(baseline) or 4 (MTP profile) + # gpu_memory_utilization = 0.88 + # These differ noticeably from upstream vLLM's 512 / 0.90 defaults + # because 256K context on V100 demands a much tighter KV budget. + max_num_seqs = cfg.get("max_num_seqs", 1) + gpu_memory_util = cfg.get("gpu_memory_utilization", 0.88) + extra_kwargs = dict(cfg.get("engine_kwargs") or {}) + + # ── Default to FLASH_ATTN_V100 attention backend ────────────────────── + # 1Cat-vLLM's public recommendation for V100 is FLASH_ATTN_V100. Inject + # it as the default unless the user explicitly set a backend in their + # runner config engine_kwargs, or via the VLLM_ATTENTION_BACKEND + # environment variable. We support both spellings because vLLM accepts + # the kwarg as `attention_backend` and the env var as + # VLLM_ATTENTION_BACKEND. + import os + if ( + "attention_backend" not in extra_kwargs + and "VLLM_ATTENTION_BACKEND" not in os.environ + ): + extra_kwargs["attention_backend"] = "FLASH_ATTN_V100" + + # ── Filter engine_kwargs to only fields this vLLM version accepts ───── + # Avoids TypeError when the runner config YAML references a field that + # doesn't exist in the installed vLLM version (EngineArgs is a strict + # dataclass — unknown keyword arguments raise TypeError immediately). 
+ try: + import dataclasses + from vllm.engine.arg_utils import EngineArgs as _EngineArgs + _valid = {f.name for f in dataclasses.fields(_EngineArgs)} + _dropped = {k: v for k, v in extra_kwargs.items() if k not in _valid} + if _dropped: + print(f" Warning: engine_kwargs keys not supported by this " + f"1Cat-vLLM version and will be ignored: {list(_dropped)}") + extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid} + except Exception: + pass # If introspection fails, pass kwargs as-is and let vLLM report the error + + # Use precision resolved by BenchmarkRunner._resolve_precision() + effective_precision = getattr(self, "_effective_precision", "BF16").upper() + precision = getattr(self, "_precision", None) or effective_precision + + # dtype_override and quantization may be injected by benchmark_runner from + # precision_model_map entry fields (dtype_override, engine_kwargs.quantization). + # These take priority over the runner's own precision→dtype mapping below. + _dtype_override = getattr(self, "_precision_dtype_override", None) + _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {}) + + quantization = _prec_eng_kwargs.pop("quantization", None) + + # Map native precision names to explicit dtypes. + # Quantized formats (anything not in this map) use dtype="auto" — vLLM reads + # the storage dtype from the checkpoint's config.json, and the quantization + # kernel is set explicitly via the `quantization` kwarg already populated above + # from precision_model_map engine_kwargs. No fallback guessing needed here. + _NATIVE_DTYPE_MAP = { + "BF16": "bfloat16", + "FP16": "float16", + "FP32": "float32", + } + dtype = _NATIVE_DTYPE_MAP.get(precision, "auto") + self._quantization_method = quantization # None for native, explicit str for quantized + + # dtype_override from precision_model_map wins over the mapping above. + # Used for e.g. FP16 baseline on pre-Ampere hardware (V100/T4). 
+ if _dtype_override: + dtype = _dtype_override + + # Merge remaining precision_engine_kwargs (after popping quantization) into + # extra_kwargs so they reach LLM() / AsyncEngineArgs. Runner YAML engine_kwargs + # still take final precedence via the **extra_kwargs spread at the end. + if _prec_eng_kwargs: + _prec_eng_kwargs.update(extra_kwargs) # runner YAML wins on conflict + extra_kwargs = _prec_eng_kwargs + + print(f"Loading model: precision={precision}, dtype={dtype}" + + (f", quantization_method={self._quantization_method}" + if self._quantization_method else "")) + + self.tokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=False + ) + + self.sampling_params = SamplingParams( + max_tokens=max_tokens, + temperature=0.0, + ) + + if not use_async: + llm_kwargs = dict( + model=model_path, + dtype=dtype, + tensor_parallel_size=tp_size, + trust_remote_code=False, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + gpu_memory_utilization=gpu_memory_util, + **extra_kwargs, + ) + if ep_size > 1: + llm_kwargs["enable_expert_parallel"] = True + llm_kwargs["tensor_parallel_size"] = tp_size + if quantization: + llm_kwargs["quantization"] = quantization + if max_model_len: + llm_kwargs["max_model_len"] = max_model_len + self.llm = LLM(**llm_kwargs) + else: + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + engine_kwargs = dict( + model=model_path, + dtype=dtype, + tensor_parallel_size=tp_size, + trust_remote_code=False, + enforce_eager=enforce_eager, + gpu_memory_utilization=gpu_memory_util, + # engine_kwargs values override named fields above if the same key appears in both. + # This is intentional — engine_kwargs is the power-user escape hatch. 
+ **extra_kwargs, + ) + if ep_size > 1: + engine_kwargs["enable_expert_parallel"] = True + if max_model_len: + engine_kwargs["max_model_len"] = max_model_len + engine_args = AsyncEngineArgs(**engine_kwargs) + self.engine = AsyncLLMEngine.from_engine_args(engine_args) + + def get_effective_dtype(self) -> Optional[str]: + """ + Report the actual compute dtype vLLM used after model loading. + + vLLM exposes the resolved dtype via model_config after initialization. + This captures cases like FP8 weights on A100 computing in BF16. + """ + try: + if self.llm is not None: + # Sync LLM path + dtype = self.llm.llm_engine.model_config.dtype + return str(dtype).replace("torch.", "") + elif self.engine is not None: + # Async engine path + dtype = self.engine.engine.model_config.dtype + return str(dtype).replace("torch.", "") + except Exception: + pass + # Fall back to declared dtype if introspection fails + return getattr(self, "_effective_dtype", None) + + def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]: + """Send all requests to vLLM at once. vLLM handles internal batching. + + total_time_ms in each returned InferenceResult is set to the wall-clock + elapsed time of the entire batch — NOT an individual per-request latency. + vLLM's sync LLM.generate() blocks until all requests finish, so there is + no per-request completion timestamp available. 
All results share the same + total_time_ms value, which is the correct denominator for throughput: + throughput = total_tokens / (elapsed_ms / 1000) + """ + formatted = [self._format_prompt(r.prompt) for r in requests] + t_start = time.perf_counter() + outputs = self.llm.generate(formatted, self.sampling_params) + elapsed = time.perf_counter() - t_start + + # Store output text for _run_accuracy_integrated() + self._last_accuracy_outputs = [o.outputs[0].text for o in outputs] + + results = [] + for output in outputs: + results.append(InferenceResult( + first_token_time_ms=None, + total_time_ms=elapsed * 1000, + output_tokens=len(output.outputs[0].token_ids), + input_tokens=len(output.prompt_token_ids), + success=True, + output_text=output.outputs[0].text, + )) + return results + + async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult: + """Stream a single request, measuring TTFT.""" + from vllm.utils import random_uuid + + formatted = self._format_prompt(request.prompt) + request_id = random_uuid() + t_start = time.perf_counter() + first_token_time_ms = None + output_tokens = 0 + output_text = "" + + async for output in self.engine.generate( + formatted, self.sampling_params, request_id + ): + if ( + first_token_time_ms is None + and len(output.outputs[0].token_ids) > 0 + ): + first_token_time_ms = (time.perf_counter() - t_start) * 1000 + output_tokens = len(output.outputs[0].token_ids) + output_text = output.outputs[0].text + + total_time_ms = (time.perf_counter() - t_start) * 1000 + return InferenceResult( + first_token_time_ms=first_token_time_ms, + total_time_ms=total_time_ms, + output_tokens=output_tokens, + input_tokens=0, + success=True, + output_text=output_text, + ) + + async def inference_fn_token_stream(self, request: InferenceRequest): + """ + Async generator yielding decoded text deltas for the serve layer. + + Each yield is the delta text since the last output — new characters + only, not the full accumulated string. 
+ + vLLM's engine.generate() yields cumulative outputs, so we track the + previous text length and slice off only the new portion each step. + """ + from vllm.utils import random_uuid + + formatted = self._format_prompt(request.prompt) + request_id = random_uuid() + prev_length = 0 + + async for output in self.engine.generate( + formatted, self.sampling_params, request_id + ): + current_text = output.outputs[0].text + delta = current_text[prev_length:] + if delta: + yield delta + prev_length = len(current_text) + + def get_peak_memory_gb(self) -> float: + try: + return torch.cuda.max_memory_allocated() / (1024 ** 3) + except Exception: + return None + + def release_resources(self) -> None: + """Release vLLM engines and distributed state.""" + if self.llm is not None: + try: + del self.llm + except Exception: + pass + self.llm = None + + if self.engine is not None: + try: + if self._loop and not self._loop.is_closed(): + self._loop.run_until_complete(self.engine.shutdown()) + except Exception: + pass + try: + del self.engine + except Exception: + pass + self.engine = None + + # Destroy vLLM's distributed state so the next engine initialisation + # creates a fresh TCPStore server. Must call destroy_model_parallel() + # first to clear vLLM's cached group references; only then is it safe + # to destroy the underlying torch process group. Skipping this step + # leaves torch.distributed.is_initialized()==True, which causes + # init_distributed_environment() to skip creating the new TCPStore + # server, so spawned worker processes can never connect (→ 600 s timeout). 
+ try: + from vllm.distributed.parallel_state import cleanup_dist_env_and_memory + cleanup_dist_env_and_memory(shutdown_ray=False) + except Exception: + # Fallback for older vLLM builds that lack cleanup_dist_env_and_memory + try: + from vllm.distributed.parallel_state import ( + destroy_model_parallel, destroy_distributed_environment, + ) + destroy_model_parallel() + destroy_distributed_environment() + except Exception: + pass + + # Final guard: if torch.distributed is still initialized after the cleanup + # attempts above, destroy the default process group here. Without this, + # vLLM's init_distributed_environment() skips TCPStore server creation on + # the next LLM() init, so new worker processes can never join the barrier + # (→ 1800 s Gloo timeout) because the main driver calls barrier() on the + # stale old group while workers wait on a fresh one that never reaches quorum. + try: + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + except Exception: + pass + + def parse_args(self): + """Add vLLM/NVIDIA-specific CLI flags. Base class pre-loads runner config.""" + args = super().parse_args() + cfg = self._runner_config + + # ── Runner-specific CLI flags ───────────────────────────────────────── + # Defined here (not in benchmark_runner) — vLLM/NVIDIA-specific concepts. + import argparse + parser = argparse.ArgumentParser(add_help=False) + parser.add_argument("--tensor-parallel-size", type=int, default=None, + dest="tensor_parallel_size") + parser.add_argument("--pipeline-parallel-size", type=int, default=None, + dest="pipeline_parallel_size") + parser.add_argument("--expert-parallel-size", type=int, default=None, + dest="expert_parallel_size") + parser.add_argument("--enforce-eager", action="store_true", default=False, + dest="enforce_eager") + extra, _ = parser.parse_known_args() + + # Priority: CLI flag > yaml config > required_chips > auto-detected > default 1 + # Fully resolved by base class. 
+ tp_size, _tp_source = self._resolve_tensor_parallel_size( + extra.tensor_parallel_size + ) + + pp_size = (extra.pipeline_parallel_size + if extra.pipeline_parallel_size is not None + else cfg.get("pipeline_parallel_size", 1)) + ep_size = (extra.expert_parallel_size + if extra.expert_parallel_size is not None + else cfg.get("expert_parallel_size", 1)) + # enforce_eager: CLI flag OR yaml setting (either activates it) + self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False) + + print(f" tensor_parallel_size = {tp_size} [{_tp_source}]") + if ep_size > 1: + print(f" expert_parallel_size = {ep_size} [cli/yaml]") + + if not self.SUPPORTS_MULTI_CHIP and tp_size * pp_size > 1: + print(f"Warning: {self.__class__.__name__} does not support multi-chip. " + f"Ignoring tensor_parallel_size={tp_size}, using 1.") + tp_size = 1 + pp_size = 1 + ep_size = 1 + + # Report to base class — used by _compute_run_id(), _build_result_json(), etc. + # Note: for MoE with expert parallelism, chips are shared between TP and EP + # dimensions — ep_size does not add to chip count independently. 
+ self._parallelism = { + "tensor_parallel_size": tp_size, + "pipeline_parallel_size": pp_size, + "expert_parallel_size": ep_size, + "data_parallel_size": 1, + } + self._chip_count = tp_size * pp_size + self._precision = getattr(args, "precision", None) + return args + + def get_extra_subprocess_args(self, args) -> list[str]: + """Forward vLLM/NVIDIA-specific flags to subprocess invocations.""" + extra = [ + "--tensor-parallel-size", + str(self._parallelism.get("tensor_parallel_size", 1)), + ] + if self._parallelism.get("pipeline_parallel_size", 1) > 1: + extra += ["--pipeline-parallel-size", + str(self._parallelism["pipeline_parallel_size"])] + if self._parallelism.get("expert_parallel_size", 1) > 1: + extra += ["--expert-parallel-size", + str(self._parallelism["expert_parallel_size"])] + if self._enforce_eager: + extra += ["--enforce-eager"] + return extra + + +if __name__ == "__main__": + OneCatVLLMRunner().main() \ No newline at end of file From c8d36376413097589c75bfe2ed5fce4110dc30a4 Mon Sep 17 00:00:00 2001 From: Liang Juhao Date: Mon, 18 May 2026 02:59:18 +0000 Subject: [PATCH 2/5] update readme --- runners/nvidia_onecat_vllm_a43d1bcf/README.md | 56 ++++++++++++++++--- 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/README.md b/runners/nvidia_onecat_vllm_a43d1bcf/README.md index eb9f5c0..b358ebd 100644 --- a/runners/nvidia_onecat_vllm_a43d1bcf/README.md +++ b/runners/nvidia_onecat_vllm_a43d1bcf/README.md @@ -55,26 +55,64 @@ To opt into the MTP + prefix-cache profile (Qwen3.6-27B-AWQ), bump | Suite F — Qwen2.5-0.5B edge | Not interesting on V100 — the model fits trivially; use upstream runner. | | Suite G — MoE | Sweet spot — `Qwen3.6-35B-A3B-AWQ`, `Qwen3.5-122B-A10B-AWQ` are exactly the validated MoE models in 1Cat 1.0.0. | -## Prerequisites +## Environment setup + +1Cat-vLLM 1.0.0 ships **prebuilt wheels only** (no PyPI `vllm`). 
Install the +wheels **before** `requirements.txt` — the extras file intentionally omits +`torch` / `vllm` so it does not fight the cu128 index used by the wheels. + +### Validated stack (1Cat-vLLM 1.0.0) + +| Component | Version | +|-----------|---------| +| OS | Ubuntu **24.04** (glibc ≥ 2.38) | +| Python | **3.12** (`cp312` wheels only) | +| CUDA | **12.8** toolkit + matching driver (570.x recommended) | +| PyTorch | **2.9.1+cu128** (pulled in by the wheels) | +| GPU | Tesla V100 / V100S (SM70) | + +Upstream reference: [1Cat-vLLM releases](https://github.com/1CatAI/1Cat-vLLM/releases/tag/v1.0.0) +and [installation guide](https://github.com/1CatAI/1Cat-vLLM#quick-start). + +### Ubuntu 22.04 and other older hosts + +The release wheels are linked against **glibc 2.38**. On Ubuntu 22.04 (glibc +2.35), `pip install` may succeed but `import vllm` fails with +`GLIBC_2.38 not found`. Options: + +- Run on **Ubuntu 24.04** (bare metal or VM), or +- Use a **glibc ≥ 2.38 container** on the host (see the [1Cat-vLLM Docker + notes](https://github.com/1CatAI/1Cat-vLLM#docker-deployment) — build/run + on a machine where the Docker daemon is available; nested dev containers + without `docker.sock` bind-mount usually cannot host Docker), or +- **Build from source** on your host glibc (see 1Cat-vLLM “Source build”). + +### Install steps + +From the AccelMark repo root, in a fresh **Python 3.12** environment: ```bash -# 1. CUDA 12.8 toolkit + matching driver (570.x recommended) +# 1. CUDA 12.8 toolkit + driver # https://developer.nvidia.com/cuda-12-8-0-download-archive -# 2. Python 3.12 (1Cat 1.0.0 ships cp312 wheels only) -conda create -y -n 1cat-vllm-1.0.0 python=3.12 -conda activate 1cat-vllm-1.0.0 +conda create -y -n onecat-vllm python=3.12 +conda activate onecat-vllm +python -m pip install --upgrade pip setuptools wheel -# 3. Install the 1Cat-vLLM wheels -pip install --prefer-binary --no-cache-dir \ +# 2. 
1Cat-vLLM wheels (install BOTH together — do not use PyPI vllm) +python -m pip install --prefer-binary --no-cache-dir \ --extra-index-url https://download.pytorch.org/whl/cu128 \ "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \ "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl" -# 4. Install AccelMark extras -pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt +# 3. AccelMark runner extras only +python -m pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt ``` +Do **not** install `vllm` from PyPI afterward — it will replace the fork. +Run benchmarks from a directory **outside** a cloned 1Cat-vLLM source tree so +Python does not import the local `vllm/` package instead of the wheel. + ## Smoke test the install ```bash From a6726015b8eb4f54bdd883de049bbe5ef8073d4e Mon Sep 17 00:00:00 2001 From: Liang Juhao Date: Mon, 18 May 2026 16:48:41 +0800 Subject: [PATCH 3/5] update onecat runner --- README.md | 2 +- ...r_nvidia_onecat_vllm_4a9ca6c3.yaml.example | 19 ++ ...r_nvidia_onecat_vllm_a43d1bcf.yaml.example | 71 ------ runners/nvidia_onecat_vllm_4a9ca6c3/README.md | 211 ++++++++++++++++++ runners/nvidia_onecat_vllm_4a9ca6c3/meta.json | 21 ++ .../requirements.txt | 17 ++ .../runner.py | 149 +------------ runners/nvidia_onecat_vllm_a43d1bcf/README.md | 188 ---------------- runners/nvidia_onecat_vllm_a43d1bcf/meta.json | 21 -- .../requirements.txt | 58 ----- 10 files changed, 276 insertions(+), 481 deletions(-) create mode 100644 configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example delete mode 100644 configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example create mode 100644 runners/nvidia_onecat_vllm_4a9ca6c3/README.md create mode 100644 runners/nvidia_onecat_vllm_4a9ca6c3/meta.json create mode 100644 runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt rename 
runners/{nvidia_onecat_vllm_a43d1bcf => nvidia_onecat_vllm_4a9ca6c3}/runner.py (60%) delete mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/README.md delete mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/meta.json delete mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt diff --git a/README.md b/README.md index 1c4acbd..7c171ee 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:| | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | -| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_a43d1bcf` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ | +| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_4a9ca6c3` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ | | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — | diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example new file mode 100644 index 0000000..8c6c310 --- /dev/null +++ b/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example @@ -0,0 +1,19 @@ +# AccelMark runner config — nvidia_onecat_vllm_4a9ca6c3 (1Cat-vLLM on V100) +# Copy to runner_nvidia_onecat_vllm_4a9ca6c3.yaml (gitignored). See runner README. + +tensor_parallel_size: 1 +enforce_eager: false +max_num_seqs: 1 +gpu_memory_utilization: 0.88 + +# V100 SM70: required for Suite A-style runs (see runner README). 
+engine_kwargs: + enable_prefix_caching: false + kv_cache_auto_trim_ratio: 0.0 + +suites: + suite_D: + max_num_seqs: 1 + gpu_memory_utilization: 0.85 + suite_C: + max_num_seqs: 1 diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example deleted file mode 100644 index d39949e..0000000 --- a/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example +++ /dev/null @@ -1,71 +0,0 @@ -# AccelMark runner config — nvidia_onecat_vllm_a43d1bcf (1Cat-vLLM on V100) -# -# Copy this file to runner_nvidia_onecat_vllm_a43d1bcf.yaml (remove the -# .example suffix) and adjust for your hardware. The actual .yaml is -# gitignored. -# -# These settings adapt the runner to your hardware environment. They are -# recorded in result.json task.extra_config but are NOT part of the -# benchmark identity (not hashed into run_id). -# -# Merge priority: CLI flags > suite-specific > global defaults > runner defaults - -# ── Global defaults (apply to all suites) ───────────────────────────────────── - -# Tensor parallel size — number of V100 cards (1Cat-vLLM 1.0.0 public ref: -# 4 x V100 32 GB). For 256K context the recommended TP is 4 on 32 GB cards. -tensor_parallel_size: 4 - -# Disable Volta CUDA-graph capture. Set true if you hit Triton sm>=80 errors. -# 1Cat-vLLM normally captures graphs without --enforce-eager — leave false -# unless you've observed startup hangs on the first request. -enforce_eager: false - -# 1Cat 1.0.0 public default is 1 (low-concurrency stable serving). -# Bump to 4 only for the MTP + prefix-cache profile. -max_num_seqs: 1 - -# 1Cat 1.0.0 public default. Reduce to 0.85 if you see OOM at engine init. -gpu_memory_utilization: 0.88 - -# Pass-through kwargs forwarded to vLLM LLM() / AsyncEngineArgs(). 1Cat-vLLM -# recognises `attention_backend` and a few extra knobs (kv-cache-auto-trim-ratio, -# compilation-config, speculative-config). 
Unknown keys are dropped at engine -# init with a warning, so this is safe across vLLM 0.7.x / 1Cat-1.0.0. -# -# Defaults left commented out below — the runner auto-injects -# `attention_backend: FLASH_ATTN_V100` if you don't set it here or via the -# VLLM_ATTENTION_BACKEND environment variable. -# -# engine_kwargs: -# attention_backend: FLASH_ATTN_V100 # auto-injected by the runner -# kv_cache_auto_trim_ratio: 0.0 # disables 1Cat KV auto-trim (MTP profile) -# mamba_cache_mode: align # required for Qwen3.6 hybrid Mamba -# compilation_config: -# cudagraph_mode: full_and_piecewise -# cudagraph_capture_sizes: [1, 2, 4, 8] - -# ── Suite-specific overrides ─────────────────────────────────────────────────── - -suites: - suite_D: - # Long-context — keep max_num_seqs low and reserve more memory. - max_num_seqs: 1 - gpu_memory_utilization: 0.85 - - suite_C: - # Quantization suite — AWQ is the primary 1Cat target. - # max_model_len: 12288 reproduces the 1Cat 1.0.0 internal speed harness. - max_num_seqs: 1 - - # MTP profile for Qwen3.6-27B-AWQ — copy this and uncomment to use. - # suite_A: - # max_num_seqs: 4 - # engine_kwargs: - # enable_prefix_caching: true - # speculative_config: - # method: mtp - # num_speculative_tokens: 4 - # compilation_config: - # cudagraph_mode: full_and_piecewise - # cudagraph_capture_sizes: [1, 2, 4, 8, 9, 18] diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/README.md b/runners/nvidia_onecat_vllm_4a9ca6c3/README.md new file mode 100644 index 0000000..d29b0e1 --- /dev/null +++ b/runners/nvidia_onecat_vllm_4a9ca6c3/README.md @@ -0,0 +1,211 @@ +# nvidia_onecat_vllm_4a9ca6c3 — 1Cat-vLLM Runner (Tesla V100 / SM70) + +AccelMark runner for **Tesla V100 / V100S only**, using +[1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) (community vLLM fork for Volta). + +> **Hardware:** Use this runner only on V100 / V100S (SM70). On Ampere or newer, +> use upstream `nvidia_vllm_*`. 
+ +> **Third-party software:** 1Cat-vLLM is maintained by [1CatAI](https://github.com/1CatAI/1Cat-vLLM) +> under its own license. AccelMark ships only the thin `runner.py` wrapper; install +> 1Cat-vLLM separately as described below. + +## Why 1Cat-vLLM + +| Limitation on stock vLLM + V100 | 1Cat-vLLM | +|--------------------------------|-----------| +| AWQ kernels need SM75+ | SM70 AWQ via lmdeploy TurboMind | +| FlashAttention 2/3 need Ampere+ | `FLASH_ATTN_V100` backend | +| Qwen3.5 / Qwen3.6 on V100 | Fork model/runtime fixes | +| Long-context on Volta | SM70 paged-attention path | + +Release notes: [1Cat-vLLM v1.0.0](https://github.com/1CatAI/1Cat-vLLM/releases/tag/v1.0.0). + +## Runner defaults (code) + +| Setting | Default | +|---------|---------| +| `attention_backend` | `FLASH_ATTN_V100` (auto unless overridden) | +| `SUPPORTED_PRECISIONS` | `fp16`, `fp32` (no BF16 on V100) | +| `SUPPORTED_QUANTIZATION_BACKENDS` | `awq` only | +| `max_num_seqs` | `1` (via runner config) | +| `gpu_memory_utilization` | `0.88` | + +## Supported suites + +| Suite | Notes | +|-------|-------| +| A | Runs on 1× V100; upstream `nvidia_vllm_*` + `--enforce-eager` is often enough | +| B | **Primary** — use `--tensor-parallel-size 4` on 4× V100 32GB | +| C | **Primary** — AWQ | +| D | **Primary** — long context + `FLASH_ATTN_V100` | +| E | Multi-chip scaling (same TP guidance as B) | +| F | Unsupported in `meta.json` (edge model; use upstream runner) | +| G | **Primary** — MoE + AWQ (Qwen3.5/3.6 class models) | + +--- + +## Environment setup + +### Reference stack (1Cat-vLLM 1.0.0) + +| Component | Version | +|-----------|---------| +| GPU | Tesla V100 / V100S (SM70) | +| Python | **3.12** (`cp312` wheels only) | +| CUDA toolkit | **12.8** | +| Driver | 570.x recommended (CUDA 12.8) | +| PyTorch | **2.9.1+cu128** (from 1Cat wheels or build env) | + +### Path A — Prebuilt wheels (Ubuntu 24.04+, glibc ≥ 2.38) + +Official wheels require **glibc 2.38+** (e.g. Ubuntu 24.04).
On Ubuntu 22.04, +`pip install` may succeed but `import vllm` fails with `GLIBC_2.38 not found` +— use Path B instead. + +```bash +conda create -y -n onecat-vllm python=3.12 +conda activate onecat-vllm +python -m pip install --upgrade pip setuptools wheel + +# Install BOTH wheels together — never `pip install vllm` from PyPI +python -m pip install --prefer-binary --no-cache-dir \ + --extra-index-url https://download.pytorch.org/whl/cu128 \ + "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \ + "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl" + +cd /path/to/AccelMark +pip install -r runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt +``` + +### Path B — Build from source (Ubuntu 22.04 / glibc 2.35) + +Build on the **host glibc** so binaries link against 2.35. Typical AutoDL / +Ubuntu 22.04 V100 boxes use this path. + +**Prerequisites:** CUDA 12.8 toolkit (`nvcc` on PATH), conda Python 3.12, ~20GB +free disk for build tree + wheels. 
+ +```bash +conda create -y -n onecat-vllm python=3.12 +conda activate onecat-vllm +export CUDA_HOME=/usr/local/cuda-12.8 +export PATH="$CUDA_HOME/bin:$PATH" +export TORCH_CUDA_ARCH_LIST="7.0" +export MAX_JOBS=6 +export PIP_CACHE_DIR=/path/to/fast/disk/pip-cache # optional + +git clone --depth 1 --branch v1.0.0 https://github.com/1CatAI/1Cat-vLLM.git +cd 1Cat-vLLM +pip install -r requirements/build.txt -r requirements/cuda.txt -r requirements/common.txt +pip install cmake build ninja + +DIST=/path/to/dist-cu128-sm70-v1.0.0 +mkdir -p "$DIST" + +# 1) flash_attn_v100 wheel +pushd flash-attention-v100 +python -m build --wheel --no-isolation --outdir "$DIST" +popd + +# 2) vllm wheel (30–90 min on V100 host) +export VLLM_TARGET_DEVICE=cuda +python -m build --wheel --no-isolation --outdir "$DIST" + +# 3) Install — run from /tmp so Python does not import the source tree +pip install "$DIST"/flash_attn_v100-*.whl +cd /tmp && pip install --no-deps --force-reinstall "$DIST"/vllm-*.whl + +cd /path/to/AccelMark +pip install -r runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt +``` + +Do **not** run AccelMark from inside the cloned `1Cat-vLLM/` directory; Python +may import the local `vllm/` package instead of the installed wheel. 
+ +### Smoke test + +Run from `/tmp` or the AccelMark repo root (not inside `1Cat-vLLM/`): + +```bash +python - <<'PY' +import torch, vllm +print("torch:", torch.__version__, "vllm:", vllm.__version__) +import flash_attn_v100_cuda +print("flash_attn_v100: ok") +from vllm import LLM +print("LLM import: ok") +PY +``` + +--- + +## AccelMark runner config (required on V100) + +Copy and edit: + +```bash +cp configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example \ + configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml +``` + +**Single V100 32GB** — recommended `engine_kwargs` (avoids prefix prefill shared-memory +crash on SM70: `Shared memory exceeds 96KB`): + +```yaml +tensor_parallel_size: 1 +max_num_seqs: 1 +gpu_memory_utilization: 0.88 +engine_kwargs: + enable_prefix_caching: false + kv_cache_auto_trim_ratio: 0.0 +``` + +**4× V100 32GB** — set `tensor_parallel_size: 4`; keep the same `engine_kwargs` +unless you are deliberately testing 1Cat's MTP / prefix-cache profile (not +covered by the example file; see the upstream 1Cat-vLLM documentation). + +Other tuning: + +| Symptom | Try | +|---------|-----| +| First request hangs (CUDA graph) | `enforce_eager: true` or `--enforce-eager` | +| OOM at engine init | Lower `gpu_memory_utilization` (e.g. `0.85`) | +| `GLIBC_2.38 not found` | Path B source build, or Ubuntu 24.04+ | + +--- + +## Basic usage + +```bash +cp configs/submitter.yaml.example configs/submitter.yaml # once +cp configs/models_local.yaml.example configs/models_local.yaml # map local model paths + +export PYTHONPATH=/path/to/AccelMark # if pip install -e .
is unavailable + +# Suite A smoke (1× V100) +python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \ + --suite suite_A --scenario accuracy --tensor-parallel-size 1 + +# Suite B (4× V100) +python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \ + --suite suite_B --tensor-parallel-size 4 +``` + +--- + +## Known limitations + +- Prefix caching + chunked paged prefill can exceed V100's 96KB shared memory per SM; + disable `enable_prefix_caching` (see config above). +- `max_num_seqs: 1` limits batch throughput vs upstream vLLM defaults — intentional + for 1Cat's long-context V100 profile. +- Suite F is marked unsupported in `meta.json` (use upstream runner on V100 if needed). +- End-to-end validation on 4× V100 reference hardware is still community-pending in + `meta.json`; single-GPU smoke (Suite A accuracy) has been exercised on V100 32GB. + +## Requirements + +See `requirements.txt`. Install `torch`, `flash_attn_v100`, and the `vllm` fork +from 1Cat-vLLM **before** the AccelMark extras file. Do not install upstream +`vllm` from PyPI after the fork. diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json b/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json new file mode 100644 index 0000000..b3d136e --- /dev/null +++ b/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json @@ -0,0 +1,21 @@ +{ + "id": "nvidia_onecat_vllm_4a9ca6c3", + "platform": "nvidia", + "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA", + "framework": "1Cat-vLLM", + "submitted_by": "JuhaoLiang1997", + "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — community vLLM fork with FLASH_ATTN_V100 and SM70 AWQ kernels. Use nvidia_vllm_* on Ampere or newer.", + "supersedes_chain": ["nvidia_onecat_vllm_a43d1bcf"], + "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless overridden. V100: disable prefix caching in runner config (see README). 
External dependency: https://github.com/1CatAI/1Cat-vLLM", + "created": "2026-05-15", + "hardware_label": "NVIDIA V100 (SM70)", + "suite_support": { + "A": "pending", + "B": "pending", + "C": "pending", + "D": "pending", + "E": "pending", + "F": "unsupported", + "G": "pending" + } +} diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt b/runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt new file mode 100644 index 0000000..b6d4c62 --- /dev/null +++ b/runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt @@ -0,0 +1,17 @@ +# AccelMark extras for nvidia_onecat_vllm_4a9ca6c3. +# Install 1Cat-vLLM (flash_attn_v100 + vllm fork) first — see README.md. + +transformers==4.57.6 +tokenizers==0.22.2 +huggingface-hub==0.35.0 +accelerate==1.10.1 +safetensors==0.6.2 + +numpy==1.26.4 +jsonschema==4.25.1 +psutil==7.1.0 +tqdm==4.67.1 + +nvidia-ml-py==13.580.82 +aiohttp==3.12.15 +PyYAML==6.0.2 diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/runner.py b/runners/nvidia_onecat_vllm_4a9ca6c3/runner.py similarity index 60% rename from runners/nvidia_onecat_vllm_a43d1bcf/runner.py rename to runners/nvidia_onecat_vllm_4a9ca6c3/runner.py index 56002bb..861b352 100644 --- a/runners/nvidia_onecat_vllm_a43d1bcf/runner.py +++ b/runners/nvidia_onecat_vllm_4a9ca6c3/runner.py @@ -1,31 +1,8 @@ """ AccelMark — NVIDIA 1Cat-vLLM (SM70 / V100) benchmark script. -Implements BenchmarkRunner for `1Cat-vLLM`, the community fork of vLLM that -re-enables modern model serving on Tesla V100 / SM70 hardware. 
The fork -preserves the standard vLLM Python API (``LLM``, ``AsyncLLMEngine``, -``SamplingParams``) but ships several SM70-specific pieces: - - - the ``FLASH_ATTN_V100`` attention backend (FlashAttention re-implemented - for Volta) — set as this runner's default - - AWQ 4-bit kernels patched in from lmdeploy TurboMind for SM70 - - validated paths for Qwen3.5 / Qwen3.6 dense + MoE on 4 x V100 32 GB - - FP8 KV cache (``fp8_e5m2``) as an experimental option - -Because the Python entry points are identical to upstream vLLM, this runner -is structurally a clone of ``nvidia_vllm_*`` with three runtime overrides: - - 1. ``SUPPORTED_PRECISIONS`` drops BF16 (V100 cannot do BF16 in hardware). - 2. ``SUPPORTED_QUANTIZATION_BACKENDS`` advertises only ``awq`` — the fork's - headline feature; other quantizations are unproven on this stack. - 3. ``load_model()`` auto-injects ``attention_backend="FLASH_ATTN_V100"`` - into the engine kwargs unless the user has explicitly set one. - -Reference build: - Python 3.12 · CUDA 12.8 · torch 2.9.1+cu128 · 1Cat-vLLM 1.0.0 wheels - Validated on 4 x Tesla PG503 / V100 32 GB. - -All orchestration logic still lives in runners/benchmark_runner.py. +Thin vLLM runner wrapper for the 1Cat-vLLM fork on Tesla V100 / V100S. +See README.md in this folder for install, hardware scope, and tuning. """ import asyncio @@ -34,7 +11,6 @@ from pathlib import Path from typing import Optional -# Add repo root to path _REPO_ROOT = Path(__file__).resolve().parent.parent.parent sys.path.insert(0, str(_REPO_ROOT)) @@ -47,37 +23,20 @@ from loadgen.types import InferenceResult - -# Suppress per-request vLLM logs by default import logging logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING) logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING) class OneCatVLLMRunner(BenchmarkRunner): - """ - AccelMark benchmark runner using 1Cat-vLLM (SM70 / V100 fork) on NVIDIA. 
- - This runner is intended **specifically for Tesla V100 / V100S (SM70)**. - Other NVIDIA GPUs should use the upstream ``nvidia_vllm_*`` runner — - 1Cat-vLLM's kernels are tuned for Volta and provide no advantage on - Ampere or newer hardware. - """ + """1Cat-vLLM on NVIDIA V100 / V100S (SM70). Use nvidia_vllm_* on newer GPUs.""" SUPPORTS_STREAMING = True SUPPORTS_BATCHING = True SUPPORTS_ONLINE = True SUPPORTS_MULTI_CHIP = True - # V100 has no BF16 datapath — drop BF16 entirely so the suite picks - # FP16 as the effective precision without an amber warning. FP32 is left - # in for completeness but is essentially never used in inference. SUPPORTED_PRECISIONS = ["fp16", "fp32"] - - # 1Cat-vLLM's flagship contribution is AWQ 4-bit on SM70 (via lmdeploy - # TurboMind kernels). Other quantization backends are not validated on - # this stack — keep the surface conservative and let users opt-in by - # subclassing if they want to try FP8 KV cache (``fp8_e5m2`` only). SUPPORTED_QUANTIZATION_BACKENDS = ["awq"] def __init__(self): @@ -88,7 +47,6 @@ def __init__(self): self._loop: asyncio.AbstractEventLoop = None def _get_chip_count(self) -> int: - """Return the number of available CUDA GPUs.""" try: import torch n = torch.cuda.device_count() @@ -100,12 +58,6 @@ def _get_framework_name(self) -> str: return "1Cat-vLLM" def _get_framework_version(self) -> str: - """Report vllm.__version__ plus the flash_attn_v100 wheel version. - - 1Cat-vLLM ships two coupled wheels (`vllm` patched fork + `flash_attn_v100`) - and the FA-V100 wheel is the bit that actually changes attention - performance on Volta. Recording both makes the result reproducible. 
- """ core = "unknown" try: import vllm @@ -129,7 +81,6 @@ def _get_framework_version(self) -> str: return core def load_model(self, model_path: str, parallelism: dict) -> None: - """Load model — sync LLM for offline/accuracy, async engine for streaming.""" tp_size = parallelism["tensor_parallel_size"] pp_size = parallelism["pipeline_parallel_size"] ep_size = parallelism.get("expert_parallel_size", 1) @@ -141,22 +92,10 @@ def load_model(self, model_path: str, parallelism: dict) -> None: enforce_eager = getattr(self, "_enforce_eager", False) cfg = getattr(self, "_runner_config", {}) - # 1Cat-vLLM public defaults for 4 x V100 32 GB: - # max_num_seqs = 1 (baseline) or 4 (MTP profile) - # gpu_memory_utilization = 0.88 - # These differ noticeably from upstream vLLM's 512 / 0.90 defaults - # because 256K context on V100 demands a much tighter KV budget. max_num_seqs = cfg.get("max_num_seqs", 1) gpu_memory_util = cfg.get("gpu_memory_utilization", 0.88) extra_kwargs = dict(cfg.get("engine_kwargs") or {}) - # ── Default to FLASH_ATTN_V100 attention backend ────────────────────── - # 1Cat-vLLM's public recommendation for V100 is FLASH_ATTN_V100. Inject - # it as the default unless the user explicitly set a backend in their - # runner config engine_kwargs, or via the VLLM_ATTENTION_BACKEND - # environment variable. We support both spellings because vLLM accepts - # the kwarg as `attention_backend` and the env var as - # VLLM_ATTENTION_BACKEND. import os if ( "attention_backend" not in extra_kwargs @@ -164,10 +103,6 @@ def load_model(self, model_path: str, parallelism: dict) -> None: ): extra_kwargs["attention_backend"] = "FLASH_ATTN_V100" - # ── Filter engine_kwargs to only fields this vLLM version accepts ───── - # Avoids TypeError when the runner config YAML references a field that - # doesn't exist in the installed vLLM version (EngineArgs is a strict - # dataclass — unknown keyword arguments raise TypeError immediately). 
try: import dataclasses from vllm.engine.arg_utils import EngineArgs as _EngineArgs @@ -178,43 +113,29 @@ def load_model(self, model_path: str, parallelism: dict) -> None: f"1Cat-vLLM version and will be ignored: {list(_dropped)}") extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid} except Exception: - pass # If introspection fails, pass kwargs as-is and let vLLM report the error + pass - # Use precision resolved by BenchmarkRunner._resolve_precision() effective_precision = getattr(self, "_effective_precision", "BF16").upper() precision = getattr(self, "_precision", None) or effective_precision - # dtype_override and quantization may be injected by benchmark_runner from - # precision_model_map entry fields (dtype_override, engine_kwargs.quantization). - # These take priority over the runner's own precision→dtype mapping below. _dtype_override = getattr(self, "_precision_dtype_override", None) _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {}) quantization = _prec_eng_kwargs.pop("quantization", None) - # Map native precision names to explicit dtypes. - # Quantized formats (anything not in this map) use dtype="auto" — vLLM reads - # the storage dtype from the checkpoint's config.json, and the quantization - # kernel is set explicitly via the `quantization` kwarg already populated above - # from precision_model_map engine_kwargs. No fallback guessing needed here. _NATIVE_DTYPE_MAP = { "BF16": "bfloat16", "FP16": "float16", "FP32": "float32", } dtype = _NATIVE_DTYPE_MAP.get(precision, "auto") - self._quantization_method = quantization # None for native, explicit str for quantized + self._quantization_method = quantization - # dtype_override from precision_model_map wins over the mapping above. - # Used for e.g. FP16 baseline on pre-Ampere hardware (V100/T4). 
if _dtype_override: dtype = _dtype_override - # Merge remaining precision_engine_kwargs (after popping quantization) into - # extra_kwargs so they reach LLM() / AsyncEngineArgs. Runner YAML engine_kwargs - # still take final precedence via the **extra_kwargs spread at the end. if _prec_eng_kwargs: - _prec_eng_kwargs.update(extra_kwargs) # runner YAML wins on conflict + _prec_eng_kwargs.update(extra_kwargs) extra_kwargs = _prec_eng_kwargs print(f"Loading model: precision={precision}, dtype={dtype}" @@ -259,8 +180,6 @@ def load_model(self, model_path: str, parallelism: dict) -> None: trust_remote_code=False, enforce_eager=enforce_eager, gpu_memory_utilization=gpu_memory_util, - # engine_kwargs values override named fields above if the same key appears in both. - # This is intentional — engine_kwargs is the power-user escape hatch. **extra_kwargs, ) if ep_size > 1: @@ -271,42 +190,23 @@ def load_model(self, model_path: str, parallelism: dict) -> None: self.engine = AsyncLLMEngine.from_engine_args(engine_args) def get_effective_dtype(self) -> Optional[str]: - """ - Report the actual compute dtype vLLM used after model loading. - - vLLM exposes the resolved dtype via model_config after initialization. - This captures cases like FP8 weights on A100 computing in BF16. - """ try: if self.llm is not None: - # Sync LLM path dtype = self.llm.llm_engine.model_config.dtype return str(dtype).replace("torch.", "") elif self.engine is not None: - # Async engine path dtype = self.engine.engine.model_config.dtype return str(dtype).replace("torch.", "") except Exception: pass - # Fall back to declared dtype if introspection fails return getattr(self, "_effective_dtype", None) def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]: - """Send all requests to vLLM at once. vLLM handles internal batching. 
- - total_time_ms in each returned InferenceResult is set to the wall-clock - elapsed time of the entire batch — NOT an individual per-request latency. - vLLM's sync LLM.generate() blocks until all requests finish, so there is - no per-request completion timestamp available. All results share the same - total_time_ms value, which is the correct denominator for throughput: - throughput = total_tokens / (elapsed_ms / 1000) - """ formatted = [self._format_prompt(r.prompt) for r in requests] t_start = time.perf_counter() outputs = self.llm.generate(formatted, self.sampling_params) elapsed = time.perf_counter() - t_start - # Store output text for _run_accuracy_integrated() self._last_accuracy_outputs = [o.outputs[0].text for o in outputs] results = [] @@ -322,7 +222,6 @@ def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[Inferen return results async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult: - """Stream a single request, measuring TTFT.""" from vllm.utils import random_uuid formatted = self._format_prompt(request.prompt) @@ -354,15 +253,6 @@ async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceRe ) async def inference_fn_token_stream(self, request: InferenceRequest): - """ - Async generator yielding decoded text deltas for the serve layer. - - Each yield is the delta text since the last output — new characters - only, not the full accumulated string. - - vLLM's engine.generate() yields cumulative outputs, so we track the - previous text length and slice off only the new portion each step. 
- """ from vllm.utils import random_uuid formatted = self._format_prompt(request.prompt) @@ -385,7 +275,6 @@ def get_peak_memory_gb(self) -> float: return None def release_resources(self) -> None: - """Release vLLM engines and distributed state.""" if self.llm is not None: try: del self.llm @@ -405,18 +294,10 @@ def release_resources(self) -> None: pass self.engine = None - # Destroy vLLM's distributed state so the next engine initialisation - # creates a fresh TCPStore server. Must call destroy_model_parallel() - # first to clear vLLM's cached group references; only then is it safe - # to destroy the underlying torch process group. Skipping this step - # leaves torch.distributed.is_initialized()==True, which causes - # init_distributed_environment() to skip creating the new TCPStore - # server, so spawned worker processes can never connect (→ 600 s timeout). try: from vllm.distributed.parallel_state import cleanup_dist_env_and_memory cleanup_dist_env_and_memory(shutdown_ray=False) except Exception: - # Fallback for older vLLM builds that lack cleanup_dist_env_and_memory try: from vllm.distributed.parallel_state import ( destroy_model_parallel, destroy_distributed_environment, @@ -426,12 +307,6 @@ def release_resources(self) -> None: except Exception: pass - # Final guard: if torch.distributed is still initialized after the cleanup - # attempts above, destroy the default process group here. Without this, - # vLLM's init_distributed_environment() skips TCPStore server creation on - # the next LLM() init, so new worker processes can never join the barrier - # (→ 1800 s Gloo timeout) because the main driver calls barrier() on the - # stale old group while workers wait on a fresh one that never reaches quorum. try: if torch.distributed.is_initialized(): torch.distributed.destroy_process_group() @@ -439,12 +314,9 @@ def release_resources(self) -> None: pass def parse_args(self): - """Add vLLM/NVIDIA-specific CLI flags. 
Base class pre-loads runner config.""" args = super().parse_args() cfg = self._runner_config - # ── Runner-specific CLI flags ───────────────────────────────────────── - # Defined here (not in benchmark_runner) — vLLM/NVIDIA-specific concepts. import argparse parser = argparse.ArgumentParser(add_help=False) parser.add_argument("--tensor-parallel-size", type=int, default=None, @@ -457,8 +329,6 @@ def parse_args(self): dest="enforce_eager") extra, _ = parser.parse_known_args() - # Priority: CLI flag > yaml config > required_chips > auto-detected > default 1 - # Fully resolved by base class. tp_size, _tp_source = self._resolve_tensor_parallel_size( extra.tensor_parallel_size ) @@ -469,7 +339,6 @@ def parse_args(self): ep_size = (extra.expert_parallel_size if extra.expert_parallel_size is not None else cfg.get("expert_parallel_size", 1)) - # enforce_eager: CLI flag OR yaml setting (either activates it) self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False) print(f" tensor_parallel_size = {tp_size} [{_tp_source}]") @@ -483,9 +352,6 @@ def parse_args(self): pp_size = 1 ep_size = 1 - # Report to base class — used by _compute_run_id(), _build_result_json(), etc. - # Note: for MoE with expert parallelism, chips are shared between TP and EP - # dimensions — ep_size does not add to chip count independently. 
self._parallelism = { "tensor_parallel_size": tp_size, "pipeline_parallel_size": pp_size, @@ -497,7 +363,6 @@ def parse_args(self): return args def get_extra_subprocess_args(self, args) -> list[str]: - """Forward vLLM/NVIDIA-specific flags to subprocess invocations.""" extra = [ "--tensor-parallel-size", str(self._parallelism.get("tensor_parallel_size", 1)), @@ -514,4 +379,4 @@ def get_extra_subprocess_args(self, args) -> list[str]: if __name__ == "__main__": - OneCatVLLMRunner().main() \ No newline at end of file + OneCatVLLMRunner().main() diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/README.md b/runners/nvidia_onecat_vllm_a43d1bcf/README.md deleted file mode 100644 index b358ebd..0000000 --- a/runners/nvidia_onecat_vllm_a43d1bcf/README.md +++ /dev/null @@ -1,188 +0,0 @@ -# nvidia_onecat_vllm_a43d1bcf — 1Cat-vLLM Runner (Tesla V100 / SM70) - -AccelMark runner for **Tesla V100 / V100S only**, using -[1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) — the community vLLM fork -that re-enables modern AWQ 4-bit serving and FlashAttention on Volta GPUs -(SM70). - -> **Hardware scope:** This runner is intentionally narrow. On Ampere -> (A100/A800/A10/L4/4090/etc.) or newer, use the upstream -> `nvidia_vllm_*` runner — 1Cat-vLLM's kernels are tuned for SM70 and -> provide no benefit on later architectures. - -> **Status:** Committed without an end-to-end validation run yet. The runner -> code is a thin specialisation of the upstream NVIDIA vLLM runner (only -> capability flags + attention-backend default differ), so existing test -> coverage of the parent runner applies. Plan to add a reference -> `Tesla V100-SXM2-32GBx4 suite_B` result once a target box is available. 
- -## Why 1Cat-vLLM exists - -| Pain on stock vLLM + V100 | 1Cat-vLLM's fix | -|---|---| -| AWQ kernels require SM75+ | Integrated lmdeploy TurboMind WMMA kernels for SM70 | -| FlashAttention 2/3 require Ampere+ | Custom `FLASH_ATTN_V100` Volta backend | -| Qwen3.5 / Qwen3.6 dense + MoE not loadable | Model configs and runtime fixes shipped in fork | -| Long-context paged-prefill stability | SM70-specific MLA/GDN runtime fixes | -| FP8 KV cache | `fp8_e5m2` (experimental) on V100 FA path | - -For full release notes see - RELEASE_NOTES_1.0.0.md. - -## Defaults this runner injects - -| Knob | Default | Where set | Why | -|---|---|---|---| -| `attention_backend` | `FLASH_ATTN_V100` | `load_model()` if not already specified | 1Cat-vLLM's recommended V100 path | -| `SUPPORTED_PRECISIONS` | `["fp16", "fp32"]` | class attribute | V100 has no BF16 | -| `SUPPORTED_QUANTIZATION_BACKENDS` | `["awq"]` | class attribute | 1Cat's headline kernel; other formats not validated on this stack | -| `max_num_seqs` | `1` | runner config default | 1Cat 1.0.0 public default — 256K context on V100 | -| `gpu_memory_utilization` | `0.88` | runner config default | 1Cat 1.0.0 public default | - -To opt into the MTP + prefix-cache profile (Qwen3.6-27B-AWQ), bump -`max_num_seqs` to `4` and pass `speculative_config` via the runner config -`engine_kwargs` — see the example config file. - -## Supported suites - -| Suite | Recommendation | -|-------|---------------| -| Suite A — Llama-3-8B 1× | Runs, but vanilla `nvidia_vllm_47f5d58e --enforce-eager` already covers this. Use 1Cat only if you want the FA-V100 attention path. | -| Suite B — Llama-3-70B multi-chip | **Primary target.** Recommended `--tensor-parallel-size 4`. | -| Suite C — Quantization | Restricted to AWQ — this is where 1Cat shines. | -| Suite D — Long context (~28K) | **Primary target.** `FLASH_ATTN_V100` is the only V100-friendly long-context path. 
| -| Suite E — Scaling | Same considerations as Suite B; useful for measuring how 1Cat's MCCL-equivalent scales. | -| Suite F — Qwen2.5-0.5B edge | Not interesting on V100 — the model fits trivially; use upstream runner. | -| Suite G — MoE | Sweet spot — `Qwen3.6-35B-A3B-AWQ`, `Qwen3.5-122B-A10B-AWQ` are exactly the validated MoE models in 1Cat 1.0.0. | - -## Environment setup - -1Cat-vLLM 1.0.0 ships **prebuilt wheels only** (no PyPI `vllm`). Install the -wheels **before** `requirements.txt` — the extras file intentionally omits -`torch` / `vllm` so it does not fight the cu128 index used by the wheels. - -### Validated stack (1Cat-vLLM 1.0.0) - -| Component | Version | -|-----------|---------| -| OS | Ubuntu **24.04** (glibc ≥ 2.38) | -| Python | **3.12** (`cp312` wheels only) | -| CUDA | **12.8** toolkit + matching driver (570.x recommended) | -| PyTorch | **2.9.1+cu128** (pulled in by the wheels) | -| GPU | Tesla V100 / V100S (SM70) | - -Upstream reference: [1Cat-vLLM releases](https://github.com/1CatAI/1Cat-vLLM/releases/tag/v1.0.0) -and [installation guide](https://github.com/1CatAI/1Cat-vLLM#quick-start). - -### Ubuntu 22.04 and other older hosts - -The release wheels are linked against **glibc 2.38**. On Ubuntu 22.04 (glibc -2.35), `pip install` may succeed but `import vllm` fails with -`GLIBC_2.38 not found`. Options: - -- Run on **Ubuntu 24.04** (bare metal or VM), or -- Use a **glibc ≥ 2.38 container** on the host (see the [1Cat-vLLM Docker - notes](https://github.com/1CatAI/1Cat-vLLM#docker-deployment) — build/run - on a machine where the Docker daemon is available; nested dev containers - without `docker.sock` bind-mount usually cannot host Docker), or -- **Build from source** on your host glibc (see 1Cat-vLLM “Source build”). - -### Install steps - -From the AccelMark repo root, in a fresh **Python 3.12** environment: - -```bash -# 1. 
CUDA 12.8 toolkit + driver -# https://developer.nvidia.com/cuda-12-8-0-download-archive - -conda create -y -n onecat-vllm python=3.12 -conda activate onecat-vllm -python -m pip install --upgrade pip setuptools wheel - -# 2. 1Cat-vLLM wheels (install BOTH together — do not use PyPI vllm) -python -m pip install --prefer-binary --no-cache-dir \ - --extra-index-url https://download.pytorch.org/whl/cu128 \ - "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \ - "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl" - -# 3. AccelMark runner extras only -python -m pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt -``` - -Do **not** install `vllm` from PyPI afterward — it will replace the fork. -Run benchmarks from a directory **outside** a cloned 1Cat-vLLM source tree so -Python does not import the local `vllm/` package instead of the wheel. - -## Smoke test the install - -```bash -python - <<'PY' -import torch, vllm -print("torch:", torch.__version__, " vllm:", vllm.__version__) -try: - import flash_attn_v100_cuda # SM70 FA kernels - print("flash_attn_v100: ok") -except Exception as e: - print("flash_attn_v100: MISSING ->", e) -PY -``` - -`flash_attn_v100` MUST be importable — if it isn't, you accidentally -installed plain vLLM from PyPI; reinstall from the 1Cat release wheels above. 
- -## Basic usage - -```bash -# Suite D (long-context) on 4 x V100 32 GB -python run.py --runner nvidia_onecat_vllm_a43d1bcf \ - --suite suite_D \ - --tensor-parallel-size 4 - -# Suite C with AWQ (Qwen3.5-27B-AWQ as the validation model) -python run.py --runner nvidia_onecat_vllm_a43d1bcf \ - --suite suite_C \ - --tensor-parallel-size 4 \ - --model-path /data/models/Qwen3.5-27B-AWQ - -# Override attention backend (rare — for benchmarking vs Triton fallback) -python run.py --runner nvidia_onecat_vllm_a43d1bcf \ - --suite suite_B \ - --tensor-parallel-size 4 \ - # Then set attention_backend in your runner config engine_kwargs. -``` - -## Runner config - -Copy the example: - -```bash -cp configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example \ - configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml -``` - -Key defaults differ from the upstream NVIDIA runner: - -| Field | 1Cat default | Upstream default | Notes | -|-------|--------------|------------------|-------| -| `max_num_seqs` | 1 | 512 | 256K context demands very tight KV cache budget | -| `gpu_memory_utilization` | 0.88 | 0.90 | Matches 1Cat 1.0.0 public reference | -| `engine_kwargs.attention_backend` | `FLASH_ATTN_V100` (auto) | — | Auto-injected unless overridden | - -## Known gaps (pre-smoke-test) - -- The Volta CUDA-graph capture path needs validation under - `--scenario sustained`. If startup hangs on the first request, set - `enforce_eager: true` in your runner config. -- The accuracy gate uses the suite's stock prompts — on AWQ checkpoints - the gate threshold may be too tight; the suite spec already allows - per-format thresholds (Suite C) so this is mostly relevant on Suite A/D. -- MTP / speculative profiles are documented in 1Cat 1.0.0 but not - exercised here yet; flat speculative keys in `_precision_engine_kwargs` - are still forwarded as `speculative_config` by `benchmark_runner.py`, - the same as for the upstream runner. - -## Requirements - -See `requirements.txt`. 
The heavy dependencies (`torch`, `flash_attn_v100`, -`vllm` fork) MUST come from the 1Cat-vLLM release wheels — do not install -upstream `vllm` from PyPI. diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/meta.json b/runners/nvidia_onecat_vllm_a43d1bcf/meta.json deleted file mode 100644 index b86b000..0000000 --- a/runners/nvidia_onecat_vllm_a43d1bcf/meta.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "id": "nvidia_onecat_vllm_a43d1bcf", - "platform": "nvidia", - "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA", - "framework": "1Cat-vLLM", - "submitted_by": "JuhaoLiang1997", - "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — the community vLLM fork that re-enables AWQ 4-bit inference on Volta via lmdeploy TurboMind kernels and the FLASH_ATTN_V100 attention backend. Targets Qwen3.5 / Qwen3.6 dense + MoE on 4 x V100 32 GB. Use the upstream nvidia_vllm runner on Ampere or newer.", - "supersedes_chain": [], - "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless the user overrides it. SUPPORTED_PRECISIONS drops BF16 (V100 has no native BF16 datapath). SUPPORTED_QUANTIZATION_BACKENDS lists only AWQ — the fork's headline contribution; FP8 KV cache and other formats are intentionally not exposed by default. Initial commit, not yet validated end-to-end on hardware.", - "created": "2026-05-15", - "hardware_label": "NVIDIA V100 (SM70)", - "suite_support": { - "A": "pending", - "B": "pending", - "C": "pending", - "D": "pending", - "E": "pending", - "F": "unsupported", - "G": "pending" - } -} diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt b/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt deleted file mode 100644 index 01e687d..0000000 --- a/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt +++ /dev/null @@ -1,58 +0,0 @@ -# AccelMark -- 1Cat-vLLM (SM70 / V100) runner dependencies -# -# 1Cat-vLLM is a community fork of vLLM tuned for Tesla V100. 
It ships two -# wheels that must be installed together: -# - flash_attn_v100 (Volta-optimised FlashAttention kernels) -# - vllm (patched fork, exposes the FLASH_ATTN_V100 backend -# and AWQ 4-bit kernels for SM70) -# -# Both wheels are published as GitHub release assets at: -# https://github.com/1CatAI/1Cat-vLLM/releases -# -# Reference validated stack (1Cat-vLLM 1.0.0): -# OS: Ubuntu 24.04 -# Python: 3.12 -# CUDA: 12.8 -# PyTorch: 2.9.1+cu128 -# Driver: 570.211.01 -# GPU: 4 x Tesla V100 32 GB -# -# Installation: -# # 1. Install CUDA 12.8 toolkit and matching driver -# # 2. Create a fresh Python 3.12 environment -# # 3. Install the two 1Cat-vLLM wheels from the release page: -# pip install --prefer-binary --no-cache-dir \ -# --extra-index-url https://download.pytorch.org/whl/cu128 \ -# "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \ -# "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl" -# -# # 4. Then install the AccelMark extras below: -# pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt -# -# Note: do NOT add `torch==2.x` here — the matching torch wheel is pulled in -# by the 1Cat-vLLM wheel install command above. Listing torch here would -# fight with the cu128 extra-index-url. - -# Transformers stack — compatible with 1Cat-vLLM 1.0.0 (vllm fork 1.0.0, -# based on upstream vLLM 0.7.x line). Versions match the upstream -# nvidia_vllm_47f5d58e runner so we know they're consistent. 
-transformers==4.57.6 -tokenizers==0.22.2 -huggingface-hub==0.35.0 -accelerate==1.10.1 -safetensors==0.6.2 - -# AccelMark dependencies -numpy==1.26.4 -jsonschema==4.25.1 -psutil==7.1.0 -tqdm==4.67.1 - -# NVIDIA monitoring (for power and GPU stats — same as upstream NVIDIA runner) -nvidia-ml-py==13.580.82 - -# Async support -aiohttp==3.12.15 - -# Config file parsing -PyYAML==6.0.2 From d78b58eababe4182b25ae703c579d4c48a620a29 Mon Sep 17 00:00:00 2001 From: Liang Juhao Date: Mon, 18 May 2026 17:57:24 +0800 Subject: [PATCH 4/5] fix onecat --- README.md | 2 +- ..._nvidia_onecat_vllm_12a253c2.yaml.example} | 10 ++-- .../README.md | 46 +++++++++++++------ .../meta.json | 6 +-- .../requirements.txt | 0 .../runner.py | 4 +- 6 files changed, 42 insertions(+), 26 deletions(-) rename configs/runner_configs/{runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example => runner_nvidia_onecat_vllm_12a253c2.yaml.example} (52%) rename runners/{nvidia_onecat_vllm_4a9ca6c3 => nvidia_onecat_vllm_12a253c2}/README.md (81%) rename runners/{nvidia_onecat_vllm_4a9ca6c3 => nvidia_onecat_vllm_12a253c2}/meta.json (69%) rename runners/{nvidia_onecat_vllm_4a9ca6c3 => nvidia_onecat_vllm_12a253c2}/requirements.txt (100%) rename runners/{nvidia_onecat_vllm_4a9ca6c3 => nvidia_onecat_vllm_12a253c2}/runner.py (99%) diff --git a/README.md b/README.md index 7c171ee..922c479 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). 
The t |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:| | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | -| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_4a9ca6c3` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ | +| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_12a253c2` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ | | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — | | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — | diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example similarity index 52% rename from configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example rename to configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example index 8c6c310..6644d79 100644 --- a/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example +++ b/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example @@ -1,14 +1,14 @@ -# AccelMark runner config — nvidia_onecat_vllm_4a9ca6c3 (1Cat-vLLM on V100) -# Copy to runner_nvidia_onecat_vllm_4a9ca6c3.yaml (gitignored). See runner README. +# AccelMark runner config — nvidia_onecat_vllm_12a253c2 (1Cat-vLLM on V100) +# Copy to runner_nvidia_onecat_vllm_12a253c2.yaml (gitignored). See runner README. tensor_parallel_size: 1 enforce_eager: false -max_num_seqs: 1 -gpu_memory_utilization: 0.88 +max_num_seqs: 512 +gpu_memory_utilization: 0.90 -# V100 SM70: required for Suite A-style runs (see runner README). 
engine_kwargs: enable_prefix_caching: false + enable_chunked_prefill: false kv_cache_auto_trim_ratio: 0.0 suites: diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/README.md b/runners/nvidia_onecat_vllm_12a253c2/README.md similarity index 81% rename from runners/nvidia_onecat_vllm_4a9ca6c3/README.md rename to runners/nvidia_onecat_vllm_12a253c2/README.md index d29b0e1..0556214 100644 --- a/runners/nvidia_onecat_vllm_4a9ca6c3/README.md +++ b/runners/nvidia_onecat_vllm_12a253c2/README.md @@ -1,4 +1,4 @@ -# nvidia_onecat_vllm_4a9ca6c3 — 1Cat-vLLM Runner (Tesla V100 / SM70) +# nvidia_onecat_vllm_12a253c2 — 1Cat-vLLM Runner (Tesla V100 / SM70) AccelMark runner for **Tesla V100 / V100S only**, using [1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) (community vLLM fork for Volta). @@ -28,8 +28,8 @@ Release notes: [1Cat-vLLM v1.0.0](https://github.com/1CatAI/1Cat-vLLM/releases/t | `attention_backend` | `FLASH_ATTN_V100` (auto unless overridden) | | `SUPPORTED_PRECISIONS` | `fp16`, `fp32` (no BF16 on V100) | | `SUPPORTED_QUANTIZATION_BACKENDS` | `awq` only | -| `max_num_seqs` | `1` (via runner config) | -| `gpu_memory_utilization` | `0.88` | +| `max_num_seqs` | `512` global default (same as upstream vLLM); use `1` for suite D / long-context | +| `gpu_memory_utilization` | `0.90` | ## Supported suites @@ -75,7 +75,7 @@ python -m pip install --prefer-binary --no-cache-dir \ "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl" cd /path/to/AccelMark -pip install -r runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt +pip install -r runners/nvidia_onecat_vllm_12a253c2/requirements.txt ``` ### Path B — Build from source (Ubuntu 22.04 / glibc 2.35) @@ -117,7 +117,7 @@ pip install "$DIST"/flash_attn_v100-*.whl cd /tmp && pip install --no-deps --force-reinstall "$DIST"/vllm-*.whl cd /path/to/AccelMark -pip install -r runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt +pip install -r 
runners/nvidia_onecat_vllm_12a253c2/requirements.txt ``` Do **not** run AccelMark from inside the cloned `1Cat-vLLM/` directory; Python @@ -145,22 +145,36 @@ PY Copy and edit: ```bash -cp configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example \ - configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml +cp configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example \ + configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml ``` -**Single V100 32GB** — recommended `engine_kwargs` (avoids prefix prefill shared-memory -crash on SM70: `Shared memory exceeds 96KB`): +**Single V100 32GB** — recommended `engine_kwargs` (avoids SM70 +`Shared memory exceeds 96KB` in `prefill_paged_fwd`): ```yaml tensor_parallel_size: 1 -max_num_seqs: 1 -gpu_memory_utilization: 0.88 +max_num_seqs: 512 +gpu_memory_utilization: 0.90 engine_kwargs: enable_prefix_caching: false + enable_chunked_prefill: false kv_cache_auto_trim_ratio: 0.0 + +suites: + suite_D: + max_num_seqs: 1 + gpu_memory_utilization: 0.85 +``` + +If it still crashes, export before `python run.py`: + +```bash +export VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1 ``` +That forces the slower paged-KV gather fallback instead of `prefill_paged_fwd`. + **4× V100 32GB** — set `tensor_parallel_size: 4`; keep the same `engine_kwargs` unless you are deliberately testing 1Cat's MTP / prefix-cache profile (see example file comments). @@ -169,6 +183,7 @@ Other tuning: | Symptom | Try | |---------|-----| +| `Shared memory exceeds 96KB` | `enable_chunked_prefill: false` + `enable_prefix_caching: false` (above); then `export VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1` | | First request hangs (CUDA graph) | `enforce_eager: true` or `--enforce-eager` | | OOM at engine init | Lower `gpu_memory_utilization` (e.g. 
`0.85`) | | `GLIBC_2.38 not found` | Path B source build, or Ubuntu 24.04+ | @@ -184,11 +199,11 @@ cp configs/models_local.yaml.example configs/models_local.yaml # map local mod export PYTHONPATH=/path/to/AccelMark # if pip install -e . is unavailable # Suite A smoke (1× V100) -python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \ +python run.py --runner nvidia_onecat_vllm_12a253c2 \ --suite suite_A --scenario accuracy --tensor-parallel-size 1 # Suite B (4× V100) -python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \ +python run.py --runner nvidia_onecat_vllm_12a253c2 \ --suite suite_B --tensor-parallel-size 4 ``` @@ -196,8 +211,9 @@ python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \ ## Known limitations -- Prefix caching + chunked paged prefill can exceed V100's 96KB shared memory per SM; - disable `enable_prefix_caching` (see config above). +- Prefix caching and **chunked prefill** (even with prefix caching off) can hit the + `prefill_paged_fwd` kernel (>96KB shared memory on SM70). Disable both in config; + use `VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1` if needed (see above). - `max_num_seqs: 1` limits batch throughput vs upstream vLLM defaults — intentional for 1Cat's long-context V100 profile. - Suite F is marked unsupported in `meta.json` (use upstream runner on V100 if needed). 
diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json b/runners/nvidia_onecat_vllm_12a253c2/meta.json similarity index 69% rename from runners/nvidia_onecat_vllm_4a9ca6c3/meta.json rename to runners/nvidia_onecat_vllm_12a253c2/meta.json index b3d136e..394601f 100644 --- a/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json +++ b/runners/nvidia_onecat_vllm_12a253c2/meta.json @@ -1,12 +1,12 @@ { - "id": "nvidia_onecat_vllm_4a9ca6c3", + "id": "nvidia_onecat_vllm_12a253c2", "platform": "nvidia", "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA", "framework": "1Cat-vLLM", "submitted_by": "JuhaoLiang1997", "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — community vLLM fork with FLASH_ATTN_V100 and SM70 AWQ kernels. Use nvidia_vllm_* on Ampere or newer.", - "supersedes_chain": ["nvidia_onecat_vllm_a43d1bcf"], - "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless overridden. V100: disable prefix caching in runner config (see README). External dependency: https://github.com/1CatAI/1Cat-vLLM", + "supersedes_chain": ["nvidia_onecat_vllm_4a9ca6c3", "nvidia_onecat_vllm_a43d1bcf"], + "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless overridden. V100: disable prefix caching and chunked prefill in runner config (see README). 
External dependency: https://github.com/1CatAI/1Cat-vLLM", "created": "2026-05-15", "hardware_label": "NVIDIA V100 (SM70)", "suite_support": { diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt b/runners/nvidia_onecat_vllm_12a253c2/requirements.txt similarity index 100% rename from runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt rename to runners/nvidia_onecat_vllm_12a253c2/requirements.txt diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/runner.py b/runners/nvidia_onecat_vllm_12a253c2/runner.py similarity index 99% rename from runners/nvidia_onecat_vllm_4a9ca6c3/runner.py rename to runners/nvidia_onecat_vllm_12a253c2/runner.py index 861b352..3462765 100644 --- a/runners/nvidia_onecat_vllm_4a9ca6c3/runner.py +++ b/runners/nvidia_onecat_vllm_12a253c2/runner.py @@ -92,8 +92,8 @@ def load_model(self, model_path: str, parallelism: dict) -> None: enforce_eager = getattr(self, "_enforce_eager", False) cfg = getattr(self, "_runner_config", {}) - max_num_seqs = cfg.get("max_num_seqs", 1) - gpu_memory_util = cfg.get("gpu_memory_utilization", 0.88) + max_num_seqs = cfg.get("max_num_seqs", 512) + gpu_memory_util = cfg.get("gpu_memory_utilization", 0.90) extra_kwargs = dict(cfg.get("engine_kwargs") or {}) import os From 9aad45b1e30faa3963c1e0f8fe7c664a55c3c72a Mon Sep 17 00:00:00 2001 From: Liang Juhao Date: Mon, 18 May 2026 22:06:52 +0800 Subject: [PATCH 5/5] upload onecat results --- .../accuracy/accuracy.json | 8 + .../env_info.json | 33 +++ .../offline/result.json | 159 +++++++++++++ .../online/result.json | 158 +++++++++++++ .../result.json | 210 ++++++++++++++++++ .../accuracy/accuracy.json | 8 + .../env_info.json | 33 +++ .../interactive/result.json | 126 +++++++++++ .../offline/result.json | 159 +++++++++++++ .../online/result.json | 146 ++++++++++++ .../result.json | 210 ++++++++++++++++++ 11 files changed, 1250 insertions(+) create mode 100644 
results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json new file mode 100644 index 0000000..304c3db --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.61, + "baseline_delta": null, + "valid": true, + "framework": "1Cat-vLLM", + "precision": "FP16", + "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." 
+} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json new file mode 100644 index 0000000..52c2fdc --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json @@ -0,0 +1,33 @@ +{ + "collected_at": "2026-05-18T09:38:50.346241+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git 
a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json new file mode 100644 index 0000000..2e6fc7f --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json @@ -0,0 +1,159 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:38:50.346241+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 
22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 671.32, + "throughput_tokens_per_sec_per_chip": 671.32, + "throughput_tokens_per_sec_total": 1168.67, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 670.99, + "throughput_tokens_per_sec_per_chip": 670.99, + "throughput_tokens_per_sec_total": 1168.09, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 671.43, + "throughput_tokens_per_sec_per_chip": 671.43, + "throughput_tokens_per_sec_total": 1168.44, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "18:03:39", + "run_id": "4e0e6eba", + "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00", + "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00", + "benchmark_elapsed_minutes": 10.3, + "model_load_seconds": 47.8 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json new file mode 100644 index 0000000..66aeb48 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json @@ -0,0 +1,158 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:38:50.346241+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the 
interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "online": { + 
"sla_ttft_ms": 500, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 113119.0, + "ttft_ms_p90": 832380.28, + "ttft_ms_p99": 872316.46, + "tpot_ms_p50": 1274.2, + "tpot_ms_p90": 1801.34, + "tpot_ms_p99": 4289.09, + "elapsed_seconds_median": 968.7, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 130646.03, + "ttft_ms_p90": 865522.04, + "ttft_ms_p99": 901339.26, + "tpot_ms_p50": 1262.15, + "tpot_ms_p90": 1785.02, + "tpot_ms_p99": 4287.18, + "elapsed_seconds_median": 936.5, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 132710.0, + "ttft_ms_p90": 863880.66, + "ttft_ms_p99": 888527.06, + "tpot_ms_p50": 1248.86, + "tpot_ms_p90": 1740.58, + "tpot_ms_p99": 4225.34, + "elapsed_seconds_median": 921.5, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:25:39", + "run_id": "4e0e6eba", + "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T10:04:46.235502+00:00", + "benchmark_end_time": "2026-05-18T12:25:39.450279+00:00", + "benchmark_elapsed_minutes": 140.9, + "model_load_seconds": 45.2 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json new file mode 100644 index 0000000..07930da --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json @@ -0,0 +1,210 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_A", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T09:38:50.346241+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect 
between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 8.0, + "precision": "FP16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + } + }, + "metrics": { + "derived": {}, + "offline": { + 
"results_by_concurrency": [ + { + "client_concurrency": 8, + "throughput_tokens_per_sec": 671.32, + "throughput_tokens_per_sec_per_chip": 671.32, + "throughput_tokens_per_sec_total": 1168.67, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 32, + "throughput_tokens_per_sec": 670.99, + "throughput_tokens_per_sec_per_chip": 670.99, + "throughput_tokens_per_sec_total": 1168.09, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 128, + "throughput_tokens_per_sec": 671.43, + "throughput_tokens_per_sec_per_chip": 671.43, + "throughput_tokens_per_sec_total": 1168.44, + "elapsed_seconds_median": 51.6, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 5, + "achieved_qps": 5.0, + "ttft_ms_p50": 113119.0, + "ttft_ms_p90": 832380.28, + "ttft_ms_p99": 872316.46, + "tpot_ms_p50": 1274.2, + "tpot_ms_p90": 1801.34, + "tpot_ms_p99": 4289.09, + "elapsed_seconds_median": 968.7, + "sla_met": false + }, + { + "target_qps": 25, + "achieved_qps": 25.0, + "ttft_ms_p50": 130646.03, + "ttft_ms_p90": 865522.04, + "ttft_ms_p99": 901339.26, + "tpot_ms_p50": 1262.15, + "tpot_ms_p90": 1785.02, + "tpot_ms_p99": 4287.18, + "elapsed_seconds_median": 936.5, + "sla_met": false + }, + { + "target_qps": 100, + "achieved_qps": 100.0, + "ttft_ms_p50": 132710.0, + "ttft_ms_p90": 863880.66, + "ttft_ms_p99": 888527.06, + "tpot_ms_p50": 1248.86, + "tpot_ms_p90": 1740.58, + "tpot_ms_p99": 4225.34, + "elapsed_seconds_median": 921.5, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": 0.61, + "baseline_delta": null, + "valid": true, + "framework": "1Cat-vLLM", + "precision": "FP16", + "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "18:03:39", + "run_id": "4e0e6eba", + "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00", + "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00", + "benchmark_elapsed_minutes": 151.2, + "model_load_seconds": 47.8, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.", + "scenario_dirs": { + "offline": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline", + "online": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online" + } + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json new file mode 100644 index 0000000..94e5547 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json @@ -0,0 +1,8 @@ +{ + "subset_score": 0.37, + "baseline_delta": 0.0, + "valid": true, + "framework": "1Cat-vLLM", + "precision": "FP16", + "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." 
+} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json new file mode 100644 index 0000000..1f8b6bd --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json @@ -0,0 +1,33 @@ +{ + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" +} \ No newline at end of file diff --git 
a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json new file mode 100644 index 0000000..f017bc2 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json @@ -0,0 +1,126 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + 
"os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "FP16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "interactive", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "interactive": { + "ttft_ms_p50": 26.76, + "ttft_ms_p90": 29.57, + "ttft_ms_p99": 40.69, + "tpot_ms_p50": 3.51, + "tpot_ms_p90": 3.76, + "tpot_ms_p99": 3.81, + "peak_memory_gb": null, + "elapsed_seconds_median": 116.9 + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:45:36", + "run_id": "419b138c", + "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T12:39:46.224469+00:00", + "benchmark_end_time": "2026-05-18T12:45:36.498231+00:00", + "benchmark_elapsed_minutes": 5.8, + "model_load_seconds": 27.8 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json new file mode 100644 index 0000000..da8126b --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json @@ -0,0 +1,159 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as 
the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "FP16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "offline", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "offline": { + 
"results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 6234.82, + "throughput_tokens_per_sec_per_chip": 6234.82, + "throughput_tokens_per_sec_total": 9303.11, + "elapsed_seconds_median": 6.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 6292.79, + "throughput_tokens_per_sec_per_chip": 6292.79, + "throughput_tokens_per_sec_total": 9356.18, + "elapsed_seconds_median": 6.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 6243.51, + "throughput_tokens_per_sec_per_chip": 6243.51, + "throughput_tokens_per_sec_total": 9267.55, + "elapsed_seconds_median": 6.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:28:55", + "run_id": "419b138c", + "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00", + "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00", + "benchmark_elapsed_minutes": 1.4, + "model_load_seconds": 31.7 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json new file mode 100644 index 0000000..170f9d0 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json @@ -0,0 +1,146 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, + "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the 
interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + "parameter_count_b": 0.5, + "precision": "FP16", + "effective_dtype": null, + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenario": "online", + "num_runs": 3, + "warmup_runs": 1, + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + }, + "runtime_metrics": null + }, + "metrics": { + "online": { + "sla_ttft_ms": 500, 
+ "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 6316.13, + "ttft_ms_p90": 53409.43, + "ttft_ms_p99": 67932.56, + "tpot_ms_p50": 206.23, + "tpot_ms_p90": 291.3, + "tpot_ms_p99": 636.32, + "elapsed_seconds_median": 103.3, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 19238.78, + "ttft_ms_p90": 56898.27, + "ttft_ms_p99": 75398.9, + "tpot_ms_p50": 189.24, + "tpot_ms_p90": 300.17, + "tpot_ms_p99": 582.22, + "elapsed_seconds_median": 86.3, + "sla_met": false + } + ] + } + }, + "accuracy": { + "subset_score": null, + "baseline_delta": null, + "valid": false, + "notes": "Run --scenario accuracy to check model accuracy." + }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:38:56", + "run_id": "419b138c", + "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T12:29:46.673625+00:00", + "benchmark_end_time": "2026-05-18T12:38:56.798553+00:00", + "benchmark_elapsed_minutes": 9.2, + "model_load_seconds": 28.7 + } +} \ No newline at end of file diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json new file mode 100644 index 0000000..12baab4 --- /dev/null +++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json @@ -0,0 +1,210 @@ +{ + "schema_version": "1.0", + "suite_id": "suite_F", + "implementation_id": "nvidia_onecat_vllm_12a253c2", + "chip": { + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "count": 1, + "memory_gb": 32.0, 
+ "interconnect_intra_node": null, + "interconnect_inter_node": null + }, + "environment": { + "collected_at": "2026-05-18T12:26:03.593928+00:00", + "accelerators": [ + { + "index": 0, + "name": "Tesla V100-PCIE-32GB", + "vendor": "NVIDIA", + "memory_gb": 32.0, + "driver_version": "580.82.07", + "firmware_version": null, + "compute_capability": "7.0", + "supports_bf16": false + } + ], + "accelerator_platform": "nvidia", + "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n X = Self\n SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n PIX = Connection traversing at most a single PCIe bridge\n NV# = Connection traversing a bonded set of # NVLinks\n", + "intra_node_interconnect": null, + "cpu": { + "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz", + "physical_cores": 26, + "logical_cores": 52, + "numa_nodes": 2 + }, + "system_memory_gb": 214.5, + "pcie_generation": "PCIe Gen 3", + "cpu_accelerator_bandwidth_gbs": null, + "network_interfaces": null, + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13", + "kernel_version": "5.4.0-149-generic", + "runtime_version": "CUDA 12.8", + "pytorch_version": "2.9.1+cu128" + }, + "software": { + "framework": "1Cat-vLLM", + "framework_version": "1.0.0+flash_attn_v100-1.0.0", + "driver_version": "580.82.07", + "runtime_version": "CUDA 12.8", + "os": "Ubuntu 22.04.5 LTS", + "python_version": "3.12.13" + }, + "model": { + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775", + "model_name": null, + "model_note": null, + "model_source": "local", + "architecture": "dense", + 
"parameter_count_b": 0.5, + "precision": "FP16", + "effective_dtype": "float16", + "quantization_method": null, + "model_format": "HuggingFace original" + }, + "task": { + "scenarios_run": [ + "offline", + "online", + "interactive" + ], + "parallelism": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "expert_parallel_size": 1, + "data_parallel_size": 1 + }, + "num_runs": 3, + "extra_config": { + "tensor_parallel_size": 1, + "enforce_eager": false, + "max_num_seqs": 512, + "gpu_memory_utilization": 0.9, + "engine_kwargs": { + "enable_prefix_caching": false, + "enable_chunked_prefill": false, + "kv_cache_auto_trim_ratio": 0.0 + } + } + }, + "metrics": { + "derived": {}, + "offline": { + "results_by_concurrency": [ + { + "client_concurrency": 4, + "throughput_tokens_per_sec": 6234.82, + "throughput_tokens_per_sec_per_chip": 6234.82, + "throughput_tokens_per_sec_total": 9303.11, + "elapsed_seconds_median": 6.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + }, + { + "client_concurrency": 16, + "throughput_tokens_per_sec": 6292.79, + "throughput_tokens_per_sec_per_chip": 6292.79, + "throughput_tokens_per_sec_total": 9356.18, + "elapsed_seconds_median": 6.7, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." 
+ }, + { + "client_concurrency": 64, + "throughput_tokens_per_sec": 6243.51, + "throughput_tokens_per_sec_per_chip": 6243.51, + "throughput_tokens_per_sec_total": 9267.55, + "elapsed_seconds_median": 6.8, + "peak_memory_gb": null, + "power_watts_avg": null, + "power_watts_peak": null, + "oom": false, + "_throughput_note": "output_only", + "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs." + } + ] + }, + "online": { + "sla_ttft_ms": 500, + "max_valid_qps": 0.0, + "results_by_qps": [ + { + "target_qps": 10, + "achieved_qps": 10.0, + "ttft_ms_p50": 6316.13, + "ttft_ms_p90": 53409.43, + "ttft_ms_p99": 67932.56, + "tpot_ms_p50": 206.23, + "tpot_ms_p90": 291.3, + "tpot_ms_p99": 636.32, + "elapsed_seconds_median": 103.3, + "sla_met": false + }, + { + "target_qps": 40, + "achieved_qps": 40.0, + "ttft_ms_p50": 19238.78, + "ttft_ms_p90": 56898.27, + "ttft_ms_p99": 75398.9, + "tpot_ms_p50": 189.24, + "tpot_ms_p90": 300.17, + "tpot_ms_p99": 582.22, + "elapsed_seconds_median": 86.3, + "sla_met": false + } + ] + }, + "interactive": { + "ttft_ms_p50": 26.76, + "ttft_ms_p90": 29.57, + "ttft_ms_p99": 40.69, + "tpot_ms_p50": 3.51, + "tpot_ms_p90": 3.76, + "tpot_ms_p99": 3.81, + "peak_memory_gb": null, + "elapsed_seconds_median": 116.9 + } + }, + "accuracy": { + "subset_score": 0.37, + "baseline_delta": 0.0, + "valid": true, + "framework": "1Cat-vLLM", + "precision": "FP16", + "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark." 
+ }, + "meta": { + "submitted_by": "JuhaoLiang1997", + "submission_type": "individual", + "date": "2026-05-18", + "time": "20:28:55", + "run_id": "419b138c", + "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c", + "flagged": null, + "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py", + "env_info_file": "../env_info.json", + "log_file": "run.log", + "samples_file": "samples.jsonl", + "notes": null, + "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00", + "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00", + "benchmark_elapsed_minutes": 16.4, + "model_load_seconds": 31.7, + "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.", + "scenario_dirs": { + "offline": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline", + "online": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online", + "interactive": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive" + } + } +} \ No newline at end of file