From 2854c96b5730e63bed8360ff2a10e58ac648d65f Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Fri, 15 May 2026 11:09:09 +0800
Subject: [PATCH 1/5] =?UTF-8?q?feat:=20add=201Cat-vLLM=20runner=20for=20Te?=
 =?UTF-8?q?sla=20V100=20=E2=80=94=20nvidia=5Fonecat=5Fvllm=5Fa43d1bcf?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the AccelMark runner for the 1Cat-vLLM community fork that
re-enables AWQ 4-bit inference on Volta (SM70) Tesla V100 via lmdeploy
TurboMind kernels and the FLASH_ATTN_V100 attention backend.

What is included:

* runners/nvidia_onecat_vllm_a43d1bcf/ — runner.py, meta.json (with
  hardware_label="NVIDIA V100 (SM70)" and suite_support self-declaration),
  requirements.txt, README.md
* configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example

The README platforms matrix updates automatically — the hardware label
is taken from meta.hardware_label rather than the catalogue default,
so the V100-specific row is rendered correctly without touching
schema/platforms.json or any shared file.

Capability flags:

* SUPPORTED_PRECISIONS drops BF16 (V100 has no native BF16 datapath).
* SUPPORTED_QUANTIZATION_BACKENDS lists only AWQ — the fork's headline
  contribution; FP8 KV cache and other formats are intentionally not
  exposed by default.
* Auto-injects attention_backend=FLASH_ATTN_V100 unless the user
  overrides it.
* Suite F (Qwen2.5-0.5B-Instruct on a consumer/edge GPU) is marked
  unsupported — 1Cat-vLLM targets dense + MoE on 4 x V100, not edge
  inference.

Initial commit, not yet validated end-to-end on hardware; all
applicable suites are marked "pending".

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 README.md                                     |   1 +
 ...r_nvidia_onecat_vllm_a43d1bcf.yaml.example |  71 +++
 runners/nvidia_onecat_vllm_a43d1bcf/README.md | 150 +++++
 runners/nvidia_onecat_vllm_a43d1bcf/meta.json |  21 +
 .../requirements.txt                          |  58 ++
 runners/nvidia_onecat_vllm_a43d1bcf/runner.py | 517 ++++++++++++++++++
 6 files changed, 818 insertions(+)
 create mode 100644 configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example
 create mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/README.md
 create mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/meta.json
 create mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
 create mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/runner.py

diff --git a/README.md b/README.md
index 3007966..1c4acbd 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
 | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_a43d1bcf` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
 | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example
new file mode 100644
index 0000000..d39949e
--- /dev/null
+++ b/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example
@@ -0,0 +1,71 @@
+# AccelMark runner config — nvidia_onecat_vllm_a43d1bcf (1Cat-vLLM on V100)
+#
+# Copy this file to runner_nvidia_onecat_vllm_a43d1bcf.yaml (remove the
+# .example suffix) and adjust for your hardware. The actual .yaml is
+# gitignored.
+#
+# These settings adapt the runner to your hardware environment. They are
+# recorded in result.json task.extra_config but are NOT part of the
+# benchmark identity (not hashed into run_id).
+#
+# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
+
+# ── Global defaults (apply to all suites) ─────────────────────────────────────
+
+# Tensor parallel size — number of V100 cards (1Cat-vLLM 1.0.0 public ref:
+# 4 x V100 32 GB). For 256K context the recommended TP is 4 on 32 GB cards.
+tensor_parallel_size: 4
+
+# Disable Volta CUDA-graph capture. Set true if you hit Triton sm>=80 errors.
+# 1Cat-vLLM normally captures graphs without --enforce-eager — leave false
+# unless you've observed startup hangs on the first request.
+enforce_eager: false
+
+# 1Cat 1.0.0 public default is 1 (low-concurrency stable serving).
+# Bump to 4 only for the MTP + prefix-cache profile.
+max_num_seqs: 1
+
+# 1Cat 1.0.0 public default. Reduce to 0.85 if you see OOM at engine init.
+gpu_memory_utilization: 0.88
+
+# Pass-through kwargs forwarded to vLLM LLM() / AsyncEngineArgs(). 1Cat-vLLM
+# recognises `attention_backend` and a few extra knobs (kv_cache_auto_trim_ratio,
+# compilation_config, speculative_config). Keys unknown to the installed EngineArgs
+# are dropped by the runner with a warning, so this is safe across vLLM 0.7.x / 1Cat-1.0.0.
+#
+# Defaults left commented out below — the runner auto-injects
+# `attention_backend: FLASH_ATTN_V100` if you don't set it here or via the
+# VLLM_ATTENTION_BACKEND environment variable.
+#
+# engine_kwargs:
+#   attention_backend: FLASH_ATTN_V100        # auto-injected by the runner
+#   kv_cache_auto_trim_ratio: 0.0             # disables 1Cat KV auto-trim (MTP profile)
+#   mamba_cache_mode: align                   # required for Qwen3.6 hybrid Mamba
+#   compilation_config:
+#     cudagraph_mode: full_and_piecewise
+#     cudagraph_capture_sizes: [1, 2, 4, 8]
+
+# ── Suite-specific overrides ───────────────────────────────────────────────────
+
+suites:
+  suite_D:
+    # Long-context — keep max_num_seqs low and reserve more memory.
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.85
+
+  suite_C:
+    # Quantization suite — AWQ is the primary 1Cat target.
+    # Add `max_model_len: 12288` here to reproduce the 1Cat 1.0.0 internal speed harness.
+    max_num_seqs: 1
+
+  # MTP profile for Qwen3.6-27B-AWQ — copy this and uncomment to use.
+  # suite_A:
+  #   max_num_seqs: 4
+  #   engine_kwargs:
+  #     enable_prefix_caching: true
+  #     speculative_config:
+  #       method: mtp
+  #       num_speculative_tokens: 4
+  #     compilation_config:
+  #       cudagraph_mode: full_and_piecewise
+  #       cudagraph_capture_sizes: [1, 2, 4, 8, 9, 18]
diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/README.md b/runners/nvidia_onecat_vllm_a43d1bcf/README.md
new file mode 100644
index 0000000..eb9f5c0
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_a43d1bcf/README.md
@@ -0,0 +1,150 @@
+# nvidia_onecat_vllm_a43d1bcf — 1Cat-vLLM Runner (Tesla V100 / SM70)
+
+AccelMark runner for **Tesla V100 / V100S only**, using
+[1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) — the community vLLM fork
+that re-enables modern AWQ 4-bit serving and FlashAttention on Volta GPUs
+(SM70).
+
+> **Hardware scope:** This runner is intentionally narrow. On Ampere
+> (A100/A800/A10/L4/4090/etc.) or newer, use the upstream
+> `nvidia_vllm_*` runner — 1Cat-vLLM's kernels are tuned for SM70 and
+> provide no benefit on later architectures.
+
+> **Status:** Committed without an end-to-end validation run yet. The runner
+> code is a thin specialisation of the upstream NVIDIA vLLM runner (only
+> capability flags + attention-backend default differ), so existing test
+> coverage of the parent runner applies. We plan to add a reference
+> `Tesla V100-SXM2-32GBx4 suite_B` result once a target box is available.
+
+## Why 1Cat-vLLM exists
+
+| Pain on stock vLLM + V100 | 1Cat-vLLM's fix |
+|---|---|
+| AWQ kernels require SM75+ | Integrated lmdeploy TurboMind WMMA kernels for SM70 |
+| FlashAttention 2/3 require Ampere+ | Custom `FLASH_ATTN_V100` Volta backend |
+| Qwen3.5 / Qwen3.6 dense + MoE not loadable | Model configs and runtime fixes shipped in fork |
+| Long-context paged-prefill stability | SM70-specific MLA/GDN runtime fixes |
+| FP8 KV cache | `fp8_e5m2` (experimental) on V100 FA path |
+
+For full release notes see
+<https://github.com/1CatAI/1Cat-vLLM> RELEASE_NOTES_1.0.0.md.
+
+## Defaults this runner injects
+
+| Knob | Default | Where set | Why |
+|---|---|---|---|
+| `attention_backend` | `FLASH_ATTN_V100` | `load_model()` if not already specified | 1Cat-vLLM's recommended V100 path |
+| `SUPPORTED_PRECISIONS` | `["fp16", "fp32"]` | class attribute | V100 has no BF16 |
+| `SUPPORTED_QUANTIZATION_BACKENDS` | `["awq"]` | class attribute | 1Cat's headline kernel; other formats not validated on this stack |
+| `max_num_seqs` | `1` | runner config default | 1Cat 1.0.0 public default — 256K context on V100 |
+| `gpu_memory_utilization` | `0.88` | runner config default | 1Cat 1.0.0 public default |
+
+To opt into the MTP + prefix-cache profile (Qwen3.6-27B-AWQ), bump
+`max_num_seqs` to `4` and pass `speculative_config` via the runner config
+`engine_kwargs` — see the example config file.
+
+## Supported suites
+
+| Suite | Recommendation |
+|-------|---------------|
+| Suite A — Llama-3-8B 1× | Runs, but vanilla `nvidia_vllm_47f5d58e --enforce-eager` already covers this. Use 1Cat only if you want the FA-V100 attention path. |
+| Suite B — Llama-3-70B multi-chip | **Primary target.** Recommended `--tensor-parallel-size 4`. |
+| Suite C — Quantization | Restricted to AWQ — this is where 1Cat shines. |
+| Suite D — Long context (~28K) | **Primary target.** `FLASH_ATTN_V100` is the only V100-friendly long-context path. |
+| Suite E — Scaling | Same considerations as Suite B; useful for measuring how 1Cat-vLLM's tensor parallelism (NCCL) scales across V100s. |
+| Suite F — Qwen2.5-0.5B edge | Not interesting on V100 — the model fits trivially; use upstream runner. |
+| Suite G — MoE | Sweet spot — `Qwen3.6-35B-A3B-AWQ`, `Qwen3.5-122B-A10B-AWQ` are exactly the validated MoE models in 1Cat 1.0.0. |
+
+## Prerequisites
+
+```bash
+# 1. CUDA 12.8 toolkit + matching driver (570.x recommended)
+#    https://developer.nvidia.com/cuda-12-8-0-download-archive
+
+# 2. Python 3.12 (1Cat 1.0.0 ships cp312 wheels only)
+conda create -y -n 1cat-vllm-1.0.0 python=3.12
+conda activate 1cat-vllm-1.0.0
+
+# 3. Install the 1Cat-vLLM wheels
+pip install --prefer-binary --no-cache-dir \
+    --extra-index-url https://download.pytorch.org/whl/cu128 \
+    "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \
+    "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl"
+
+# 4. Install AccelMark extras
+pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
+```
+
+## Smoke test the install
+
+```bash
+python - <<'PY'
+import torch, vllm
+print("torch:", torch.__version__, "  vllm:", vllm.__version__)
+try:
+    import flash_attn_v100_cuda  # SM70 FA kernels
+    print("flash_attn_v100: ok")
+except Exception as e:
+    print("flash_attn_v100: MISSING ->", e)
+PY
+```
+
+`flash_attn_v100_cuda` (the extension shipped in the `flash_attn_v100` wheel) MUST be
+importable — if it isn't, you most likely installed plain vLLM from PyPI; reinstall from the 1Cat release wheels above.
+
+## Basic usage
+
+```bash
+# Suite D (long-context) on 4 x V100 32 GB
+python run.py --runner nvidia_onecat_vllm_a43d1bcf \
+    --suite suite_D \
+    --tensor-parallel-size 4
+
+# Suite C with AWQ (Qwen3.5-27B-AWQ as the validation model)
+python run.py --runner nvidia_onecat_vllm_a43d1bcf \
+    --suite suite_C \
+    --tensor-parallel-size 4 \
+    --model-path /data/models/Qwen3.5-27B-AWQ
+
+# Override attention backend (rare — for benchmarking vs Triton fallback):
+# set attention_backend under engine_kwargs in your runner config, then run
+python run.py --runner nvidia_onecat_vllm_a43d1bcf \
+    --suite suite_B \
+    --tensor-parallel-size 4
+```
+
+## Runner config
+
+Copy the example:
+
+```bash
+cp configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example \
+   configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml
+```
+
+Key defaults differ from the upstream NVIDIA runner:
+
+| Field | 1Cat default | Upstream default | Notes |
+|-------|--------------|------------------|-------|
+| `max_num_seqs` | 1 | 512 | 256K context demands very tight KV cache budget |
+| `gpu_memory_utilization` | 0.88 | 0.90 | Matches 1Cat 1.0.0 public reference |
+| `engine_kwargs.attention_backend` | `FLASH_ATTN_V100` (auto) | — | Auto-injected unless overridden |
+
+## Known gaps (pre-smoke-test)
+
+- The Volta CUDA-graph capture path needs validation under
+  `--scenario sustained`. If startup hangs on the first request, set
+  `enforce_eager: true` in your runner config.
+- The accuracy gate uses the suite's stock prompts — on AWQ checkpoints
+  the gate threshold may be too tight; the suite spec already allows
+  per-format thresholds (Suite C) so this is mostly relevant on Suite A/D.
+- MTP / speculative profiles are documented in 1Cat 1.0.0 but not
+  exercised here yet; flat speculative keys in `_precision_engine_kwargs`
+  are still forwarded as `speculative_config` by `benchmark_runner.py`,
+  the same as for the upstream runner.
+
+## Requirements
+
+See `requirements.txt`. The heavy dependencies (`torch`, `flash_attn_v100`,
+`vllm` fork) MUST come from the 1Cat-vLLM release wheels — do not install
+upstream `vllm` from PyPI.
diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/meta.json b/runners/nvidia_onecat_vllm_a43d1bcf/meta.json
new file mode 100644
index 0000000..b86b000
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_a43d1bcf/meta.json
@@ -0,0 +1,21 @@
+{
+  "id": "nvidia_onecat_vllm_a43d1bcf",
+  "platform": "nvidia",
+  "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA",
+  "framework": "1Cat-vLLM",
+  "submitted_by": "JuhaoLiang1997",
+  "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — the community vLLM fork that re-enables AWQ 4-bit inference on Volta via lmdeploy TurboMind kernels and the FLASH_ATTN_V100 attention backend. Targets Qwen3.5 / Qwen3.6 dense + MoE on 4 x V100 32 GB. Use the upstream nvidia_vllm runner on Ampere or newer.",
+  "supersedes_chain": [],
+  "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless the user overrides it. SUPPORTED_PRECISIONS drops BF16 (V100 has no native BF16 datapath). SUPPORTED_QUANTIZATION_BACKENDS lists only AWQ — the fork's headline contribution; FP8 KV cache and other formats are intentionally not exposed by default. Initial commit, not yet validated end-to-end on hardware.",
+  "created": "2026-05-15",
+  "hardware_label": "NVIDIA V100 (SM70)",
+  "suite_support": {
+    "A": "pending",
+    "B": "pending",
+    "C": "pending",
+    "D": "pending",
+    "E": "pending",
+    "F": "unsupported",
+    "G": "pending"
+  }
+}
diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt b/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
new file mode 100644
index 0000000..01e687d
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
@@ -0,0 +1,58 @@
+# AccelMark -- 1Cat-vLLM (SM70 / V100) runner dependencies
+#
+# 1Cat-vLLM is a community fork of vLLM tuned for Tesla V100. It ships two
+# wheels that must be installed together:
+#   - flash_attn_v100  (Volta-optimised FlashAttention kernels)
+#   - vllm             (patched fork, exposes the FLASH_ATTN_V100 backend
+#                       and AWQ 4-bit kernels for SM70)
+#
+# Both wheels are published as GitHub release assets at:
+#   https://github.com/1CatAI/1Cat-vLLM/releases
+#
+# Reference validated stack (1Cat-vLLM 1.0.0):
+#   OS:       Ubuntu 24.04
+#   Python:   3.12
+#   CUDA:     12.8
+#   PyTorch:  2.9.1+cu128
+#   Driver:   570.211.01
+#   GPU:      4 x Tesla V100 32 GB
+#
+# Installation:
+#   # 1. Install CUDA 12.8 toolkit and matching driver
+#   # 2. Create a fresh Python 3.12 environment
+#   # 3. Install the two 1Cat-vLLM wheels from the release page:
+#   pip install --prefer-binary --no-cache-dir \
+#     --extra-index-url https://download.pytorch.org/whl/cu128 \
+#     "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \
+#     "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl"
+#
+#   # 4. Then install the AccelMark extras below:
+#   pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
+#
+# Note: do NOT add `torch==2.x` here — the matching torch wheel is pulled in
+# by the 1Cat-vLLM wheel install command above. Listing torch here would
+# fight with the cu128 extra-index-url.
+
+# Transformers stack — compatible with 1Cat-vLLM 1.0.0 (vllm fork 1.0.0,
+# based on upstream vLLM 0.7.x line). Versions match the upstream
+# nvidia_vllm_47f5d58e runner so we know they're consistent.
+transformers==4.57.6
+tokenizers==0.22.2
+huggingface-hub==0.35.0
+accelerate==1.10.1
+safetensors==0.6.2
+
+# AccelMark dependencies
+numpy==1.26.4
+jsonschema==4.25.1
+psutil==7.1.0
+tqdm==4.67.1
+
+# NVIDIA monitoring (for power and GPU stats — same as upstream NVIDIA runner)
+nvidia-ml-py==13.580.82
+
+# Async support
+aiohttp==3.12.15
+
+# Config file parsing
+PyYAML==6.0.2
diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/runner.py b/runners/nvidia_onecat_vllm_a43d1bcf/runner.py
new file mode 100644
index 0000000..56002bb
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_a43d1bcf/runner.py
@@ -0,0 +1,517 @@
+"""
+AccelMark — NVIDIA 1Cat-vLLM (SM70 / V100) benchmark script.
+
+Implements BenchmarkRunner for `1Cat-vLLM`, the community fork of vLLM that
+re-enables modern model serving on Tesla V100 / SM70 hardware. The fork
+preserves the standard vLLM Python API (``LLM``, ``AsyncLLMEngine``,
+``SamplingParams``) but ships several SM70-specific pieces:
+
+  - the ``FLASH_ATTN_V100`` attention backend (FlashAttention re-implemented
+    for Volta) — set as this runner's default
+  - AWQ 4-bit kernels patched in from lmdeploy TurboMind for SM70
+  - validated paths for Qwen3.5 / Qwen3.6 dense + MoE on 4 x V100 32 GB
+  - FP8 KV cache (``fp8_e5m2``) as an experimental option
+
+Because the Python entry points are identical to upstream vLLM, this runner
+is structurally a clone of ``nvidia_vllm_*`` with three runtime overrides:
+
+  1. ``SUPPORTED_PRECISIONS`` drops BF16 (V100 cannot do BF16 in hardware).
+  2. ``SUPPORTED_QUANTIZATION_BACKENDS`` advertises only ``awq`` — the fork's
+     headline feature; other quantizations are unproven on this stack.
+  3. ``load_model()`` auto-injects ``attention_backend="FLASH_ATTN_V100"``
+     into the engine kwargs unless the user has explicitly set one.
+
+Reference build:
+    Python 3.12 · CUDA 12.8 · torch 2.9.1+cu128 · 1Cat-vLLM 1.0.0 wheels
+    Validated upstream by 1Cat-vLLM on 4 x Tesla PG503 / V100 32 GB (this runner: not yet).
+
+All orchestration logic still lives in runners/benchmark_runner.py.
+"""
+
+import asyncio
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+# Add repo root to path
+_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(_REPO_ROOT))
+
+import torch
+from vllm import LLM, AsyncLLMEngine, SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from transformers import AutoTokenizer
+
+from runners.benchmark_runner import BenchmarkRunner, InferenceRequest
+from loadgen.types import InferenceResult
+
+
+
+# Suppress per-request vLLM logs by default
+import logging
+logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING)
+logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING)
+
+
+class OneCatVLLMRunner(BenchmarkRunner):
+    """
+    AccelMark benchmark runner using 1Cat-vLLM (SM70 / V100 fork) on NVIDIA.
+
+    This runner is intended **specifically for Tesla V100 / V100S (SM70)**.
+    Other NVIDIA GPUs should use the upstream ``nvidia_vllm_*`` runner —
+    1Cat-vLLM's kernels are tuned for Volta and provide no advantage on
+    Ampere or newer hardware.
+    """
+
+    SUPPORTS_STREAMING = True
+    SUPPORTS_BATCHING = True
+    SUPPORTS_ONLINE = True
+    SUPPORTS_MULTI_CHIP = True
+
+    # V100 has no BF16 datapath — drop BF16 entirely so the suite picks
+    # FP16 as the effective precision without an amber warning. FP32 is left
+    # in for completeness but is essentially never used in inference.
+    SUPPORTED_PRECISIONS = ["fp16", "fp32"]
+
+    # 1Cat-vLLM's flagship contribution is AWQ 4-bit on SM70 (via lmdeploy
+    # TurboMind kernels). Other quantization backends are not validated on
+    # this stack — keep the surface conservative and let users opt-in by
+    # subclassing if they want to try FP8 KV cache (``fp8_e5m2`` only).
+    SUPPORTED_QUANTIZATION_BACKENDS = ["awq"]
+
+    def __init__(self):  # No engine is built here; every handle starts as None.
+        self.llm: Optional[LLM] = None  # assigned in load_model() when use_async is false
+        self.engine: Optional[AsyncLLMEngine] = None  # assigned in load_model() when use_async is true
+        self.tokenizer: Optional[AutoTokenizer] = None  # assigned in load_model()
+        self.sampling_params: Optional[SamplingParams] = None  # assigned in load_model()
+        self._loop: Optional[asyncio.AbstractEventLoop] = None  # event loop created for the async engine
+
+    def _get_chip_count(self) -> int:
+        """Return the number of available CUDA GPUs."""
+        try:
+            import torch
+            n = torch.cuda.device_count()
+            return n if n > 0 else 1
+        except Exception:
+            return 1
+
+    def _get_framework_name(self) -> str:  # constant framework label for this runner
+        return "1Cat-vLLM"
+
+    def _get_framework_version(self) -> str:
+        """Report vllm.__version__ plus the flash_attn_v100 wheel version.
+
+        1Cat-vLLM ships two coupled wheels (`vllm` patched fork + `flash_attn_v100`)
+        and the FA-V100 wheel is the bit that actually changes attention
+        performance on Volta. Recording both makes the result reproducible.
+        """
+        core = "unknown"
+        try:
+            import vllm
+            core = vllm.__version__
+        except Exception:
+            pass
+
+        fa_v100 = None
+        try:
+            from importlib.metadata import version as _pkg_version
+            fa_v100 = _pkg_version("flash_attn_v100")
+        except Exception:
+            try:
+                import flash_attn_v100_cuda  # type: ignore  # noqa: F401
+                fa_v100 = "installed"
+            except Exception:
+                fa_v100 = None
+
+        if fa_v100:
+            return f"{core}+flash_attn_v100-{fa_v100}"
+        return core
+
+    def load_model(self, model_path: str, parallelism: dict) -> None:
+        """Load model — sync LLM for offline/accuracy, async engine for streaming."""
+        tp_size = parallelism["tensor_parallel_size"]
+        pp_size = parallelism["pipeline_parallel_size"]
+        ep_size = parallelism.get("expert_parallel_size", 1)
+        assert pp_size <= 1, "Pipeline parallelism is not supported in OneCatVLLMRunner"
+
+        max_tokens    = parallelism["max_tokens"]
+        max_model_len = parallelism["max_model_len"]
+        use_async     = parallelism["use_async"]
+        enforce_eager = getattr(self, "_enforce_eager", False)
+
+        cfg             = getattr(self, "_runner_config", {})
+        # 1Cat-vLLM public defaults for 4 x V100 32 GB:
+        #   max_num_seqs = 1 (baseline) or 4 (MTP profile)
+        #   gpu_memory_utilization = 0.88
+        # These differ noticeably from upstream vLLM's 512 / 0.90 defaults
+        # because 256K context on V100 demands a much tighter KV budget.
+        max_num_seqs    = cfg.get("max_num_seqs", 1)
+        gpu_memory_util = cfg.get("gpu_memory_utilization", 0.88)
+        extra_kwargs    = dict(cfg.get("engine_kwargs") or {})
+
+        # ── Default to FLASH_ATTN_V100 attention backend ──────────────────────
+        # 1Cat-vLLM's public recommendation for V100 is FLASH_ATTN_V100. Inject
+        # it as the default unless the user explicitly set a backend in their
+        # runner config engine_kwargs, or via the VLLM_ATTENTION_BACKEND
+        # environment variable. We support both spellings because vLLM accepts
+        # the kwarg as `attention_backend` and the env var as
+        # VLLM_ATTENTION_BACKEND.
+        import os
+        if (
+            "attention_backend" not in extra_kwargs
+            and "VLLM_ATTENTION_BACKEND" not in os.environ
+        ):
+            extra_kwargs["attention_backend"] = "FLASH_ATTN_V100"
+
+        # ── Filter engine_kwargs to only fields this vLLM version accepts ─────
+        # Avoids TypeError when the runner config YAML references a field that
+        # doesn't exist in the installed vLLM version (EngineArgs is a strict
+        # dataclass — unknown keyword arguments raise TypeError immediately).
+        try:
+            import dataclasses
+            from vllm.engine.arg_utils import EngineArgs as _EngineArgs
+            _valid = {f.name for f in dataclasses.fields(_EngineArgs)}
+            _dropped = {k: v for k, v in extra_kwargs.items() if k not in _valid}
+            if _dropped:
+                print(f"  Warning: engine_kwargs keys not supported by this "
+                      f"1Cat-vLLM version and will be ignored: {list(_dropped)}")
+            extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid}
+        except Exception:
+            pass  # If introspection fails, pass kwargs as-is and let vLLM report the error
+
+        # Use precision resolved by BenchmarkRunner._resolve_precision()
+        effective_precision = getattr(self, "_effective_precision", "BF16").upper()
+        precision           = getattr(self, "_precision", None) or effective_precision
+
+        # dtype_override and quantization may be injected by benchmark_runner from
+        # precision_model_map entry fields (dtype_override, engine_kwargs.quantization).
+        # These take priority over the runner's own precision→dtype mapping below.
+        _dtype_override  = getattr(self, "_precision_dtype_override", None)
+        _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {})
+
+        quantization = _prec_eng_kwargs.pop("quantization", None)
+
+        # Map native precision names to explicit dtypes.
+        # Quantized formats (anything not in this map) use dtype="auto" — vLLM reads
+        # the storage dtype from the checkpoint's config.json, and the quantization
+        # kernel is set explicitly via the `quantization` kwarg already populated above
+        # from precision_model_map engine_kwargs. No fallback guessing needed here.
+        _NATIVE_DTYPE_MAP = {
+            "BF16":  "bfloat16",
+            "FP16":  "float16",
+            "FP32":  "float32",
+        }
+        dtype = _NATIVE_DTYPE_MAP.get(precision, "auto")
+        self._quantization_method = quantization  # None for native, explicit str for quantized
+
+        # dtype_override from precision_model_map wins over the mapping above.
+        # Used for e.g. FP16 baseline on pre-Ampere hardware (V100/T4).
+        if _dtype_override:
+            dtype = _dtype_override
+
+        # Merge remaining precision_engine_kwargs (after popping quantization) into
+        # extra_kwargs so they reach LLM() / AsyncEngineArgs. Runner YAML engine_kwargs
+        # still take final precedence via the **extra_kwargs spread at the end.
+        if _prec_eng_kwargs:
+            _prec_eng_kwargs.update(extra_kwargs)   # runner YAML wins on conflict
+            extra_kwargs = _prec_eng_kwargs
+
+        print(f"Loading model: precision={precision}, dtype={dtype}"
+              + (f", quantization_method={self._quantization_method}"
+                 if self._quantization_method else ""))
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=False
+        )
+
+        self.sampling_params = SamplingParams(
+            max_tokens=max_tokens,
+            temperature=0.0,
+        )
+
+        if not use_async:
+            llm_kwargs = dict(
+                model=model_path,
+                dtype=dtype,
+                tensor_parallel_size=tp_size,
+                trust_remote_code=False,
+                enforce_eager=enforce_eager,
+                max_num_seqs=max_num_seqs,
+                gpu_memory_utilization=gpu_memory_util,
+                **extra_kwargs,
+            )
+            if ep_size > 1:
+                llm_kwargs["enable_expert_parallel"] = True
+                llm_kwargs["tensor_parallel_size"]   = tp_size
+            if quantization:
+                llm_kwargs["quantization"] = quantization
+            if max_model_len:
+                llm_kwargs["max_model_len"] = max_model_len
+            self.llm = LLM(**llm_kwargs)
+        else:
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+            engine_kwargs = dict(
+                model=model_path,
+                dtype=dtype,
+                tensor_parallel_size=tp_size,
+                trust_remote_code=False,
+                enforce_eager=enforce_eager,
+                gpu_memory_utilization=gpu_memory_util,
+                # engine_kwargs values override named fields above if the same key appears in both.
+                # This is intentional — engine_kwargs is the power-user escape hatch.
+                **extra_kwargs,
+            )
+            if ep_size > 1:
+                engine_kwargs["enable_expert_parallel"] = True
+            if max_model_len:
+                engine_kwargs["max_model_len"] = max_model_len
+            engine_args = AsyncEngineArgs(**engine_kwargs)
+            self.engine = AsyncLLMEngine.from_engine_args(engine_args)
+
+    def get_effective_dtype(self) -> Optional[str]:
+        """
+        Report the actual compute dtype vLLM used after model loading.
+
+        vLLM exposes the resolved dtype via model_config after initialization.
+        This captures cases like FP8 weights on A100 computing in BF16.
+        """
+        try:
+            if self.llm is not None:
+                # Sync LLM path
+                dtype = self.llm.llm_engine.model_config.dtype
+                return str(dtype).replace("torch.", "")
+            elif self.engine is not None:
+                # Async engine path
+                dtype = self.engine.engine.model_config.dtype
+                return str(dtype).replace("torch.", "")
+        except Exception:
+            pass
+        # Fall back to declared dtype if introspection fails
+        return getattr(self, "_effective_dtype", None)
+
+    def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]:
+        """Send all requests to vLLM at once. vLLM handles internal batching.
+
+        total_time_ms in each returned InferenceResult is set to the wall-clock
+        elapsed time of the entire batch — NOT an individual per-request latency.
+        vLLM's sync LLM.generate() blocks until all requests finish, so there is
+        no per-request completion timestamp available. All results share the same
+        total_time_ms value, which is the correct denominator for throughput:
+            throughput = total_tokens / (elapsed_ms / 1000)
+        """
+        formatted = [self._format_prompt(r.prompt) for r in requests]
+        t_start = time.perf_counter()
+        outputs = self.llm.generate(formatted, self.sampling_params)
+        elapsed = time.perf_counter() - t_start
+
+        # Store output text for _run_accuracy_integrated()
+        self._last_accuracy_outputs = [o.outputs[0].text for o in outputs]
+
+        results = []
+        for output in outputs:
+            results.append(InferenceResult(
+                first_token_time_ms=None,
+                total_time_ms=elapsed * 1000,
+                output_tokens=len(output.outputs[0].token_ids),
+                input_tokens=len(output.prompt_token_ids),
+                success=True,
+                output_text=output.outputs[0].text,
+            ))
+        return results
+
+    async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult:
+        """Stream one request through the async engine, timing TTFT and total latency.
+
+        input_tokens is taken from the engine's reported prompt_token_ids (0 when the
+        engine yields nothing or does not report them); first_token_time_ms stays None
+        if no output ever carries a token.
+        """
+        from vllm.utils import random_uuid
+
+        formatted = self._format_prompt(request.prompt)
+        request_id = random_uuid()
+        t_start = time.perf_counter()
+        first_token_time_ms = None
+        input_tokens = output_tokens = 0
+        output_text = ""
+
+        async for output in self.engine.generate(formatted, self.sampling_params, request_id):
+            completion = output.outputs[0]
+            if first_token_time_ms is None and len(completion.token_ids) > 0:
+                first_token_time_ms = (time.perf_counter() - t_start) * 1000
+            output_tokens = len(completion.token_ids)
+            output_text = completion.text
+            # Prompt length as tokenized by the engine, matching inference_fn_offline.
+            input_tokens = len(getattr(output, "prompt_token_ids", None) or ())
+
+        total_time_ms = (time.perf_counter() - t_start) * 1000
+        return InferenceResult(
+            first_token_time_ms=first_token_time_ms, total_time_ms=total_time_ms,
+            output_tokens=output_tokens, input_tokens=input_tokens,
+            success=True, output_text=output_text,
+        )
+
+    async def inference_fn_token_stream(self, request: InferenceRequest):
+        """Yield the newly generated text of one request, piece by piece.
+
+        Used by the serve layer. The text read from each engine output is
+        treated as cumulative, so the length already emitted is tracked and
+        only the characters beyond it are yielded. Outputs that add no new
+        text are skipped, and every yielded piece is a non-empty str delta,
+        never the full accumulated string.
+        """
+        from vllm.utils import random_uuid
+
+        prompt = self._format_prompt(request.prompt)
+        stream_id = random_uuid()
+        emitted = 0  # number of characters already handed to the caller
+
+        async for step in self.engine.generate(
+            prompt, self.sampling_params, stream_id
+        ):
+            text_so_far = step.outputs[0].text
+            fresh = text_so_far[emitted:]
+            if not fresh:
+                continue
+            yield fresh
+            emitted = len(text_so_far)
+
+    def get_peak_memory_gb(self) -> Optional[float]:  # None if the CUDA query raises
+        try:
+            return torch.cuda.max_memory_allocated() / (1024 ** 3)  # bytes -> GiB
+        except Exception:
+            return None
+
+    def release_resources(self) -> None:
+        """Release vLLM engines and distributed state; each step below swallows its own errors."""
+        if self.llm is not None:
+            try:
+                del self.llm
+            except Exception:
+                pass
+            self.llm = None
+
+        if self.engine is not None:
+            try:
+                if self._loop and not self._loop.is_closed():  # shutdown is only driven on a live loop
+                    self._loop.run_until_complete(self.engine.shutdown())
+            except Exception:
+                pass
+            try:
+                del self.engine
+            except Exception:
+                pass
+            self.engine = None
+
+        # Destroy vLLM's distributed state so the next engine initialisation
+        # creates a fresh TCPStore server.  Must call destroy_model_parallel()
+        # first to clear vLLM's cached group references; only then is it safe
+        # to destroy the underlying torch process group.  Skipping this step
+        # leaves torch.distributed.is_initialized()==True, which causes
+        # init_distributed_environment() to skip creating the new TCPStore
+        # server, so spawned worker processes can never connect (→ 600 s timeout).
+        try:
+            from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
+            cleanup_dist_env_and_memory(shutdown_ray=False)
+        except Exception:
+            # Fallback for older vLLM builds that lack cleanup_dist_env_and_memory
+            try:
+                from vllm.distributed.parallel_state import (
+                    destroy_model_parallel, destroy_distributed_environment,
+                )
+                destroy_model_parallel()
+                destroy_distributed_environment()
+            except Exception:
+                pass
+
+        # Final guard: if torch.distributed is still initialized after the cleanup
+        # attempts above, destroy the default process group here.  Without this,
+        # vLLM's init_distributed_environment() skips TCPStore server creation on
+        # the next LLM() init, so new worker processes can never join the barrier
+        # (→ 1800 s Gloo timeout) because the main driver calls barrier() on the
+        # stale old group while workers wait on a fresh one that never reaches quorum.
+        try:
+            if torch.distributed.is_initialized():
+                torch.distributed.destroy_process_group()
+        except Exception:
+            pass
+
+    def parse_args(self):
+        """Parse base-runner arguments plus the vLLM/NVIDIA parallelism flags.
+
+        Sets self._enforce_eager, self._parallelism, self._chip_count and
+        self._precision as side effects and returns the base-class args.
+        """
+        args = super().parse_args()
+        cfg = self._runner_config
+
+        # These flags are vLLM/NVIDIA-specific, so they get their own parser
+        # here; anything it does not recognise is ignored.
+        import argparse
+        flag_parser = argparse.ArgumentParser(add_help=False)
+        for flag, dest in (
+            ("--tensor-parallel-size", "tensor_parallel_size"),
+            ("--pipeline-parallel-size", "pipeline_parallel_size"),
+            ("--expert-parallel-size", "expert_parallel_size"),
+        ):
+            flag_parser.add_argument(flag, type=int, default=None, dest=dest)
+        flag_parser.add_argument("--enforce-eager", action="store_true",
+                                 default=False, dest="enforce_eager")
+        cli, _unknown = flag_parser.parse_known_args()
+
+        # TP comes from _resolve_tensor_parallel_size(), with the source label printed below.
+        tp_size, tp_source = self._resolve_tensor_parallel_size(
+            cli.tensor_parallel_size
+        )
+
+        # PP / EP: an explicit CLI value wins, else the yaml value, else 1.
+        if cli.pipeline_parallel_size is not None:
+            pp_size = cli.pipeline_parallel_size
+        else:
+            pp_size = cfg.get("pipeline_parallel_size", 1)
+        if cli.expert_parallel_size is not None:
+            ep_size = cli.expert_parallel_size
+        else:
+            ep_size = cfg.get("expert_parallel_size", 1)
+        # Eager mode is on when either the CLI flag or the yaml setting asks for it.
+        self._enforce_eager = cli.enforce_eager or cfg.get("enforce_eager", False)
+
+        print(f"  tensor_parallel_size = {tp_size}  [{tp_source}]")
+        if ep_size > 1:
+            print(f"  expert_parallel_size = {ep_size}  [cli/yaml]")
+
+        if not self.SUPPORTS_MULTI_CHIP and tp_size * pp_size > 1:
+            print(f"Warning: {self.__class__.__name__} does not support multi-chip. "
+                  f"Ignoring tensor_parallel_size={tp_size}, using 1.")
+            tp_size = pp_size = ep_size = 1
+
+        # Chip count is TP x PP only; EP is recorded but not multiplied in.
+        self._parallelism = dict(
+            tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size,
+            expert_parallel_size=ep_size, data_parallel_size=1,
+        )
+        self._chip_count = tp_size * pp_size
+        self._precision = getattr(args, "precision", None)
+        return args
+
+    def get_extra_subprocess_args(self, args) -> list[str]:
+        """Build the parallelism / eager CLI flags to pass on to subprocess runs."""
+        par = self._parallelism
+        # --tensor-parallel-size is always emitted, even when it is 1.
+        forwarded = ["--tensor-parallel-size", str(par.get("tensor_parallel_size", 1))]
+        # PP and EP are only emitted when they are greater than 1.
+        for flag, key in (
+            ("--pipeline-parallel-size", "pipeline_parallel_size"),
+            ("--expert-parallel-size", "expert_parallel_size"),
+        ):
+            if par.get(key, 1) > 1:
+                forwarded.extend([flag, str(par[key])])
+        if self._enforce_eager:
+            forwarded.append("--enforce-eager")
+        return forwarded
+
+
+if __name__ == "__main__":  # only when executed as a script, not on import
+    OneCatVLLMRunner().main()  # build the runner and hand control to its main()
\ No newline at end of file

From c8d36376413097589c75bfe2ed5fce4110dc30a4 Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Mon, 18 May 2026 02:59:18 +0000
Subject: [PATCH 2/5] update readme

---
 runners/nvidia_onecat_vllm_a43d1bcf/README.md | 56 ++++++++++++++++---
 1 file changed, 47 insertions(+), 9 deletions(-)

diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/README.md b/runners/nvidia_onecat_vllm_a43d1bcf/README.md
index eb9f5c0..b358ebd 100644
--- a/runners/nvidia_onecat_vllm_a43d1bcf/README.md
+++ b/runners/nvidia_onecat_vllm_a43d1bcf/README.md
@@ -55,26 +55,64 @@ To opt into the MTP + prefix-cache profile (Qwen3.6-27B-AWQ), bump
 | Suite F — Qwen2.5-0.5B edge | Not interesting on V100 — the model fits trivially; use upstream runner. |
 | Suite G — MoE | Sweet spot — `Qwen3.6-35B-A3B-AWQ`, `Qwen3.5-122B-A10B-AWQ` are exactly the validated MoE models in 1Cat 1.0.0. |
 
-## Prerequisites
+## Environment setup
+
+1Cat-vLLM 1.0.0 ships **prebuilt wheels only** (no PyPI `vllm`). Install the
+wheels **before** `requirements.txt` — the extras file intentionally omits
+`torch` / `vllm` so it does not fight the cu128 index used by the wheels.
+
+### Validated stack (1Cat-vLLM 1.0.0)
+
+| Component | Version |
+|-----------|---------|
+| OS | Ubuntu **24.04** (glibc ≥ 2.38) |
+| Python | **3.12** (`cp312` wheels only) |
+| CUDA | **12.8** toolkit + matching driver (570.x recommended) |
+| PyTorch | **2.9.1+cu128** (pulled in by the wheels) |
+| GPU | Tesla V100 / V100S (SM70) |
+
+Upstream reference: [1Cat-vLLM releases](https://github.com/1CatAI/1Cat-vLLM/releases/tag/v1.0.0)
+and [installation guide](https://github.com/1CatAI/1Cat-vLLM#quick-start).
+
+### Ubuntu 22.04 and other older hosts
+
+The release wheels are linked against **glibc 2.38**. On Ubuntu 22.04 (glibc
+2.35), `pip install` may succeed but `import vllm` fails with
+`GLIBC_2.38 not found`. Options:
+
+- Run on **Ubuntu 24.04** (bare metal or VM), or
+- Use a **glibc ≥ 2.38 container** on the host (see the [1Cat-vLLM Docker
+  notes](https://github.com/1CatAI/1Cat-vLLM#docker-deployment) — build/run
+  on a machine where the Docker daemon is available; nested dev containers
+  without `docker.sock` bind-mount usually cannot host Docker), or
+- **Build from source** on your host glibc (see 1Cat-vLLM “Source build”).
+
+### Install steps
+
+From the AccelMark repo root, in a fresh **Python 3.12** environment:
 
 ```bash
-# 1. CUDA 12.8 toolkit + matching driver (570.x recommended)
+# 1. CUDA 12.8 toolkit + driver
 #    https://developer.nvidia.com/cuda-12-8-0-download-archive
 
-# 2. Python 3.12 (1Cat 1.0.0 ships cp312 wheels only)
-conda create -y -n 1cat-vllm-1.0.0 python=3.12
-conda activate 1cat-vllm-1.0.0
+conda create -y -n onecat-vllm python=3.12
+conda activate onecat-vllm
+python -m pip install --upgrade pip setuptools wheel
 
-# 3. Install the 1Cat-vLLM wheels
-pip install --prefer-binary --no-cache-dir \
+# 2. 1Cat-vLLM wheels (install BOTH together — do not use PyPI vllm)
+python -m pip install --prefer-binary --no-cache-dir \
     --extra-index-url https://download.pytorch.org/whl/cu128 \
     "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \
     "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl"
 
-# 4. Install AccelMark extras
-pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
+# 3. AccelMark runner extras only
+python -m pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
 ```
 
+Do **not** install `vllm` from PyPI afterward — it will replace the fork.
+Run benchmarks from a directory **outside** a cloned 1Cat-vLLM source tree so
+Python does not import the local `vllm/` package instead of the wheel.
+
 ## Smoke test the install
 
 ```bash

From a6726015b8eb4f54bdd883de049bbe5ef8073d4e Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Mon, 18 May 2026 16:48:41 +0800
Subject: [PATCH 3/5] update onecat runner

---
 README.md                                     |   2 +-
 ...r_nvidia_onecat_vllm_4a9ca6c3.yaml.example |  19 ++
 ...r_nvidia_onecat_vllm_a43d1bcf.yaml.example |  71 ------
 runners/nvidia_onecat_vllm_4a9ca6c3/README.md | 211 ++++++++++++++++++
 runners/nvidia_onecat_vllm_4a9ca6c3/meta.json |  21 ++
 .../requirements.txt                          |  17 ++
 .../runner.py                                 | 149 +------------
 runners/nvidia_onecat_vllm_a43d1bcf/README.md | 188 ----------------
 runners/nvidia_onecat_vllm_a43d1bcf/meta.json |  21 --
 .../requirements.txt                          |  58 -----
 10 files changed, 276 insertions(+), 481 deletions(-)
 create mode 100644 configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example
 delete mode 100644 configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example
 create mode 100644 runners/nvidia_onecat_vllm_4a9ca6c3/README.md
 create mode 100644 runners/nvidia_onecat_vllm_4a9ca6c3/meta.json
 create mode 100644 runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt
 rename runners/{nvidia_onecat_vllm_a43d1bcf => nvidia_onecat_vllm_4a9ca6c3}/runner.py (60%)
 delete mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/README.md
 delete mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/meta.json
 delete mode 100644 runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt

diff --git a/README.md b/README.md
index 1c4acbd..7c171ee 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
 | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
-| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_a43d1bcf` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
+| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_4a9ca6c3` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
 | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example
new file mode 100644
index 0000000..8c6c310
--- /dev/null
+++ b/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example
@@ -0,0 +1,19 @@
+# AccelMark runner config — nvidia_onecat_vllm_4a9ca6c3 (1Cat-vLLM on V100)
+# Copy to runner_nvidia_onecat_vllm_4a9ca6c3.yaml (gitignored). See runner README.
+
+tensor_parallel_size: 1
+enforce_eager: false
+max_num_seqs: 1
+gpu_memory_utilization: 0.88
+
+# V100 SM70: required for Suite A-style runs (see runner README).
+engine_kwargs:
+  enable_prefix_caching: false
+  kv_cache_auto_trim_ratio: 0.0
+
+suites:
+  suite_D:
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.85
+  suite_C:
+    max_num_seqs: 1
diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example
deleted file mode 100644
index d39949e..0000000
--- a/configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example
+++ /dev/null
@@ -1,71 +0,0 @@
-# AccelMark runner config — nvidia_onecat_vllm_a43d1bcf (1Cat-vLLM on V100)
-#
-# Copy this file to runner_nvidia_onecat_vllm_a43d1bcf.yaml (remove the
-# .example suffix) and adjust for your hardware. The actual .yaml is
-# gitignored.
-#
-# These settings adapt the runner to your hardware environment. They are
-# recorded in result.json task.extra_config but are NOT part of the
-# benchmark identity (not hashed into run_id).
-#
-# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
-
-# ── Global defaults (apply to all suites) ─────────────────────────────────────
-
-# Tensor parallel size — number of V100 cards (1Cat-vLLM 1.0.0 public ref:
-# 4 x V100 32 GB). For 256K context the recommended TP is 4 on 32 GB cards.
-tensor_parallel_size: 4
-
-# Disable Volta CUDA-graph capture. Set true if you hit Triton sm>=80 errors.
-# 1Cat-vLLM normally captures graphs without --enforce-eager — leave false
-# unless you've observed startup hangs on the first request.
-enforce_eager: false
-
-# 1Cat 1.0.0 public default is 1 (low-concurrency stable serving).
-# Bump to 4 only for the MTP + prefix-cache profile.
-max_num_seqs: 1
-
-# 1Cat 1.0.0 public default. Reduce to 0.85 if you see OOM at engine init.
-gpu_memory_utilization: 0.88
-
-# Pass-through kwargs forwarded to vLLM LLM() / AsyncEngineArgs(). 1Cat-vLLM
-# recognises `attention_backend` and a few extra knobs (kv-cache-auto-trim-ratio,
-# compilation-config, speculative-config). Unknown keys are dropped at engine
-# init with a warning, so this is safe across vLLM 0.7.x / 1Cat-1.0.0.
-#
-# Defaults left commented out below — the runner auto-injects
-# `attention_backend: FLASH_ATTN_V100` if you don't set it here or via the
-# VLLM_ATTENTION_BACKEND environment variable.
-#
-# engine_kwargs:
-#   attention_backend: FLASH_ATTN_V100        # auto-injected by the runner
-#   kv_cache_auto_trim_ratio: 0.0             # disables 1Cat KV auto-trim (MTP profile)
-#   mamba_cache_mode: align                   # required for Qwen3.6 hybrid Mamba
-#   compilation_config:
-#     cudagraph_mode: full_and_piecewise
-#     cudagraph_capture_sizes: [1, 2, 4, 8]
-
-# ── Suite-specific overrides ───────────────────────────────────────────────────
-
-suites:
-  suite_D:
-    # Long-context — keep max_num_seqs low and reserve more memory.
-    max_num_seqs: 1
-    gpu_memory_utilization: 0.85
-
-  suite_C:
-    # Quantization suite — AWQ is the primary 1Cat target.
-    # max_model_len: 12288 reproduces the 1Cat 1.0.0 internal speed harness.
-    max_num_seqs: 1
-
-  # MTP profile for Qwen3.6-27B-AWQ — copy this and uncomment to use.
-  # suite_A:
-  #   max_num_seqs: 4
-  #   engine_kwargs:
-  #     enable_prefix_caching: true
-  #     speculative_config:
-  #       method: mtp
-  #       num_speculative_tokens: 4
-  #     compilation_config:
-  #       cudagraph_mode: full_and_piecewise
-  #       cudagraph_capture_sizes: [1, 2, 4, 8, 9, 18]
diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/README.md b/runners/nvidia_onecat_vllm_4a9ca6c3/README.md
new file mode 100644
index 0000000..d29b0e1
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_4a9ca6c3/README.md
@@ -0,0 +1,211 @@
+# nvidia_onecat_vllm_4a9ca6c3 — 1Cat-vLLM Runner (Tesla V100 / SM70)
+
+AccelMark runner for **Tesla V100 / V100S only**, using
+[1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) (community vLLM fork for Volta).
+
+> **Hardware:** Use this runner only on V100 / V100S (SM70). On Ampere or newer,
+> use upstream `nvidia_vllm_*`.
+
+> **Third-party software:** 1Cat-vLLM is maintained by [1CatAI](https://github.com/1CatAI/1Cat-vLLM)
+> under its own license. AccelMark ships only the thin `runner.py` wrapper; install
+> 1Cat-vLLM separately as described below.
+
+## Why 1Cat-vLLM
+
+| Limitation on stock vLLM + V100 | 1Cat-vLLM |
+|--------------------------------|-----------|
+| AWQ kernels need SM75+ | SM70 AWQ via lmdeploy TurboMind |
+| FlashAttention 2/3 need Ampere+ | `FLASH_ATTN_V100` backend |
+| Qwen3.5 / Qwen3.6 on V100 | Fork model/runtime fixes |
+| Long-context on Volta | SM70 paged-attention path |
+
+Release notes: [1Cat-vLLM v1.0.0](https://github.com/1CatAI/1Cat-vLLM/releases/tag/v1.0.0).
+
+## Runner defaults (code)
+
+| Setting | Default |
+|---------|---------|
+| `attention_backend` | `FLASH_ATTN_V100` (auto unless overridden) |
+| `SUPPORTED_PRECISIONS` | `fp16`, `fp32` (no BF16 on V100) |
+| `SUPPORTED_QUANTIZATION_BACKENDS` | `awq` only |
+| `max_num_seqs` | `1` (via runner config) |
+| `gpu_memory_utilization` | `0.88` |
+
+## Supported suites
+
+| Suite | Notes |
+|-------|-------|
+| A | Runs on 1× V100; upstream `nvidia_vllm_*` + `--enforce-eager` is often enough |
+| B | **Primary** — use `--tensor-parallel-size 4` on 4× V100 32GB |
+| C | **Primary** — AWQ |
+| D | **Primary** — long context + `FLASH_ATTN_V100` |
+| E | Multi-chip scaling (same TP guidance as B) |
+| F | Not recommended (edge model; use upstream runner) |
+| G | **Primary** — MoE + AWQ (Qwen3.5/3.6 class models) |
+
+---
+
+## Environment setup
+
+### Reference stack (1Cat-vLLM 1.0.0)
+
+| Component | Version |
+|-----------|---------|
+| GPU | Tesla V100 / V100S (SM70) |
+| Python | **3.12** (`cp312` wheels only) |
+| CUDA toolkit | **12.8** |
+| Driver | 570.x recommended (CUDA 12.8) |
+| PyTorch | **2.9.1+cu128** (from 1Cat wheels or build env) |
+
+### Path A — Prebuilt wheels (Ubuntu 24.04+, glibc ≥ 2.38)
+
+Official wheels require **glibc 2.38+** (e.g. Ubuntu 24.04). On Ubuntu 22.04,
+`pip install` may succeed but `import vllm` fails with `GLIBC_2.38 not found`
+— use Path B instead.
+
+```bash
+conda create -y -n onecat-vllm python=3.12
+conda activate onecat-vllm
+python -m pip install --upgrade pip setuptools wheel
+
+# Install BOTH wheels together — never `pip install vllm` from PyPI
+python -m pip install --prefer-binary --no-cache-dir \
+    --extra-index-url https://download.pytorch.org/whl/cu128 \
+    "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \
+    "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl"
+
+cd /path/to/AccelMark
+pip install -r runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt
+```
+
+### Path B — Build from source (Ubuntu 22.04 / glibc 2.35)
+
+Build on the **host glibc** so binaries link against 2.35. Typical AutoDL /
+Ubuntu 22.04 V100 boxes use this path.
+
+**Prerequisites:** CUDA 12.8 toolkit (`nvcc` on PATH), conda Python 3.12, ~20GB
+free disk for build tree + wheels.
+
+```bash
+conda create -y -n onecat-vllm python=3.12
+conda activate onecat-vllm
+export CUDA_HOME=/usr/local/cuda-12.8
+export PATH="$CUDA_HOME/bin:$PATH"
+export TORCH_CUDA_ARCH_LIST="7.0"
+export MAX_JOBS=6
+export PIP_CACHE_DIR=/path/to/fast/disk/pip-cache   # optional
+
+git clone --depth 1 --branch v1.0.0 https://github.com/1CatAI/1Cat-vLLM.git
+cd 1Cat-vLLM
+pip install -r requirements/build.txt -r requirements/cuda.txt -r requirements/common.txt
+pip install cmake build ninja
+
+DIST=/path/to/dist-cu128-sm70-v1.0.0
+mkdir -p "$DIST"
+
+# 1) flash_attn_v100 wheel
+pushd flash-attention-v100
+python -m build --wheel --no-isolation --outdir "$DIST"
+popd
+
+# 2) vllm wheel (30–90 min on V100 host)
+export VLLM_TARGET_DEVICE=cuda
+python -m build --wheel --no-isolation --outdir "$DIST"
+
+# 3) Install — run from /tmp so Python does not import the source tree
+pip install "$DIST"/flash_attn_v100-*.whl
+cd /tmp && pip install --no-deps --force-reinstall "$DIST"/vllm-*.whl
+
+cd /path/to/AccelMark
+pip install -r runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt
+```
+
+Do **not** run AccelMark from inside the cloned `1Cat-vLLM/` directory; Python
+may import the local `vllm/` package instead of the installed wheel.
+
+### Smoke test
+
+Run from `/tmp` or the AccelMark repo root (not inside `1Cat-vLLM/`):
+
+```bash
+python - <<'PY'
+import torch, vllm
+print("torch:", torch.__version__, "vllm:", vllm.__version__)
+import flash_attn_v100_cuda
+print("flash_attn_v100: ok")
+from vllm import LLM
+print("LLM import: ok")
+PY
+```
+
+---
+
+## AccelMark runner config (required on V100)
+
+Copy and edit:
+
+```bash
+cp configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example \
+   configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml
+```
+
+**Single V100 32GB** — recommended `engine_kwargs` (disabling prefix caching avoids the
+prefill shared-memory crash on SM70: `Shared memory exceeds 96KB`):
+
+```yaml
+tensor_parallel_size: 1
+max_num_seqs: 1
+gpu_memory_utilization: 0.88
+engine_kwargs:
+  enable_prefix_caching: false
+  kv_cache_auto_trim_ratio: 0.0
+```
+
+**4× V100 32GB** — set `tensor_parallel_size: 4`; keep the same `engine_kwargs`
+unless you are deliberately testing 1Cat's MTP / prefix-cache profile (see the
+upstream 1Cat-vLLM documentation; the example file does not cover it).
+
+Other tuning:
+
+| Symptom | Try |
+|---------|-----|
+| First request hangs (CUDA graph) | `enforce_eager: true` or `--enforce-eager` |
+| OOM at engine init | Lower `gpu_memory_utilization` (e.g. `0.85`) |
+| `GLIBC_2.38 not found` | Path B source build, or Ubuntu 24.04+ |
+
+---
+
+## Basic usage
+
+```bash
+cp configs/submitter.yaml.example configs/submitter.yaml   # once
+cp configs/models_local.yaml.example configs/models_local.yaml   # map local model paths
+
+export PYTHONPATH=/path/to/AccelMark   # if pip install -e . is unavailable
+
+# Suite A smoke (1× V100)
+python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \
+    --suite suite_A --scenario accuracy --tensor-parallel-size 1
+
+# Suite B (4× V100)
+python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \
+    --suite suite_B --tensor-parallel-size 4
+```
+
+---
+
+## Known limitations
+
+- Prefix caching + chunked paged prefill can exceed V100's 96KB shared memory per SM;
+  disable `enable_prefix_caching` (see config above).
+- `max_num_seqs: 1` limits batch throughput vs upstream vLLM defaults — intentional
+  for 1Cat's long-context V100 profile.
+- Suite F is marked unsupported in `meta.json` (use upstream runner on V100 if needed).
+- End-to-end validation on 4× V100 reference hardware is still pending (supported suites are
+  marked `pending` in `meta.json`); only a single-GPU Suite A accuracy smoke test has been run on V100 32GB.
+
+## Requirements
+
+See `requirements.txt`. Install `torch`, `flash_attn_v100`, and the `vllm` fork
+from 1Cat-vLLM **before** the AccelMark extras file. Do not install upstream
+`vllm` from PyPI after the fork.
diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json b/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json
new file mode 100644
index 0000000..b3d136e
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json
@@ -0,0 +1,21 @@
+{
+  "id": "nvidia_onecat_vllm_4a9ca6c3",
+  "platform": "nvidia",
+  "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA",
+  "framework": "1Cat-vLLM",
+  "submitted_by": "JuhaoLiang1997",
+  "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — community vLLM fork with FLASH_ATTN_V100 and SM70 AWQ kernels. Use nvidia_vllm_* on Ampere or newer.",
+  "supersedes_chain": ["nvidia_onecat_vllm_a43d1bcf"],
+  "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless overridden. V100: disable prefix caching in runner config (see README). External dependency: https://github.com/1CatAI/1Cat-vLLM",
+  "created": "2026-05-15",
+  "hardware_label": "NVIDIA V100 (SM70)",
+  "suite_support": {
+    "A": "pending",
+    "B": "pending",
+    "C": "pending",
+    "D": "pending",
+    "E": "pending",
+    "F": "unsupported",
+    "G": "pending"
+  }
+}
diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt b/runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt
new file mode 100644
index 0000000..b6d4c62
--- /dev/null
+++ b/runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt
@@ -0,0 +1,17 @@
+# AccelMark extras for nvidia_onecat_vllm_4a9ca6c3.
+# Install 1Cat-vLLM (flash_attn_v100 + vllm fork) first — see README.md.
+
+transformers==4.57.6
+tokenizers==0.22.2
+huggingface-hub==0.35.0
+accelerate==1.10.1
+safetensors==0.6.2
+
+numpy==1.26.4
+jsonschema==4.25.1
+psutil==7.1.0
+tqdm==4.67.1
+
+nvidia-ml-py==13.580.82
+aiohttp==3.12.15
+PyYAML==6.0.2
diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/runner.py b/runners/nvidia_onecat_vllm_4a9ca6c3/runner.py
similarity index 60%
rename from runners/nvidia_onecat_vllm_a43d1bcf/runner.py
rename to runners/nvidia_onecat_vllm_4a9ca6c3/runner.py
index 56002bb..861b352 100644
--- a/runners/nvidia_onecat_vllm_a43d1bcf/runner.py
+++ b/runners/nvidia_onecat_vllm_4a9ca6c3/runner.py
@@ -1,31 +1,8 @@
 """
 AccelMark — NVIDIA 1Cat-vLLM (SM70 / V100) benchmark script.
 
-Implements BenchmarkRunner for `1Cat-vLLM`, the community fork of vLLM that
-re-enables modern model serving on Tesla V100 / SM70 hardware. The fork
-preserves the standard vLLM Python API (``LLM``, ``AsyncLLMEngine``,
-``SamplingParams``) but ships several SM70-specific pieces:
-
-  - the ``FLASH_ATTN_V100`` attention backend (FlashAttention re-implemented
-    for Volta) — set as this runner's default
-  - AWQ 4-bit kernels patched in from lmdeploy TurboMind for SM70
-  - validated paths for Qwen3.5 / Qwen3.6 dense + MoE on 4 x V100 32 GB
-  - FP8 KV cache (``fp8_e5m2``) as an experimental option
-
-Because the Python entry points are identical to upstream vLLM, this runner
-is structurally a clone of ``nvidia_vllm_*`` with three runtime overrides:
-
-  1. ``SUPPORTED_PRECISIONS`` drops BF16 (V100 cannot do BF16 in hardware).
-  2. ``SUPPORTED_QUANTIZATION_BACKENDS`` advertises only ``awq`` — the fork's
-     headline feature; other quantizations are unproven on this stack.
-  3. ``load_model()`` auto-injects ``attention_backend="FLASH_ATTN_V100"``
-     into the engine kwargs unless the user has explicitly set one.
-
-Reference build:
-    Python 3.12 · CUDA 12.8 · torch 2.9.1+cu128 · 1Cat-vLLM 1.0.0 wheels
-    Validated on 4 x Tesla PG503 / V100 32 GB.
-
-All orchestration logic still lives in runners/benchmark_runner.py.
+Thin vLLM runner wrapper for the 1Cat-vLLM fork on Tesla V100 / V100S.
+See README.md in this folder for install, hardware scope, and tuning.
 """
 
 import asyncio
@@ -34,7 +11,6 @@
 from pathlib import Path
 from typing import Optional
 
-# Add repo root to path
 _REPO_ROOT = Path(__file__).resolve().parent.parent.parent
 sys.path.insert(0, str(_REPO_ROOT))
 
@@ -47,37 +23,20 @@
 from loadgen.types import InferenceResult
 
 
-
-# Suppress per-request vLLM logs by default
 import logging
 logging.getLogger("vllm.engine.async_llm_engine").setLevel(logging.WARNING)
 logging.getLogger("vllm.engine.llm_engine").setLevel(logging.WARNING)
 
 
 class OneCatVLLMRunner(BenchmarkRunner):
-    """
-    AccelMark benchmark runner using 1Cat-vLLM (SM70 / V100 fork) on NVIDIA.
-
-    This runner is intended **specifically for Tesla V100 / V100S (SM70)**.
-    Other NVIDIA GPUs should use the upstream ``nvidia_vllm_*`` runner —
-    1Cat-vLLM's kernels are tuned for Volta and provide no advantage on
-    Ampere or newer hardware.
-    """
+    """1Cat-vLLM on NVIDIA V100 / V100S (SM70). Use nvidia_vllm_* on newer GPUs."""
 
     SUPPORTS_STREAMING = True
     SUPPORTS_BATCHING = True
     SUPPORTS_ONLINE = True
     SUPPORTS_MULTI_CHIP = True
 
-    # V100 has no BF16 datapath — drop BF16 entirely so the suite picks
-    # FP16 as the effective precision without an amber warning. FP32 is left
-    # in for completeness but is essentially never used in inference.
     SUPPORTED_PRECISIONS = ["fp16", "fp32"]
-
-    # 1Cat-vLLM's flagship contribution is AWQ 4-bit on SM70 (via lmdeploy
-    # TurboMind kernels). Other quantization backends are not validated on
-    # this stack — keep the surface conservative and let users opt-in by
-    # subclassing if they want to try FP8 KV cache (``fp8_e5m2`` only).
     SUPPORTED_QUANTIZATION_BACKENDS = ["awq"]
 
     def __init__(self):
@@ -88,7 +47,6 @@ def __init__(self):
         self._loop: asyncio.AbstractEventLoop = None
 
     def _get_chip_count(self) -> int:
-        """Return the number of available CUDA GPUs."""
         try:
             import torch
             n = torch.cuda.device_count()
@@ -100,12 +58,6 @@ def _get_framework_name(self) -> str:
         return "1Cat-vLLM"
 
     def _get_framework_version(self) -> str:
-        """Report vllm.__version__ plus the flash_attn_v100 wheel version.
-
-        1Cat-vLLM ships two coupled wheels (`vllm` patched fork + `flash_attn_v100`)
-        and the FA-V100 wheel is the bit that actually changes attention
-        performance on Volta. Recording both makes the result reproducible.
-        """
         core = "unknown"
         try:
             import vllm
@@ -129,7 +81,6 @@ def _get_framework_version(self) -> str:
         return core
 
     def load_model(self, model_path: str, parallelism: dict) -> None:
-        """Load model — sync LLM for offline/accuracy, async engine for streaming."""
         tp_size = parallelism["tensor_parallel_size"]
         pp_size = parallelism["pipeline_parallel_size"]
         ep_size = parallelism.get("expert_parallel_size", 1)
@@ -141,22 +92,10 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
         enforce_eager = getattr(self, "_enforce_eager", False)
 
         cfg             = getattr(self, "_runner_config", {})
-        # 1Cat-vLLM public defaults for 4 x V100 32 GB:
-        #   max_num_seqs = 1 (baseline) or 4 (MTP profile)
-        #   gpu_memory_utilization = 0.88
-        # These differ noticeably from upstream vLLM's 512 / 0.90 defaults
-        # because 256K context on V100 demands a much tighter KV budget.
         max_num_seqs    = cfg.get("max_num_seqs", 1)
         gpu_memory_util = cfg.get("gpu_memory_utilization", 0.88)
         extra_kwargs    = dict(cfg.get("engine_kwargs") or {})
 
-        # ── Default to FLASH_ATTN_V100 attention backend ──────────────────────
-        # 1Cat-vLLM's public recommendation for V100 is FLASH_ATTN_V100. Inject
-        # it as the default unless the user explicitly set a backend in their
-        # runner config engine_kwargs, or via the VLLM_ATTENTION_BACKEND
-        # environment variable. We support both spellings because vLLM accepts
-        # the kwarg as `attention_backend` and the env var as
-        # VLLM_ATTENTION_BACKEND.
         import os
         if (
             "attention_backend" not in extra_kwargs
@@ -164,10 +103,6 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
         ):
             extra_kwargs["attention_backend"] = "FLASH_ATTN_V100"
 
-        # ── Filter engine_kwargs to only fields this vLLM version accepts ─────
-        # Avoids TypeError when the runner config YAML references a field that
-        # doesn't exist in the installed vLLM version (EngineArgs is a strict
-        # dataclass — unknown keyword arguments raise TypeError immediately).
         try:
             import dataclasses
             from vllm.engine.arg_utils import EngineArgs as _EngineArgs
@@ -178,43 +113,29 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
                       f"1Cat-vLLM version and will be ignored: {list(_dropped)}")
             extra_kwargs = {k: v for k, v in extra_kwargs.items() if k in _valid}
         except Exception:
-            pass  # If introspection fails, pass kwargs as-is and let vLLM report the error
+            pass
 
-        # Use precision resolved by BenchmarkRunner._resolve_precision()
         effective_precision = getattr(self, "_effective_precision", "BF16").upper()
         precision           = getattr(self, "_precision", None) or effective_precision
 
-        # dtype_override and quantization may be injected by benchmark_runner from
-        # precision_model_map entry fields (dtype_override, engine_kwargs.quantization).
-        # These take priority over the runner's own precision→dtype mapping below.
         _dtype_override  = getattr(self, "_precision_dtype_override", None)
         _prec_eng_kwargs = dict(getattr(self, "_precision_engine_kwargs", None) or {})
 
         quantization = _prec_eng_kwargs.pop("quantization", None)
 
-        # Map native precision names to explicit dtypes.
-        # Quantized formats (anything not in this map) use dtype="auto" — vLLM reads
-        # the storage dtype from the checkpoint's config.json, and the quantization
-        # kernel is set explicitly via the `quantization` kwarg already populated above
-        # from precision_model_map engine_kwargs. No fallback guessing needed here.
         _NATIVE_DTYPE_MAP = {
             "BF16":  "bfloat16",
             "FP16":  "float16",
             "FP32":  "float32",
         }
         dtype = _NATIVE_DTYPE_MAP.get(precision, "auto")
-        self._quantization_method = quantization  # None for native, explicit str for quantized
+        self._quantization_method = quantization
 
-        # dtype_override from precision_model_map wins over the mapping above.
-        # Used for e.g. FP16 baseline on pre-Ampere hardware (V100/T4).
         if _dtype_override:
             dtype = _dtype_override
 
-        # Merge remaining precision_engine_kwargs (after popping quantization) into
-        # extra_kwargs so they reach LLM() / AsyncEngineArgs. Runner YAML engine_kwargs
-        # still take final precedence via the **extra_kwargs spread at the end.
         if _prec_eng_kwargs:
-            _prec_eng_kwargs.update(extra_kwargs)   # runner YAML wins on conflict
+            _prec_eng_kwargs.update(extra_kwargs)
             extra_kwargs = _prec_eng_kwargs
 
         print(f"Loading model: precision={precision}, dtype={dtype}"
@@ -259,8 +180,6 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
                 trust_remote_code=False,
                 enforce_eager=enforce_eager,
                 gpu_memory_utilization=gpu_memory_util,
-                # engine_kwargs values override named fields above if the same key appears in both.
-                # This is intentional — engine_kwargs is the power-user escape hatch.
                 **extra_kwargs,
             )
             if ep_size > 1:
@@ -271,42 +190,23 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
             self.engine = AsyncLLMEngine.from_engine_args(engine_args)
 
     def get_effective_dtype(self) -> Optional[str]:
-        """
-        Report the actual compute dtype vLLM used after model loading.
-
-        vLLM exposes the resolved dtype via model_config after initialization.
-        This captures cases like FP8 weights on A100 computing in BF16.
-        """
         try:
             if self.llm is not None:
-                # Sync LLM path
                 dtype = self.llm.llm_engine.model_config.dtype
                 return str(dtype).replace("torch.", "")
             elif self.engine is not None:
-                # Async engine path
                 dtype = self.engine.engine.model_config.dtype
                 return str(dtype).replace("torch.", "")
         except Exception:
             pass
-        # Fall back to declared dtype if introspection fails
         return getattr(self, "_effective_dtype", None)
 
     def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[InferenceResult]:
-        """Send all requests to vLLM at once. vLLM handles internal batching.
-
-        total_time_ms in each returned InferenceResult is set to the wall-clock
-        elapsed time of the entire batch — NOT an individual per-request latency.
-        vLLM's sync LLM.generate() blocks until all requests finish, so there is
-        no per-request completion timestamp available. All results share the same
-        total_time_ms value, which is the correct denominator for throughput:
-            throughput = total_tokens / (elapsed_ms / 1000)
-        """
         formatted = [self._format_prompt(r.prompt) for r in requests]
         t_start = time.perf_counter()
         outputs = self.llm.generate(formatted, self.sampling_params)
         elapsed = time.perf_counter() - t_start
 
-        # Store output text for _run_accuracy_integrated()
         self._last_accuracy_outputs = [o.outputs[0].text for o in outputs]
 
         results = []
@@ -322,7 +222,6 @@ def inference_fn_offline(self, requests: list[InferenceRequest]) -> list[Inferen
         return results
 
     async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceResult:
-        """Stream a single request, measuring TTFT."""
         from vllm.utils import random_uuid
 
         formatted = self._format_prompt(request.prompt)
@@ -354,15 +253,6 @@ async def inference_fn_streaming(self, request: InferenceRequest) -> InferenceRe
         )
 
     async def inference_fn_token_stream(self, request: InferenceRequest):
-        """
-        Async generator yielding decoded text deltas for the serve layer.
-
-        Each yield is the delta text since the last output — new characters
-        only, not the full accumulated string.
-
-        vLLM's engine.generate() yields cumulative outputs, so we track the
-        previous text length and slice off only the new portion each step.
-        """
         from vllm.utils import random_uuid
 
         formatted   = self._format_prompt(request.prompt)
@@ -385,7 +275,6 @@ def get_peak_memory_gb(self) -> float:
             return None
 
     def release_resources(self) -> None:
-        """Release vLLM engines and distributed state."""
         if self.llm is not None:
             try:
                 del self.llm
@@ -405,18 +294,10 @@ def release_resources(self) -> None:
                 pass
             self.engine = None
 
-        # Destroy vLLM's distributed state so the next engine initialisation
-        # creates a fresh TCPStore server.  Must call destroy_model_parallel()
-        # first to clear vLLM's cached group references; only then is it safe
-        # to destroy the underlying torch process group.  Skipping this step
-        # leaves torch.distributed.is_initialized()==True, which causes
-        # init_distributed_environment() to skip creating the new TCPStore
-        # server, so spawned worker processes can never connect (→ 600 s timeout).
         try:
             from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
             cleanup_dist_env_and_memory(shutdown_ray=False)
         except Exception:
-            # Fallback for older vLLM builds that lack cleanup_dist_env_and_memory
             try:
                 from vllm.distributed.parallel_state import (
                     destroy_model_parallel, destroy_distributed_environment,
@@ -426,12 +307,6 @@ def release_resources(self) -> None:
             except Exception:
                 pass
 
-        # Final guard: if torch.distributed is still initialized after the cleanup
-        # attempts above, destroy the default process group here.  Without this,
-        # vLLM's init_distributed_environment() skips TCPStore server creation on
-        # the next LLM() init, so new worker processes can never join the barrier
-        # (→ 1800 s Gloo timeout) because the main driver calls barrier() on the
-        # stale old group while workers wait on a fresh one that never reaches quorum.
         try:
             if torch.distributed.is_initialized():
                 torch.distributed.destroy_process_group()
@@ -439,12 +314,9 @@ def release_resources(self) -> None:
             pass
 
     def parse_args(self):
-        """Add vLLM/NVIDIA-specific CLI flags. Base class pre-loads runner config."""
         args = super().parse_args()
         cfg = self._runner_config
 
-        # ── Runner-specific CLI flags ─────────────────────────────────────────
-        # Defined here (not in benchmark_runner) — vLLM/NVIDIA-specific concepts.
         import argparse
         parser = argparse.ArgumentParser(add_help=False)
         parser.add_argument("--tensor-parallel-size", type=int, default=None,
@@ -457,8 +329,6 @@ def parse_args(self):
                             dest="enforce_eager")
         extra, _ = parser.parse_known_args()
 
-        # Priority: CLI flag > yaml config > required_chips > auto-detected > default 1
-        # Fully resolved by base class.
         tp_size, _tp_source = self._resolve_tensor_parallel_size(
             extra.tensor_parallel_size
         )
@@ -469,7 +339,6 @@ def parse_args(self):
         ep_size = (extra.expert_parallel_size
                    if extra.expert_parallel_size is not None
                    else cfg.get("expert_parallel_size", 1))
-        # enforce_eager: CLI flag OR yaml setting (either activates it)
         self._enforce_eager = extra.enforce_eager or cfg.get("enforce_eager", False)
 
         print(f"  tensor_parallel_size = {tp_size}  [{_tp_source}]")
@@ -483,9 +352,6 @@ def parse_args(self):
             pp_size = 1
             ep_size = 1
 
-        # Report to base class — used by _compute_run_id(), _build_result_json(), etc.
-        # Note: for MoE with expert parallelism, chips are shared between TP and EP
-        # dimensions — ep_size does not add to chip count independently.
         self._parallelism = {
             "tensor_parallel_size":   tp_size,
             "pipeline_parallel_size": pp_size,
@@ -497,7 +363,6 @@ def parse_args(self):
         return args
 
     def get_extra_subprocess_args(self, args) -> list[str]:
-        """Forward vLLM/NVIDIA-specific flags to subprocess invocations."""
         extra = [
             "--tensor-parallel-size",
             str(self._parallelism.get("tensor_parallel_size", 1)),
@@ -514,4 +379,4 @@ def get_extra_subprocess_args(self, args) -> list[str]:
 
 
 if __name__ == "__main__":
-    OneCatVLLMRunner().main()
\ No newline at end of file
+    OneCatVLLMRunner().main()
diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/README.md b/runners/nvidia_onecat_vllm_a43d1bcf/README.md
deleted file mode 100644
index b358ebd..0000000
--- a/runners/nvidia_onecat_vllm_a43d1bcf/README.md
+++ /dev/null
@@ -1,188 +0,0 @@
-# nvidia_onecat_vllm_a43d1bcf — 1Cat-vLLM Runner (Tesla V100 / SM70)
-
-AccelMark runner for **Tesla V100 / V100S only**, using
-[1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) — the community vLLM fork
-that re-enables modern AWQ 4-bit serving and FlashAttention on Volta GPUs
-(SM70).
-
-> **Hardware scope:** This runner is intentionally narrow. On Ampere
-> (A100/A800/A10/L4/4090/etc.) or newer, use the upstream
-> `nvidia_vllm_*` runner — 1Cat-vLLM's kernels are tuned for SM70 and
-> provide no benefit on later architectures.
-
-> **Status:** Committed without an end-to-end validation run yet. The runner
-> code is a thin specialisation of the upstream NVIDIA vLLM runner (only
-> capability flags + attention-backend default differ), so existing test
-> coverage of the parent runner applies. Plan to add a reference
-> `Tesla V100-SXM2-32GBx4 suite_B` result once a target box is available.
-
-## Why 1Cat-vLLM exists
-
-| Pain on stock vLLM + V100 | 1Cat-vLLM's fix |
-|---|---|
-| AWQ kernels require SM75+ | Integrated lmdeploy TurboMind WMMA kernels for SM70 |
-| FlashAttention 2/3 require Ampere+ | Custom `FLASH_ATTN_V100` Volta backend |
-| Qwen3.5 / Qwen3.6 dense + MoE not loadable | Model configs and runtime fixes shipped in fork |
-| Long-context paged-prefill stability | SM70-specific MLA/GDN runtime fixes |
-| FP8 KV cache | `fp8_e5m2` (experimental) on V100 FA path |
-
-For full release notes see
-<https://github.com/1CatAI/1Cat-vLLM> RELEASE_NOTES_1.0.0.md.
-
-## Defaults this runner injects
-
-| Knob | Default | Where set | Why |
-|---|---|---|---|
-| `attention_backend` | `FLASH_ATTN_V100` | `load_model()` if not already specified | 1Cat-vLLM's recommended V100 path |
-| `SUPPORTED_PRECISIONS` | `["fp16", "fp32"]` | class attribute | V100 has no BF16 |
-| `SUPPORTED_QUANTIZATION_BACKENDS` | `["awq"]` | class attribute | 1Cat's headline kernel; other formats not validated on this stack |
-| `max_num_seqs` | `1` | runner config default | 1Cat 1.0.0 public default — 256K context on V100 |
-| `gpu_memory_utilization` | `0.88` | runner config default | 1Cat 1.0.0 public default |
-
-To opt into the MTP + prefix-cache profile (Qwen3.6-27B-AWQ), bump
-`max_num_seqs` to `4` and pass `speculative_config` via the runner config
-`engine_kwargs` — see the example config file.
-
-## Supported suites
-
-| Suite | Recommendation |
-|-------|---------------|
-| Suite A — Llama-3-8B 1× | Runs, but vanilla `nvidia_vllm_47f5d58e --enforce-eager` already covers this. Use 1Cat only if you want the FA-V100 attention path. |
-| Suite B — Llama-3-70B multi-chip | **Primary target.** Recommended `--tensor-parallel-size 4`. |
-| Suite C — Quantization | Restricted to AWQ — this is where 1Cat shines. |
-| Suite D — Long context (~28K) | **Primary target.** `FLASH_ATTN_V100` is the only V100-friendly long-context path. |
-| Suite E — Scaling | Same considerations as Suite B; useful for measuring how 1Cat's MCCL-equivalent scales. |
-| Suite F — Qwen2.5-0.5B edge | Not interesting on V100 — the model fits trivially; use upstream runner. |
-| Suite G — MoE | Sweet spot — `Qwen3.6-35B-A3B-AWQ`, `Qwen3.5-122B-A10B-AWQ` are exactly the validated MoE models in 1Cat 1.0.0. |
-
-## Environment setup
-
-1Cat-vLLM 1.0.0 ships **prebuilt wheels only** (no PyPI `vllm`). Install the
-wheels **before** `requirements.txt` — the extras file intentionally omits
-`torch` / `vllm` so it does not fight the cu128 index used by the wheels.
-
-### Validated stack (1Cat-vLLM 1.0.0)
-
-| Component | Version |
-|-----------|---------|
-| OS | Ubuntu **24.04** (glibc ≥ 2.38) |
-| Python | **3.12** (`cp312` wheels only) |
-| CUDA | **12.8** toolkit + matching driver (570.x recommended) |
-| PyTorch | **2.9.1+cu128** (pulled in by the wheels) |
-| GPU | Tesla V100 / V100S (SM70) |
-
-Upstream reference: [1Cat-vLLM releases](https://github.com/1CatAI/1Cat-vLLM/releases/tag/v1.0.0)
-and [installation guide](https://github.com/1CatAI/1Cat-vLLM#quick-start).
-
-### Ubuntu 22.04 and other older hosts
-
-The release wheels are linked against **glibc 2.38**. On Ubuntu 22.04 (glibc
-2.35), `pip install` may succeed but `import vllm` fails with
-`GLIBC_2.38 not found`. Options:
-
-- Run on **Ubuntu 24.04** (bare metal or VM), or
-- Use a **glibc ≥ 2.38 container** on the host (see the [1Cat-vLLM Docker
-  notes](https://github.com/1CatAI/1Cat-vLLM#docker-deployment) — build/run
-  on a machine where the Docker daemon is available; nested dev containers
-  without `docker.sock` bind-mount usually cannot host Docker), or
-- **Build from source** on your host glibc (see 1Cat-vLLM “Source build”).
-
-### Install steps
-
-From the AccelMark repo root, in a fresh **Python 3.12** environment:
-
-```bash
-# 1. CUDA 12.8 toolkit + driver
-#    https://developer.nvidia.com/cuda-12-8-0-download-archive
-
-conda create -y -n onecat-vllm python=3.12
-conda activate onecat-vllm
-python -m pip install --upgrade pip setuptools wheel
-
-# 2. 1Cat-vLLM wheels (install BOTH together — do not use PyPI vllm)
-python -m pip install --prefer-binary --no-cache-dir \
-    --extra-index-url https://download.pytorch.org/whl/cu128 \
-    "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \
-    "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl"
-
-# 3. AccelMark runner extras only
-python -m pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
-```
-
-Do **not** install `vllm` from PyPI afterward — it will replace the fork.
-Run benchmarks from a directory **outside** a cloned 1Cat-vLLM source tree so
-Python does not import the local `vllm/` package instead of the wheel.
-
-## Smoke test the install
-
-```bash
-python - <<'PY'
-import torch, vllm
-print("torch:", torch.__version__, "  vllm:", vllm.__version__)
-try:
-    import flash_attn_v100_cuda  # SM70 FA kernels
-    print("flash_attn_v100: ok")
-except Exception as e:
-    print("flash_attn_v100: MISSING ->", e)
-PY
-```
-
-`flash_attn_v100` MUST be importable — if it isn't, you accidentally
-installed plain vLLM from PyPI; reinstall from the 1Cat release wheels above.
-
-## Basic usage
-
-```bash
-# Suite D (long-context) on 4 x V100 32 GB
-python run.py --runner nvidia_onecat_vllm_a43d1bcf \
-    --suite suite_D \
-    --tensor-parallel-size 4
-
-# Suite C with AWQ (Qwen3.5-27B-AWQ as the validation model)
-python run.py --runner nvidia_onecat_vllm_a43d1bcf \
-    --suite suite_C \
-    --tensor-parallel-size 4 \
-    --model-path /data/models/Qwen3.5-27B-AWQ
-
-# Override attention backend (rare — for benchmarking vs Triton fallback)
-python run.py --runner nvidia_onecat_vllm_a43d1bcf \
-    --suite suite_B \
-    --tensor-parallel-size 4 \
-    # Then set attention_backend in your runner config engine_kwargs.
-```
-
-## Runner config
-
-Copy the example:
-
-```bash
-cp configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml.example \
-   configs/runner_configs/runner_nvidia_onecat_vllm_a43d1bcf.yaml
-```
-
-Key defaults differ from the upstream NVIDIA runner:
-
-| Field | 1Cat default | Upstream default | Notes |
-|-------|--------------|------------------|-------|
-| `max_num_seqs` | 1 | 512 | 256K context demands very tight KV cache budget |
-| `gpu_memory_utilization` | 0.88 | 0.90 | Matches 1Cat 1.0.0 public reference |
-| `engine_kwargs.attention_backend` | `FLASH_ATTN_V100` (auto) | — | Auto-injected unless overridden |
-
-## Known gaps (pre-smoke-test)
-
-- The Volta CUDA-graph capture path needs validation under
-  `--scenario sustained`. If startup hangs on the first request, set
-  `enforce_eager: true` in your runner config.
-- The accuracy gate uses the suite's stock prompts — on AWQ checkpoints
-  the gate threshold may be too tight; the suite spec already allows
-  per-format thresholds (Suite C) so this is mostly relevant on Suite A/D.
-- MTP / speculative profiles are documented in 1Cat 1.0.0 but not
-  exercised here yet; flat speculative keys in `_precision_engine_kwargs`
-  are still forwarded as `speculative_config` by `benchmark_runner.py`,
-  the same as for the upstream runner.
-
-## Requirements
-
-See `requirements.txt`. The heavy dependencies (`torch`, `flash_attn_v100`,
-`vllm` fork) MUST come from the 1Cat-vLLM release wheels — do not install
-upstream `vllm` from PyPI.
diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/meta.json b/runners/nvidia_onecat_vllm_a43d1bcf/meta.json
deleted file mode 100644
index b86b000..0000000
--- a/runners/nvidia_onecat_vllm_a43d1bcf/meta.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-  "id": "nvidia_onecat_vllm_a43d1bcf",
-  "platform": "nvidia",
-  "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA",
-  "framework": "1Cat-vLLM",
-  "submitted_by": "JuhaoLiang1997",
-  "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — the community vLLM fork that re-enables AWQ 4-bit inference on Volta via lmdeploy TurboMind kernels and the FLASH_ATTN_V100 attention backend. Targets Qwen3.5 / Qwen3.6 dense + MoE on 4 x V100 32 GB. Use the upstream nvidia_vllm runner on Ampere or newer.",
-  "supersedes_chain": [],
-  "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless the user overrides it. SUPPORTED_PRECISIONS drops BF16 (V100 has no native BF16 datapath). SUPPORTED_QUANTIZATION_BACKENDS lists only AWQ — the fork's headline contribution; FP8 KV cache and other formats are intentionally not exposed by default. Initial commit, not yet validated end-to-end on hardware.",
-  "created": "2026-05-15",
-  "hardware_label": "NVIDIA V100 (SM70)",
-  "suite_support": {
-    "A": "pending",
-    "B": "pending",
-    "C": "pending",
-    "D": "pending",
-    "E": "pending",
-    "F": "unsupported",
-    "G": "pending"
-  }
-}
diff --git a/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt b/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
deleted file mode 100644
index 01e687d..0000000
--- a/runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
+++ /dev/null
@@ -1,58 +0,0 @@
-# AccelMark -- 1Cat-vLLM (SM70 / V100) runner dependencies
-#
-# 1Cat-vLLM is a community fork of vLLM tuned for Tesla V100. It ships two
-# wheels that must be installed together:
-#   - flash_attn_v100  (Volta-optimised FlashAttention kernels)
-#   - vllm             (patched fork, exposes the FLASH_ATTN_V100 backend
-#                       and AWQ 4-bit kernels for SM70)
-#
-# Both wheels are published as GitHub release assets at:
-#   https://github.com/1CatAI/1Cat-vLLM/releases
-#
-# Reference validated stack (1Cat-vLLM 1.0.0):
-#   OS:       Ubuntu 24.04
-#   Python:   3.12
-#   CUDA:     12.8
-#   PyTorch:  2.9.1+cu128
-#   Driver:   570.211.01
-#   GPU:      4 x Tesla V100 32 GB
-#
-# Installation:
-#   # 1. Install CUDA 12.8 toolkit and matching driver
-#   # 2. Create a fresh Python 3.12 environment
-#   # 3. Install the two 1Cat-vLLM wheels from the release page:
-#   pip install --prefer-binary --no-cache-dir \
-#     --extra-index-url https://download.pytorch.org/whl/cu128 \
-#     "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/flash_attn_v100-1.0.0-cp312-cp312-linux_x86_64.whl" \
-#     "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl"
-#
-#   # 4. Then install the AccelMark extras below:
-#   pip install -r runners/nvidia_onecat_vllm_a43d1bcf/requirements.txt
-#
-# Note: do NOT add `torch==2.x` here — the matching torch wheel is pulled in
-# by the 1Cat-vLLM wheel install command above. Listing torch here would
-# fight with the cu128 extra-index-url.
-
-# Transformers stack — compatible with 1Cat-vLLM 1.0.0 (vllm fork 1.0.0,
-# based on upstream vLLM 0.7.x line). Versions match the upstream
-# nvidia_vllm_47f5d58e runner so we know they're consistent.
-transformers==4.57.6
-tokenizers==0.22.2
-huggingface-hub==0.35.0
-accelerate==1.10.1
-safetensors==0.6.2
-
-# AccelMark dependencies
-numpy==1.26.4
-jsonschema==4.25.1
-psutil==7.1.0
-tqdm==4.67.1
-
-# NVIDIA monitoring (for power and GPU stats — same as upstream NVIDIA runner)
-nvidia-ml-py==13.580.82
-
-# Async support
-aiohttp==3.12.15
-
-# Config file parsing
-PyYAML==6.0.2

From d78b58eababe4182b25ae703c579d4c48a620a29 Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Mon, 18 May 2026 17:57:24 +0800
Subject: [PATCH 4/5] fix onecat: rename runner to 12a253c2, disable chunked prefill on V100

---
 README.md                                     |  2 +-
 ..._nvidia_onecat_vllm_12a253c2.yaml.example} | 10 ++--
 .../README.md                                 | 46 +++++++++++++------
 .../meta.json                                 |  6 +--
 .../requirements.txt                          |  0
 .../runner.py                                 |  4 +-
 6 files changed, 42 insertions(+), 26 deletions(-)
 rename configs/runner_configs/{runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example => runner_nvidia_onecat_vllm_12a253c2.yaml.example} (52%)
 rename runners/{nvidia_onecat_vllm_4a9ca6c3 => nvidia_onecat_vllm_12a253c2}/README.md (81%)
 rename runners/{nvidia_onecat_vllm_4a9ca6c3 => nvidia_onecat_vllm_12a253c2}/meta.json (69%)
 rename runners/{nvidia_onecat_vllm_4a9ca6c3 => nvidia_onecat_vllm_12a253c2}/requirements.txt (100%)
 rename runners/{nvidia_onecat_vllm_4a9ca6c3 => nvidia_onecat_vllm_12a253c2}/runner.py (99%)

diff --git a/README.md b/README.md
index 7c171ee..922c479 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 |---|---|---|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
 | NVIDIA GPU | `nvidia_sglang_c43a8309` | SGLang | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | NVIDIA GPU | `nvidia_vllm_47f5d58e` | vLLM | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
-| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_4a9ca6c3` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
+| NVIDIA V100 (SM70) | `nvidia_onecat_vllm_12a253c2` | 1Cat-vLLM | ⋯ | ⋯ | ⋯ | ⋯ | ⋯ | — | ⋯ |
 | AMD GPU | `amd_vllm_rocm_6c18cd8f` | vLLM-ROCm | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
diff --git a/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example b/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example
similarity index 52%
rename from configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example
rename to configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example
index 8c6c310..6644d79 100644
--- a/configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example
+++ b/configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example
@@ -1,14 +1,14 @@
-# AccelMark runner config — nvidia_onecat_vllm_4a9ca6c3 (1Cat-vLLM on V100)
-# Copy to runner_nvidia_onecat_vllm_4a9ca6c3.yaml (gitignored). See runner README.
+# AccelMark runner config — nvidia_onecat_vllm_12a253c2 (1Cat-vLLM on V100)
+# Copy to runner_nvidia_onecat_vllm_12a253c2.yaml (gitignored). See runner README.
 
 tensor_parallel_size: 1
 enforce_eager: false
-max_num_seqs: 1
-gpu_memory_utilization: 0.88
+max_num_seqs: 512
+gpu_memory_utilization: 0.90
 
-# V100 SM70: required for Suite A-style runs (see runner README).
 engine_kwargs:
   enable_prefix_caching: false
+  enable_chunked_prefill: false
   kv_cache_auto_trim_ratio: 0.0
 
 suites:
diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/README.md b/runners/nvidia_onecat_vllm_12a253c2/README.md
similarity index 81%
rename from runners/nvidia_onecat_vllm_4a9ca6c3/README.md
rename to runners/nvidia_onecat_vllm_12a253c2/README.md
index d29b0e1..0556214 100644
--- a/runners/nvidia_onecat_vllm_4a9ca6c3/README.md
+++ b/runners/nvidia_onecat_vllm_12a253c2/README.md
@@ -1,4 +1,4 @@
-# nvidia_onecat_vllm_4a9ca6c3 — 1Cat-vLLM Runner (Tesla V100 / SM70)
+# nvidia_onecat_vllm_12a253c2 — 1Cat-vLLM Runner (Tesla V100 / SM70)
 
 AccelMark runner for **Tesla V100 / V100S only**, using
 [1Cat-vLLM](https://github.com/1CatAI/1Cat-vLLM) (community vLLM fork for Volta).
@@ -28,8 +28,8 @@ Release notes: [1Cat-vLLM v1.0.0](https://github.com/1CatAI/1Cat-vLLM/releases/t
 | `attention_backend` | `FLASH_ATTN_V100` (auto unless overridden) |
 | `SUPPORTED_PRECISIONS` | `fp16`, `fp32` (no BF16 on V100) |
 | `SUPPORTED_QUANTIZATION_BACKENDS` | `awq` only |
-| `max_num_seqs` | `1` (via runner config) |
-| `gpu_memory_utilization` | `0.88` |
+| `max_num_seqs` | `512` global default (same as upstream vLLM); use `1` for suite D / long-context |
+| `gpu_memory_utilization` | `0.90` |
 
 ## Supported suites
 
@@ -75,7 +75,7 @@ python -m pip install --prefer-binary --no-cache-dir \
     "https://github.com/1CatAI/1Cat-vLLM/releases/download/v1.0.0/vllm-1.0.0-cp312-cp312-linux_x86_64.whl"
 
 cd /path/to/AccelMark
-pip install -r runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt
+pip install -r runners/nvidia_onecat_vllm_12a253c2/requirements.txt
 ```
 
 ### Path B — Build from source (Ubuntu 22.04 / glibc 2.35)
@@ -117,7 +117,7 @@ pip install "$DIST"/flash_attn_v100-*.whl
 cd /tmp && pip install --no-deps --force-reinstall "$DIST"/vllm-*.whl
 
 cd /path/to/AccelMark
-pip install -r runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt
+pip install -r runners/nvidia_onecat_vllm_12a253c2/requirements.txt
 ```
 
 Do **not** run AccelMark from inside the cloned `1Cat-vLLM/` directory; Python
@@ -145,22 +145,36 @@ PY
 Copy and edit:
 
 ```bash
-cp configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml.example \
-   configs/runner_configs/runner_nvidia_onecat_vllm_4a9ca6c3.yaml
+cp configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml.example \
+   configs/runner_configs/runner_nvidia_onecat_vllm_12a253c2.yaml
 ```
 
-**Single V100 32GB** — recommended `engine_kwargs` (avoids prefix prefill shared-memory
-crash on SM70: `Shared memory exceeds 96KB`):
+**Single V100 32GB** — recommended `engine_kwargs` (avoids SM70
+`Shared memory exceeds 96KB` in `prefill_paged_fwd`):
 
 ```yaml
 tensor_parallel_size: 1
-max_num_seqs: 1
-gpu_memory_utilization: 0.88
+max_num_seqs: 512
+gpu_memory_utilization: 0.90
 engine_kwargs:
   enable_prefix_caching: false
+  enable_chunked_prefill: false
   kv_cache_auto_trim_ratio: 0.0
+
+suites:
+  suite_D:
+    max_num_seqs: 1
+    gpu_memory_utilization: 0.85
+```
+
+If it still crashes, set this environment variable before running `python run.py`:
+
+```bash
+export VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1
 ```
 
+That forces the slower paged-KV gather fallback instead of `prefill_paged_fwd`.
+
 **4× V100 32GB** — set `tensor_parallel_size: 4`; keep the same `engine_kwargs`
 unless you are deliberately testing 1Cat's MTP / prefix-cache profile (see
 example file comments).
@@ -169,6 +183,7 @@ Other tuning:
 
 | Symptom | Try |
 |---------|-----|
+| `Shared memory exceeds 96KB` | `enable_chunked_prefill: false` + `enable_prefix_caching: false` (above); then `export VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1` |
 | First request hangs (CUDA graph) | `enforce_eager: true` or `--enforce-eager` |
 | OOM at engine init | Lower `gpu_memory_utilization` (e.g. `0.85`) |
 | `GLIBC_2.38 not found` | Path B source build, or Ubuntu 24.04+ |
@@ -184,11 +199,11 @@ cp configs/models_local.yaml.example configs/models_local.yaml   # map local mod
 export PYTHONPATH=/path/to/AccelMark   # if pip install -e . is unavailable
 
 # Suite A smoke (1× V100)
-python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \
+python run.py --runner nvidia_onecat_vllm_12a253c2 \
     --suite suite_A --scenario accuracy --tensor-parallel-size 1
 
 # Suite B (4× V100)
-python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \
+python run.py --runner nvidia_onecat_vllm_12a253c2 \
     --suite suite_B --tensor-parallel-size 4
 ```
 
@@ -196,8 +211,9 @@ python run.py --runner nvidia_onecat_vllm_4a9ca6c3 \
 
 ## Known limitations
 
-- Prefix caching + chunked paged prefill can exceed V100's 96KB shared memory per SM;
-  disable `enable_prefix_caching` (see config above).
+- Prefix caching and **chunked prefill** (even with prefix caching off) can each hit the
+  `prefill_paged_fwd` kernel, which needs >96KB shared memory on SM70. Disable both in the
+  runner config; set `VLLM_FLASH_V100_DISABLE_PAGED_PREFILL=1` if it still crashes (see above).
 - `max_num_seqs: 1` limits batch throughput vs upstream vLLM defaults — intentional
   for 1Cat's long-context V100 profile.
 - Suite F is marked unsupported in `meta.json` (use upstream runner on V100 if needed).
diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json b/runners/nvidia_onecat_vllm_12a253c2/meta.json
similarity index 69%
rename from runners/nvidia_onecat_vllm_4a9ca6c3/meta.json
rename to runners/nvidia_onecat_vllm_12a253c2/meta.json
index b3d136e..394601f 100644
--- a/runners/nvidia_onecat_vllm_4a9ca6c3/meta.json
+++ b/runners/nvidia_onecat_vllm_12a253c2/meta.json
@@ -1,12 +1,12 @@
 {
-  "id": "nvidia_onecat_vllm_4a9ca6c3",
+  "id": "nvidia_onecat_vllm_12a253c2",
   "platform": "nvidia",
   "name": "1Cat-vLLM (V100 / SM70 fork) on NVIDIA",
   "framework": "1Cat-vLLM",
   "submitted_by": "JuhaoLiang1997",
   "description": "AccelMark runner for Tesla V100 (SM70) using 1Cat-vLLM 1.0.0 — community vLLM fork with FLASH_ATTN_V100 and SM70 AWQ kernels. Use nvidia_vllm_* on Ampere or newer.",
-  "supersedes_chain": ["nvidia_onecat_vllm_a43d1bcf"],
-  "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless overridden. V100: disable prefix caching in runner config (see README). External dependency: https://github.com/1CatAI/1Cat-vLLM",
+  "supersedes_chain": ["nvidia_onecat_vllm_4a9ca6c3", "nvidia_onecat_vllm_a43d1bcf"],
+  "notes": "Auto-injects attention_backend=FLASH_ATTN_V100 unless overridden. V100: disable prefix caching and chunked prefill in runner config (see README). External dependency: https://github.com/1CatAI/1Cat-vLLM",
   "created": "2026-05-15",
   "hardware_label": "NVIDIA V100 (SM70)",
   "suite_support": {
diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt b/runners/nvidia_onecat_vllm_12a253c2/requirements.txt
similarity index 100%
rename from runners/nvidia_onecat_vllm_4a9ca6c3/requirements.txt
rename to runners/nvidia_onecat_vllm_12a253c2/requirements.txt
diff --git a/runners/nvidia_onecat_vllm_4a9ca6c3/runner.py b/runners/nvidia_onecat_vllm_12a253c2/runner.py
similarity index 99%
rename from runners/nvidia_onecat_vllm_4a9ca6c3/runner.py
rename to runners/nvidia_onecat_vllm_12a253c2/runner.py
index 861b352..3462765 100644
--- a/runners/nvidia_onecat_vllm_4a9ca6c3/runner.py
+++ b/runners/nvidia_onecat_vllm_12a253c2/runner.py
@@ -92,8 +92,8 @@ def load_model(self, model_path: str, parallelism: dict) -> None:
         enforce_eager = getattr(self, "_enforce_eager", False)
 
         cfg             = getattr(self, "_runner_config", {})
-        max_num_seqs    = cfg.get("max_num_seqs", 1)
-        gpu_memory_util = cfg.get("gpu_memory_utilization", 0.88)
+        max_num_seqs    = cfg.get("max_num_seqs", 512)
+        gpu_memory_util = cfg.get("gpu_memory_utilization", 0.90)
         extra_kwargs    = dict(cfg.get("engine_kwargs") or {})
 
         import os

From 9aad45b1e30faa3963c1e0f8fe7c664a55c3c72a Mon Sep 17 00:00:00 2001
From: Liang Juhao <juhaoliang1997@gmail.com>
Date: Mon, 18 May 2026 22:06:52 +0800
Subject: [PATCH 5/5] upload onecat results

---
 .../accuracy/accuracy.json                    |   8 +
 .../env_info.json                             |  33 +++
 .../offline/result.json                       | 159 +++++++++++++
 .../online/result.json                        | 158 +++++++++++++
 .../result.json                               | 210 ++++++++++++++++++
 .../accuracy/accuracy.json                    |   8 +
 .../env_info.json                             |  33 +++
 .../interactive/result.json                   | 126 +++++++++++
 .../offline/result.json                       | 159 +++++++++++++
 .../online/result.json                        | 146 ++++++++++++
 .../result.json                               | 210 ++++++++++++++++++
 11 files changed, 1250 insertions(+)
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json
 create mode 100644 results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json

diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json
new file mode 100644
index 0000000..304c3db
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.61,
+  "baseline_delta": null,
+  "valid": true,
+  "framework": "1Cat-vLLM",
+  "precision": "FP16",
+  "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json
new file mode 100644
index 0000000..52c2fdc
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/env_info.json
@@ -0,0 +1,33 @@
+{
+  "collected_at": "2026-05-18T09:38:50.346241+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "Tesla V100-PCIE-32GB",
+      "vendor": "NVIDIA",
+      "memory_gb": 32.0,
+      "driver_version": "580.82.07",
+      "firmware_version": null,
+      "compute_capability": "7.0",
+      "supports_bf16": false
+    }
+  ],
+  "accelerator_platform": "nvidia",
+  "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+    "physical_cores": 26,
+    "logical_cores": 52,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 214.5,
+  "pcie_generation": "PCIe Gen 3",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": null,
+  "os": "Ubuntu 22.04.5 LTS",
+  "python_version": "3.12.13",
+  "kernel_version": "5.4.0-149-generic",
+  "runtime_version": "CUDA 12.8",
+  "pytorch_version": "2.9.1+cu128"
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json
new file mode 100644
index 0000000..2e6fc7f
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline/result.json
@@ -0,0 +1,159 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:38:50.346241+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "FP16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 671.32,
+          "throughput_tokens_per_sec_per_chip": 671.32,
+          "throughput_tokens_per_sec_total": 1168.67,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 670.99,
+          "throughput_tokens_per_sec_per_chip": 670.99,
+          "throughput_tokens_per_sec_total": 1168.09,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 671.43,
+          "throughput_tokens_per_sec_per_chip": 671.43,
+          "throughput_tokens_per_sec_total": 1168.44,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "18:03:39",
+    "run_id": "4e0e6eba",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00",
+    "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00",
+    "benchmark_elapsed_minutes": 10.3,
+    "model_load_seconds": 47.8
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json
new file mode 100644
index 0000000..66aeb48
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online/result.json
@@ -0,0 +1,158 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:38:50.346241+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "FP16",
+    "effective_dtype": null,
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "online",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 0.0,
+      "results_by_qps": [
+        {
+          "target_qps": 5,
+          "achieved_qps": 5.0,
+          "ttft_ms_p50": 113119.0,
+          "ttft_ms_p90": 832380.28,
+          "ttft_ms_p99": 872316.46,
+          "tpot_ms_p50": 1274.2,
+          "tpot_ms_p90": 1801.34,
+          "tpot_ms_p99": 4289.09,
+          "elapsed_seconds_median": 968.7,
+          "sla_met": false
+        },
+        {
+          "target_qps": 25,
+          "achieved_qps": 25.0,
+          "ttft_ms_p50": 130646.03,
+          "ttft_ms_p90": 865522.04,
+          "ttft_ms_p99": 901339.26,
+          "tpot_ms_p50": 1262.15,
+          "tpot_ms_p90": 1785.02,
+          "tpot_ms_p99": 4287.18,
+          "elapsed_seconds_median": 936.5,
+          "sla_met": false
+        },
+        {
+          "target_qps": 100,
+          "achieved_qps": 100.0,
+          "ttft_ms_p50": 132710.0,
+          "ttft_ms_p90": 863880.66,
+          "ttft_ms_p99": 888527.06,
+          "tpot_ms_p50": 1248.86,
+          "tpot_ms_p90": 1740.58,
+          "tpot_ms_p99": 4225.34,
+          "elapsed_seconds_median": 921.5,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:25:39",
+    "run_id": "4e0e6eba",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T10:04:46.235502+00:00",
+    "benchmark_end_time": "2026-05-18T12:25:39.450279+00:00",
+    "benchmark_elapsed_minutes": 140.9,
+    "model_load_seconds": 45.2
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json
new file mode 100644
index 0000000..07930da
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/result.json
@@ -0,0 +1,210 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:38:50.346241+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "FP16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenarios_run": [
+      "offline",
+      "online"
+    ],
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "num_runs": 3,
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    }
+  },
+  "metrics": {
+    "derived": {},
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 671.32,
+          "throughput_tokens_per_sec_per_chip": 671.32,
+          "throughput_tokens_per_sec_total": 1168.67,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 670.99,
+          "throughput_tokens_per_sec_per_chip": 670.99,
+          "throughput_tokens_per_sec_total": 1168.09,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 671.43,
+          "throughput_tokens_per_sec_per_chip": 671.43,
+          "throughput_tokens_per_sec_total": 1168.44,
+          "elapsed_seconds_median": 51.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    },
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 0.0,
+      "results_by_qps": [
+        {
+          "target_qps": 5,
+          "achieved_qps": 5.0,
+          "ttft_ms_p50": 113119.0,
+          "ttft_ms_p90": 832380.28,
+          "ttft_ms_p99": 872316.46,
+          "tpot_ms_p50": 1274.2,
+          "tpot_ms_p90": 1801.34,
+          "tpot_ms_p99": 4289.09,
+          "elapsed_seconds_median": 968.7,
+          "sla_met": false
+        },
+        {
+          "target_qps": 25,
+          "achieved_qps": 25.0,
+          "ttft_ms_p50": 130646.03,
+          "ttft_ms_p90": 865522.04,
+          "ttft_ms_p99": 901339.26,
+          "tpot_ms_p50": 1262.15,
+          "tpot_ms_p90": 1785.02,
+          "tpot_ms_p99": 4287.18,
+          "elapsed_seconds_median": 936.5,
+          "sla_met": false
+        },
+        {
+          "target_qps": 100,
+          "achieved_qps": 100.0,
+          "ttft_ms_p50": 132710.0,
+          "ttft_ms_p90": 863880.66,
+          "ttft_ms_p99": 888527.06,
+          "tpot_ms_p50": 1248.86,
+          "tpot_ms_p90": 1740.58,
+          "tpot_ms_p99": 4225.34,
+          "elapsed_seconds_median": 921.5,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": 0.61,
+    "baseline_delta": null,
+    "valid": true,
+    "framework": "1Cat-vLLM",
+    "precision": "FP16",
+    "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "18:03:39",
+    "run_id": "4e0e6eba",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:53:19.928949+00:00",
+    "benchmark_end_time": "2026-05-18T10:03:39.512440+00:00",
+    "benchmark_elapsed_minutes": 151.2,
+    "model_load_seconds": 47.8,
+    "benchmark_elapsed_minutes_note": "Total across ['offline', 'online'] scenarios.",
+    "scenario_dirs": {
+      "offline": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/offline",
+      "online": "results/community/tesla_v100_pcie_32gbx1_suite_A_nvidia_onecat_vllm_12a253c2_4e0e6eba/online"
+    }
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json
new file mode 100644
index 0000000..94e5547
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/accuracy/accuracy.json
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.37,
+  "baseline_delta": 0.0,
+  "valid": true,
+  "framework": "1Cat-vLLM",
+  "precision": "FP16",
+  "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json
new file mode 100644
index 0000000..1f8b6bd
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/env_info.json
@@ -0,0 +1,33 @@
+{
+  "collected_at": "2026-05-18T12:26:03.593928+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "Tesla V100-PCIE-32GB",
+      "vendor": "NVIDIA",
+      "memory_gb": 32.0,
+      "driver_version": "580.82.07",
+      "firmware_version": null,
+      "compute_capability": "7.0",
+      "supports_bf16": false
+    }
+  ],
+  "accelerator_platform": "nvidia",
+  "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+    "physical_cores": 26,
+    "logical_cores": 52,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 214.5,
+  "pcie_generation": "PCIe Gen 3",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": null,
+  "os": "Ubuntu 22.04.5 LTS",
+  "python_version": "3.12.13",
+  "kernel_version": "5.4.0-149-generic",
+  "runtime_version": "CUDA 12.8",
+  "pytorch_version": "2.9.1+cu128"
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json
new file mode 100644
index 0000000..f017bc2
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive/result.json
@@ -0,0 +1,126 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T12:26:03.593928+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "FP16",
+    "effective_dtype": null,
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "interactive",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "interactive": {
+      "ttft_ms_p50": 26.76,
+      "ttft_ms_p90": 29.57,
+      "ttft_ms_p99": 40.69,
+      "tpot_ms_p50": 3.51,
+      "tpot_ms_p90": 3.76,
+      "tpot_ms_p99": 3.81,
+      "peak_memory_gb": null,
+      "elapsed_seconds_median": 116.9
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:45:36",
+    "run_id": "419b138c",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T12:39:46.224469+00:00",
+    "benchmark_end_time": "2026-05-18T12:45:36.498231+00:00",
+    "benchmark_elapsed_minutes": 5.8,
+    "model_load_seconds": 27.8
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json
new file mode 100644
index 0000000..da8126b
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline/result.json
@@ -0,0 +1,159 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T12:26:03.593928+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "FP16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 4,
+          "throughput_tokens_per_sec": 6234.82,
+          "throughput_tokens_per_sec_per_chip": 6234.82,
+          "throughput_tokens_per_sec_total": 9303.11,
+          "elapsed_seconds_median": 6.8,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 16,
+          "throughput_tokens_per_sec": 6292.79,
+          "throughput_tokens_per_sec_per_chip": 6292.79,
+          "throughput_tokens_per_sec_total": 9356.18,
+          "elapsed_seconds_median": 6.7,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 64,
+          "throughput_tokens_per_sec": 6243.51,
+          "throughput_tokens_per_sec_per_chip": 6243.51,
+          "throughput_tokens_per_sec_total": 9267.55,
+          "elapsed_seconds_median": 6.8,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:28:55",
+    "run_id": "419b138c",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00",
+    "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00",
+    "benchmark_elapsed_minutes": 1.4,
+    "model_load_seconds": 31.7
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json
new file mode 100644
index 0000000..170f9d0
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online/result.json
@@ -0,0 +1,146 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T12:26:03.593928+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "FP16",
+    "effective_dtype": null,
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "online",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    },
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 0.0,
+      "results_by_qps": [
+        {
+          "target_qps": 10,
+          "achieved_qps": 10.0,
+          "ttft_ms_p50": 6316.13,
+          "ttft_ms_p90": 53409.43,
+          "ttft_ms_p99": 67932.56,
+          "tpot_ms_p50": 206.23,
+          "tpot_ms_p90": 291.3,
+          "tpot_ms_p99": 636.32,
+          "elapsed_seconds_median": 103.3,
+          "sla_met": false
+        },
+        {
+          "target_qps": 40,
+          "achieved_qps": 40.0,
+          "ttft_ms_p50": 19238.78,
+          "ttft_ms_p90": 56898.27,
+          "ttft_ms_p99": 75398.9,
+          "tpot_ms_p50": 189.24,
+          "tpot_ms_p90": 300.17,
+          "tpot_ms_p99": 582.22,
+          "elapsed_seconds_median": 86.3,
+          "sla_met": false
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:38:56",
+    "run_id": "419b138c",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T12:29:46.673625+00:00",
+    "benchmark_end_time": "2026-05-18T12:38:56.798553+00:00",
+    "benchmark_elapsed_minutes": 9.2,
+    "model_load_seconds": 28.7
+  }
+}
\ No newline at end of file
diff --git a/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json
new file mode 100644
index 0000000..12baab4
--- /dev/null
+++ b/results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/result.json
@@ -0,0 +1,210 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_F",
+  "implementation_id": "nvidia_onecat_vllm_12a253c2",
+  "chip": {
+    "name": "Tesla V100-PCIE-32GB",
+    "vendor": "NVIDIA",
+    "count": 1,
+    "memory_gb": 32.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T12:26:03.593928+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "Tesla V100-PCIE-32GB",
+        "vendor": "NVIDIA",
+        "memory_gb": 32.0,
+        "driver_version": "580.82.07",
+        "firmware_version": null,
+        "compute_capability": "7.0",
+        "supports_bf16": false
+      }
+    ],
+    "accelerator_platform": "nvidia",
+    "accelerator_topology": "\tGPU0\tCPU Affinity\tNUMA Affinity\tGPU NUMA ID\nGPU0\t X \t0-25\t0\t\tN/A\n\nLegend:\n\n  X    = Self\n  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)\n  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node\n  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)\n  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)\n  PIX  = Connection traversing at most a single PCIe bridge\n  NV#  = Connection traversing a bonded set of # NVLinks\n",
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz",
+      "physical_cores": 26,
+      "logical_cores": 52,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 214.5,
+    "pcie_generation": "PCIe Gen 3",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": null,
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13",
+    "kernel_version": "5.4.0-149-generic",
+    "runtime_version": "CUDA 12.8",
+    "pytorch_version": "2.9.1+cu128"
+  },
+  "software": {
+    "framework": "1Cat-vLLM",
+    "framework_version": "1.0.0+flash_attn_v100-1.0.0",
+    "driver_version": "580.82.07",
+    "runtime_version": "CUDA 12.8",
+    "os": "Ubuntu 22.04.5 LTS",
+    "python_version": "3.12.13"
+  },
+  "model": {
+    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+    "model_revision": "7ae557604adf67be50417f59c2c2f167def9a775",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 0.5,
+    "precision": "FP16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenarios_run": [
+      "offline",
+      "online",
+      "interactive"
+    ],
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "num_runs": 3,
+    "extra_config": {
+      "tensor_parallel_size": 1,
+      "enforce_eager": false,
+      "max_num_seqs": 512,
+      "gpu_memory_utilization": 0.9,
+      "engine_kwargs": {
+        "enable_prefix_caching": false,
+        "enable_chunked_prefill": false,
+        "kv_cache_auto_trim_ratio": 0.0
+      }
+    }
+  },
+  "metrics": {
+    "derived": {},
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 4,
+          "throughput_tokens_per_sec": 6234.82,
+          "throughput_tokens_per_sec_per_chip": 6234.82,
+          "throughput_tokens_per_sec_total": 9303.11,
+          "elapsed_seconds_median": 6.8,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 16,
+          "throughput_tokens_per_sec": 6292.79,
+          "throughput_tokens_per_sec_per_chip": 6292.79,
+          "throughput_tokens_per_sec_total": 9356.18,
+          "elapsed_seconds_median": 6.7,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 64,
+          "throughput_tokens_per_sec": 6243.51,
+          "throughput_tokens_per_sec_per_chip": 6243.51,
+          "throughput_tokens_per_sec_total": 9267.55,
+          "elapsed_seconds_median": 6.8,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    },
+    "online": {
+      "sla_ttft_ms": 500,
+      "max_valid_qps": 0.0,
+      "results_by_qps": [
+        {
+          "target_qps": 10,
+          "achieved_qps": 10.0,
+          "ttft_ms_p50": 6316.13,
+          "ttft_ms_p90": 53409.43,
+          "ttft_ms_p99": 67932.56,
+          "tpot_ms_p50": 206.23,
+          "tpot_ms_p90": 291.3,
+          "tpot_ms_p99": 636.32,
+          "elapsed_seconds_median": 103.3,
+          "sla_met": false
+        },
+        {
+          "target_qps": 40,
+          "achieved_qps": 40.0,
+          "ttft_ms_p50": 19238.78,
+          "ttft_ms_p90": 56898.27,
+          "ttft_ms_p99": 75398.9,
+          "tpot_ms_p50": 189.24,
+          "tpot_ms_p90": 300.17,
+          "tpot_ms_p99": 582.22,
+          "elapsed_seconds_median": 86.3,
+          "sla_met": false
+        }
+      ]
+    },
+    "interactive": {
+      "ttft_ms_p50": 26.76,
+      "ttft_ms_p90": 29.57,
+      "ttft_ms_p99": 40.69,
+      "tpot_ms_p50": 3.51,
+      "tpot_ms_p90": 3.76,
+      "tpot_ms_p99": 3.81,
+      "peak_memory_gb": null,
+      "elapsed_seconds_median": 116.9
+    }
+  },
+  "accuracy": {
+    "subset_score": 0.37,
+    "baseline_delta": 0.0,
+    "valid": true,
+    "framework": "1Cat-vLLM",
+    "precision": "FP16",
+    "notes": "Integrated accuracy check \u2014 used same 1Cat-vLLM instance as benchmark."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "20:28:55",
+    "run_id": "419b138c",
+    "run_name": "tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c",
+    "flagged": null,
+    "reproduce_script": "runners/nvidia_onecat_vllm_12a253c2/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T12:27:34.502139+00:00",
+    "benchmark_end_time": "2026-05-18T12:28:55.745031+00:00",
+    "benchmark_elapsed_minutes": 16.4,
+    "model_load_seconds": 31.7,
+    "benchmark_elapsed_minutes_note": "Total across ['offline', 'online', 'interactive'] scenarios.",
+    "scenario_dirs": {
+      "offline": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/offline",
+      "online": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/online",
+      "interactive": "results/community/tesla_v100_pcie_32gbx1_suite_F_nvidia_onecat_vllm_12a253c2_419b138c/interactive"
+    }
+  }
+}
\ No newline at end of file