Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
| Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
| Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
| Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — |
| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ✓ | ⋯ | ⋯ | ⋯ | ⋯ | ✓ | — |

_Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._
<!-- platforms-matrix:end -->
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# AccelMark runner config — moorethreads_vllm_musa_f2f6f965 (vllm-musa on Moore Threads)
#
# Copy this file to runner_moorethreads_vllm_musa_f2f6f965.yaml (remove
# .example suffix) and edit as needed for your hardware. The actual .yaml
# is gitignored.
#
# These settings adapt the runner to your hardware environment. They are
# recorded in result.json task.extra_config for transparency but are NOT
# part of the benchmark identity (not hashed into run_id).
#
# Merge priority: CLI flags > suite-specific > global defaults > runner defaults

# ── Global defaults (apply to all suites) ─────────────────────────────────────

# Tensor parallel size — number of Moore Threads GPUs to use (default: 1).
# For multi-card runs make sure to export VLLM_WORKER_MULTIPROC_METHOD=spawn.
tensor_parallel_size: 1

# Run in eager mode, disabling graph capture and Triton compilation. Set to
# true if you hit Triton kernel errors on the first request (most common on
# S3000 / S80 paths).
enforce_eager: false

# Maximum number of sequences in a batch (default: 256).
# Reduce on lower-memory cards: 128 on 24 GB cards, 64 on 16 GB cards.
max_num_seqs: 256

# Fraction of MUSA device memory that vLLM may use for model weights,
# activations and the KV cache (default: 0.85). Reduce if you hit OOM; the
# vLLM flag is named gpu_memory_utilization but applies to MUSA device memory
# via torchada.
gpu_memory_utilization: 0.85

# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs().
# Unknown keys are dropped automatically with a warning, so this is safe to
# use across vLLM 0.10.x / 0.13.x.
# engine_kwargs:
# swap_space: 8
# max_seq_len_to_capture: 4096

# ── Suite-specific overrides ───────────────────────────────────────────────────

suites:
suite_D:
# Long-context — reduce batch size and lower gpu_memory_utilization to leave more memory headroom.
max_num_seqs: 32
gpu_memory_utilization: 0.80

suite_F:
max_num_seqs: 128

# ── Speculative decoding (suite_A / suite_D extra scenario) ─────────────────
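# Uncomment to enable, merging the suite_A entry into the existing `suites:`
# mapping above rather than adding a second `suites:` key (duplicate YAML
# mapping keys are invalid). vllm-musa accepts the same speculative_config
# dict as upstream vLLM; the runner translates flat keys (speculative_model,
# num_speculative_tokens, ...) into speculative_config automatically.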
#
# suites:
# suite_A:
# engine_kwargs:
# speculative_model: "meta-llama/Llama-3.2-1B-Instruct"
# num_speculative_tokens: 4
# speculative_draft_tensor_parallel_size: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"subset_score": 0.07,
"baseline_delta": -0.53,
"valid": false,
"framework": "vllm-musa",
"precision": "BF16",
"notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{
"collected_at": "2026-05-18T09:21:31.092840+00:00",
"accelerators": [
{
"index": 0,
"name": "MTT S4000",
"vendor": "Moore Threads",
"memory_gb": 48.0,
"driver_version": "2.7.0",
"firmware_version": null,
"supports_bf16": true
}
],
"accelerator_platform": "moorethreads",
"accelerator_topology": null,
"intra_node_interconnect": null,
"cpu": {
"model": "Intel(R) Xeon(R) Gold 6430",
"physical_cores": 64,
"logical_cores": 128,
"numa_nodes": 2
},
"system_memory_gb": 1007.5,
"pcie_generation": "PCIe 16x/16x",
"cpu_accelerator_bandwidth_gbs": null,
"network_interfaces": [
{
"name": "mlx5_0",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_1",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_bond_0",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
}
],
"os": "Ubuntu Jammy Jellyfish (development branch)",
"python_version": "3.10.8",
"kernel_version": "5.15.0-105-generic",
"runtime_version": "Moore Threads Driver 2.7.0",
"pytorch_version": "2.2.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
{
"schema_version": "1.0",
"suite_id": "suite_A",
"implementation_id": "moorethreads_vllm_musa_f2f6f965",
"chip": {
"name": "MTT S4000",
"vendor": "Moore Threads",
"count": 1,
"memory_gb": 48.0,
"interconnect_intra_node": null,
"interconnect_inter_node": null
},
"environment": {
"collected_at": "2026-05-18T09:21:31.092840+00:00",
"accelerators": [
{
"index": 0,
"name": "MTT S4000",
"vendor": "Moore Threads",
"memory_gb": 48.0,
"driver_version": "2.7.0",
"firmware_version": null,
"supports_bf16": true
}
],
"accelerator_platform": "moorethreads",
"accelerator_topology": null,
"intra_node_interconnect": null,
"cpu": {
"model": "Intel(R) Xeon(R) Gold 6430",
"physical_cores": 64,
"logical_cores": 128,
"numa_nodes": 2
},
"system_memory_gb": 1007.5,
"pcie_generation": "PCIe 16x/16x",
"cpu_accelerator_bandwidth_gbs": null,
"network_interfaces": [
{
"name": "mlx5_0",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_1",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
},
{
"name": "mlx5_bond_0",
"type": "InfiniBand/RoCE",
"bandwidth_gbps": null
}
],
"os": "Ubuntu Jammy Jellyfish (development branch)",
"python_version": "3.10.8",
"kernel_version": "5.15.0-105-generic",
"runtime_version": "Moore Threads Driver 2.7.0",
"pytorch_version": "2.2.0"
},
"software": {
"framework": "vllm-musa",
"framework_version": "0.4.2",
"driver_version": "2.7.0",
"runtime_version": "Moore Threads Driver 2.7.0",
"os": "Ubuntu Jammy Jellyfish (development branch)",
"python_version": "3.10.8"
},
"model": {
"model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
"model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
"model_name": null,
"model_note": null,
"model_source": "local",
"architecture": "dense",
"parameter_count_b": 8.0,
"precision": "BF16",
"effective_dtype": "float16",
"quantization_method": null,
"model_format": "HuggingFace original"
},
"task": {
"scenario": "offline",
"num_runs": 3,
"warmup_runs": 1,
"parallelism": {
"tensor_parallel_size": 1,
"pipeline_parallel_size": 1,
"expert_parallel_size": 1,
"data_parallel_size": 1
},
"extra_config": null,
"runtime_metrics": null
},
"metrics": {
"offline": {
"results_by_concurrency": [
{
"client_concurrency": 8,
"throughput_tokens_per_sec": 332.62,
"throughput_tokens_per_sec_per_chip": 332.62,
"throughput_tokens_per_sec_total": 922.83,
"elapsed_seconds_median": 43.4,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 32,
"throughput_tokens_per_sec": 331.64,
"throughput_tokens_per_sec_per_chip": 331.64,
"throughput_tokens_per_sec_total": 920.1,
"elapsed_seconds_median": 43.6,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
},
{
"client_concurrency": 128,
"throughput_tokens_per_sec": 331.76,
"throughput_tokens_per_sec_per_chip": 331.76,
"throughput_tokens_per_sec_total": 920.46,
"elapsed_seconds_median": 43.6,
"peak_memory_gb": null,
"power_watts_avg": null,
"power_watts_peak": null,
"oom": false,
"_throughput_note": "output_only",
"_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
}
]
}
},
"accuracy": {
"subset_score": null,
"baseline_delta": null,
"valid": false,
"notes": "Run --scenario accuracy to check model accuracy."
},
"meta": {
"submitted_by": "JuhaoLiang1997",
"submission_type": "individual",
"date": "2026-05-18",
"time": "17:34:52",
"run_id": "cabb7bd0",
"run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
"flagged": null,
"reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
"env_info_file": "../env_info.json",
"log_file": "run.log",
"samples_file": "samples.jsonl",
"notes": null,
"benchmark_start_time": "2026-05-18T09:26:10.676960+00:00",
"benchmark_end_time": "2026-05-18T09:34:52.667112+00:00",
"benchmark_elapsed_minutes": 8.7,
"model_load_seconds": 116.8
}
}
Loading
Loading