From bbdd923d53275ffd09b195baa7d2113da8fb521c Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 17:59:43 -0700
Subject: [PATCH 01/33] feat: MiniMax-M3 MXFP8 full sweep config for GB200

Add minimaxm3-fp8-gb200-dynamo-vllm to nvidia-master.yaml with 6
topologies covering the full concurrency range at 1k1k:
- TP4/TP8 (low latency, conc 4-64)
- TP4+EP4 agg (conc 128-512) + 1P+1D disagg (conc 64-512), mid curve
- DEP4 (conc 256-1024) / DEP8 (conc 512-2048), high throughput

Recipe YAMLs are included under minimax-m3-gb200-fp8/{1k1k,8k1k}/; the
8k1k recipes are not yet referenced from nvidia-master.yaml.
---
 .github/configs/nvidia-master.yaml            | 111 ++++++++++++++++++
 .../workflows/benchmark-multinode-tmpl.yml    |   5 +
 .../1k1k/agg-gb200-dep4-1n.yaml               |  74 ++++++++++++
 .../1k1k/agg-gb200-dep8-2n.yaml               |  74 ++++++++++++
 .../1k1k/agg-gb200-tp4-1n.yaml                |  71 +++++++++++
 .../1k1k/agg-gb200-tp4ep4-1n.yaml             |  72 ++++++++++++
 .../1k1k/agg-gb200-tp8-2n.yaml                |  71 +++++++++++
 .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     |  89 ++++++++++++++
 .../8k1k/agg-gb200-dep8-2n.yaml               |  74 ++++++++++++
 .../8k1k/agg-gb200-tp4-1n.yaml                |  71 +++++++++++
 .../8k1k/agg-gb200-tp4ep4-1n.yaml             |  72 ++++++++++++
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     |  89 ++++++++++++++
 perf-changelog.yaml                           |  12 ++
 runners/launch_gb200-nv.sh                    |  50 ++++++--
 14 files changed, 927 insertions(+), 8 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 187824347..e68adb5f4 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11679,6 +11679,117 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
           ep: 4
           dp-attn: true
 
+# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
+# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
+# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
+# tensor cores on Blackwell. M3 has not shipped in a stable vLLM release;
+# vllm/vllm-openai:minimax-m3 is the dedicated multi-arch (arm64+amd64)
+# image built from the m3_release branch (vllm-project/vllm#45381).
+# GB200 full sweep: TP4/TP8 (low conc), TEP (mid), disagg (mid), DEP (high).
+minimaxm3-fp8-gb200-dynamo-vllm:
+  image: vllm/vllm-openai:minimax-m3
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: gb200
+  precision: fp8
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      # Low latency: TP=4 aggregated, 1 node (4 GPU).
+      - conc-list: [4, 8, 16, 32, 64]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml"
+        decode:
+          num-worker: 0
+          tp: 4
+          ep: 1
+          dp-attn: false
+
+      # Low latency: TP=8 aggregated, 2 nodes (8 GPU).
+      - conc-list: [4, 8, 16, 32, 64]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml"
+        decode:
+          num-worker: 0
+          tp: 8
+          ep: 1
+          dp-attn: false
+
+      # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU).
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml"
+        decode:
+          num-worker: 0
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
+      - conc-list: [64, 128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      # High throughput: DEP=4 aggregated, 1 node (4 GPU).
+      - conc-list: [256, 512, 1024]
+        prefill:
+          num-worker: 1
+          tp: 1
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml"
+        decode:
+          num-worker: 0
+          tp: 1
+          ep: 4
+          dp-attn: true
+
+      # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU).
+      - conc-list: [512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml"
+        decode:
+          num-worker: 0
+          tp: 1
+          ep: 8
+          dp-attn: true
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index 81727ef39..85b399e6c 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -123,6 +123,11 @@ on:
 
 env:
   RANDOM_RANGE_RATIO: 0.8
+  # Day-zero models resolved via hf: ids download from the Hub inside the
+  # slurm job (srtctl pre-download + dynamo hub fetch). Anonymous requests
+  # get 429-rate-limited when several workers pull a 444 GB snapshot at
+  # once; sbatch/srun inherit this env so the token reaches the workers.
+  HF_TOKEN: ${{ secrets.INFERENCEX_OFFICIAL_RO_HF_TOKEN }}
   EXP_NAME: ${{ inputs.exp-name }}
   IMAGE: ${{ inputs.image }}
   MODEL_PREFIX: ${{ inputs.model-prefix }}
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
new file mode 100644
index 000000000..a95d2df41
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
@@ -0,0 +1,74 @@
+name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only
+# (no kv-transfer-config). NixlConnector KV transfer against M3's MSA
+# index cache is covered by the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml
+# recipe. model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
new file mode 100644
index 000000000..ab231e733
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
@@ -0,0 +1,74 @@
+name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only
+# (no kv-transfer-config). NixlConnector KV transfer against M3's MSA
+# index cache is covered by the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml
+# recipe. model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x1024x2048"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
new file mode 100644
index 000000000..ce431c3c0
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
@@ -0,0 +1,71 @@
+name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only
+# (no kv-transfer-config). NixlConnector KV transfer against M3's MSA
+# index cache is covered by the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml
+# recipe. model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 64
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
new file mode 100644
index 000000000..29efa7ecc
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
@@ -0,0 +1,72 @@
+name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only
+# (no kv-transfer-config). NixlConnector KV transfer against M3's MSA
+# index cache is covered by the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml
+# recipe. model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
new file mode 100644
index 000000000..29a5934bd
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
@@ -0,0 +1,71 @@
+name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only
+# (no kv-transfer-config). NixlConnector KV transfer against M3's MSA
+# index cache is covered by the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml
+# recipe. model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 64
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
new file mode 100644
index 000000000..17769abf3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -0,0 +1,89 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200.
+# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "64x128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
new file mode 100644
index 000000000..db729764a
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
@@ -0,0 +1,74 @@
+name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only
+# (no kv-transfer-config). NixlConnector KV transfer against M3's MSA
+# index cache is covered by the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml
+# recipe. model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+  gpus_per_agg: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
new file mode 100644
index 000000000..8c7ecbe17
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
@@ -0,0 +1,71 @@
+name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only
+# (no kv-transfer-config). NixlConnector KV transfer against M3's MSA
+# index cache is covered by the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml
+# recipe. model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 9472
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 64
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
new file mode 100644
index 000000000..3e146af8b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
@@ -0,0 +1,72 @@
+name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k"
+
+# MiniMax-M3 day-zero aggregated recipe
+# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated shape only
+# (no kv-transfer-config). NixlConnector KV transfer against M3's MSA
+# index cache is covered by the sibling disagg-gb200-1p1d-tp4ep4-2n.yaml
+# recipe. model.path uses a staged-model alias — srtctl resolves it via
+# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
+# benchmark is text-only, so language-model-only skips the vision
+# encoder. max-model-len / cudagraph capture / batched tokens are
+# trimmed to the fixed-seq-len scenario instead of the 1M default.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+# 444 GB of weights off shared FS (cold HF cache on the first run):
+# allow up to 2 h for engine readiness.
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_agg: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  aggregated_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    aggregated:
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 256
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128x256"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
new file mode 100644
index 000000000..54980f7d3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -0,0 +1,89 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200, 8k1k.
+# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "64x128x256"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d29c9a5d3..647121c12 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3646,3 +3646,15 @@
     - "Layouts: TP8 and TP4 (latency), TP4+EP4 / TP8+EP8 (TEP throughput), tp2-ep2, TP8+EP8 dp-attn (DEP) across 1k1k and 8k1k"
     - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724
+
+- config-keys:
+    - minimaxm3-fp8-gb200-dynamo-vllm
+  description:
+    - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB200 via Dynamo"
+    - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
+    - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)"
+    - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode"
+    - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)"
+    - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048"
+    - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
+  pr-link: TBD
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 36c8af203..9c3430289 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -60,8 +60,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5"
         export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8"
+    elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
+        export MODEL_PATH="/mnt/lustre01/models/MiniMax-M3-MXFP8"
+        export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8"
     else
-        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8"
+        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8, minimaxm3/fp8"
         exit 1
     fi
 else
@@ -81,7 +84,7 @@ NGINX_IMAGE="nginx:1.27.4"
 # squash dir on a path that's also visible to compute nodes. Falls
 # back to the legacy sa-shared path so other configs are untouched.
 SQUASH_DIR="/mnt/lustre01/users-public/sa-shared"
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     echo "=== cluster diagnostic (minimax sweep) ==="
     echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)"
     echo "HOME=$HOME"
@@ -128,8 +131,32 @@ fi
 SQUASH_FILE="${SQUASH_DIR}/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 NGINX_SQUASH_FILE="${SQUASH_DIR}/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 
-enroot import -o $SQUASH_FILE docker://$IMAGE
-enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
+# Concurrent matrix jobs (three gb200-nv runners) all import to the same
+# shared-FS squash path. An unsynchronized `enroot import -o` onto an
+# existing file APPENDS to it (mksquashfs default), corrupting the image
+# while other jobs' pyxis extractions are reading it — observed on the
+# minimaxm3 day-zero sweep (R1: an eval job appended to the live squash
+# mid-run). Serialize with a lock, skip when the existing file is valid,
+# and build to a temp path, then mv into place only if the import succeeded
+# so a failed pull is never published. Mirrors launch_gb300-nv.sh's helper.
+import_squash() {
+    local squash="$1" image="$2"  # $1: target .sqsh path, $2: image ref for docker://
+    local lock="${squash}.lock"
+    (
+        exec 9>"$lock"  # fd 9 holds the lock until this subshell exits
+        flock -w 1800 9 || { echo "Failed to acquire lock for $squash" >&2; exit 1; }
+        if unsquashfs -l "$squash" > /dev/null 2>&1; then
+            echo "Squash file already exists and is valid, skipping import: $squash"
+        else
+            rm -f "$squash" "$squash".tmp.*
+            enroot import -o "${squash}.tmp.$$" "docker://$image" \
+                && mv -f "${squash}.tmp.$$" "$squash"  # failed import => non-zero => exit 1
+        fi
+    ) || exit 1
+}
+
+import_squash "$SQUASH_FILE" "$IMAGE"
+import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE"
 
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 
@@ -202,7 +229,7 @@ SRT_REPO_DIR="srt-slurm"
 # cross-mounted to compute nodes. Put the srt-slurm workspace and staged
 # InferenceX checkout on a writable shared-FS path that compute can see.
 # Per-run-unique paths avoid races between parallel sweep jobs.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     SHARED_BASE=""
     for cand in \
         /mnt/lustre01/users-public/sa-shared/gha-runs \
@@ -269,6 +296,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then
         echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2
         exit 1
     fi
+elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" ]]; then
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1
+    cd "$SRT_REPO_DIR" || exit 1
+    git checkout main || exit 1
+    mkdir -p recipes/vllm/minimax-m3-gb200-fp8 || exit 1
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8" recipes/vllm/minimax-m3-gb200-fp8 || exit 1
 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
@@ -292,7 +325,7 @@ source $HOME/.local/bin/env
 # under a head-node-only path, .venv/bin/python3 becomes a broken
 # symlink on compute. Pin the venv to /usr/bin/python3 — a system
 # path that exists at the same location on both head and compute.
-if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then
+if [[ ( $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ) && -x /usr/bin/python3 ]]; then
     uv venv --seed --python /usr/bin/python3
 else
     uv venv --seed
@@ -312,7 +345,7 @@ SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
 # Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path
 # above so srtctl's outputs/ directory (which lives under
 # SRTCTL_ROOT) is visible to compute nodes.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     SRTCTL_ROOT="$SRT_REPO_DIR"
 fi
 echo "Creating srtslurm.yaml configuration..."
@@ -354,7 +387,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
 # can't see. Stage the relevant subset to shared FS and repoint
 # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already
 # on shared FS) and .git (not needed in container) for speed.
-if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then
+if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "minimaxm3" ]]; then
     SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}"
     mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1
     rsync -a --delete \
@@ -379,6 +412,7 @@ if [[ ! -f "$CONFIG_PATH" ]]; then
 fi
 sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH"
 
+
 if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then
     SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1)
 else

From dbf5135c0299f26b19ff814519651f17efdc68e8 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 18:06:32 -0700
Subject: [PATCH 02/33] chore: update perf-changelog pr-link to #1734

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 647121c12..e1d38dd9f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3657,4 +3657,4 @@
     - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)"
     - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048"
     - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
-  pr-link: TBD
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734

From ed63c1e042078379d6f555d573528c82e7559623 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 20:52:36 -0700
Subject: [PATCH 03/33] feat: switch GB200 M3 to ai-dynamo vllm-runtime 1.3.0
 image
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adopt the NVIDIA Dynamo vLLM runtime image
(nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1), the
canonical M3 runtime from ai-dynamo/dynamo
release/1.3.0-minimax-m3-dev.1.

Changes mirrored from that release's
recipes/minimax-m3/vllm/disagg/MXFP8/deploy.yaml:
- dynamo.install: false — the runtime image bundles dynamo 1.3.0, so the
  prior 1.2.0 wheel install is dropped (set explicitly: srtctl defaults to install=true)
- attention-backend: FLASH_ATTN on every prefill/decode/agg engine

Benchmark-specific knobs kept over the reference's serving defaults:
language-model-only (text-only), no-enable-prefix-caching (random data),
scenario-trimmed max-model-len.
---
 .github/configs/nvidia-master.yaml                     | 10 ++++++----
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml   |  6 +++---
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml   |  6 +++---
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml    |  6 +++---
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml |  6 +++---
 .../minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml    |  6 +++---
 .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml              |  7 ++++---
 .../minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml   |  6 +++---
 .../minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml    |  6 +++---
 .../minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml |  6 +++---
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml              |  7 ++++---
 11 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index c11f6505b..d1926f30f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11682,12 +11682,14 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
-# tensor cores on Blackwell. M3 has not shipped in a stable vLLM release;
-# vllm/vllm-openai:minimax-m3 is the dedicated multi-arch (arm64+amd64)
-# image built from the m3_release branch (vllm-project/vllm#45381).
+# tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime
+# (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with
+# dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false.
+# Engine args mirror the canonical recipe (ai-dynamo/dynamo
+# recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
 # GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high).
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:minimax-m3
+  image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
index a95d2df41..3b328ea28 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -60,6 +59,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
index ab231e733..81b000039 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -60,6 +59,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
index ce431c3c0..f7684fe8d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -57,6 +56,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 64
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
index 29efa7ecc..1fc4a3d98 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -58,6 +57,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
index 29a5934bd..65e85f441 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -57,6 +56,7 @@ backend:
       max-num-batched-tokens: 2048
       max-cudagraph-capture-size: 64
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 17769abf3..90ec1d007 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,12 +6,11 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -58,6 +57,7 @@ backend:
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
@@ -75,6 +75,7 @@ backend:
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
index db729764a..c3f50da69 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -60,6 +59,7 @@ backend:
       max-num-batched-tokens: 16384
       max-cudagraph-capture-size: 128
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
index 8c7ecbe17..444f1e1df 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -57,6 +56,7 @@ backend:
       max-num-batched-tokens: 16384
       max-cudagraph-capture-size: 64
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
index 3e146af8b..ca8ea7e48 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
@@ -13,12 +13,11 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -58,6 +57,7 @@ backend:
       max-num-batched-tokens: 16384
       max-cudagraph-capture-size: 256
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 54980f7d3..6a13b50d1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,12 +6,11 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
+  install: false
 
 slurm:
   time_limit: "8:00:00"
@@ -58,6 +57,7 @@ backend:
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
@@ -75,6 +75,7 @@ backend:
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 512
       block-size: 128
+      attention-backend: FLASH_ATTN
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"

From 8738f42aabdbffb4235c5b5bc89c359c59ff26c4 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 12 Jun 2026 21:11:07 -0700
Subject: [PATCH 04/33] fix: use enroot registry syntax (nvcr.io#) for GB200 M3
 image

enroot's docker:// URI needs `#` to separate the registry host from
the image path; `nvcr.io/...` was parsed as a Docker Hub repo and 401'd
against registry-1.docker.io. Matches the existing nvcr.io# convention
in nvidia-master.yaml. Recipe `container:` fields are kept byte-identical to
the master config's `image:` field (srtslurm.yaml maps "${IMAGE}" -> squashfile).
---
 .github/configs/nvidia-master.yaml                            | 4 ++--
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml     | 2 +-
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml     | 2 +-
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml      | 2 +-
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml   | 2 +-
 .../vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml      | 2 +-
 .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml                     | 2 +-
 .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml     | 2 +-
 .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml      | 2 +-
 .../vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml   | 2 +-
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml                     | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1d24e2857..9e3977232 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11777,13 +11777,13 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
 # tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime
-# (nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with
+# (nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with
 # dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false.
 # Engine args mirror the canonical recipe (ai-dynamo/dynamo
 # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
 # GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high).
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
+  image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
index 3b328ea28..921f99b8e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
index 81b000039..50eb3ff64 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
index f7684fe8d..6115d210c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
index 1fc4a3d98..94df4c8ec 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
index 65e85f441..1ac2612bd 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 90ec1d007..4f9c01c6b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,7 +6,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
index c3f50da69..adb36f646 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
index 444f1e1df..8cfbcb616 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
index ca8ea7e48..1567ca57c 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
@@ -13,7 +13,7 @@ name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 6a13b50d1..86d48468a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,7 +6,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
   precision: "fp8"
 
 dynamo:

From 3415fb4e6a815393fd6c8ba12210bc9cd2f5074d Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 13:44:48 -0700
Subject: [PATCH 05/33] feat: convert MiniMax-M3 GB200 sweep to fully
 disaggregated inference

Replace the mostly-aggregated GB200 sweep (5 agg + 1 disagg) with a fully
disaggregated sweep that splits prefill/decode over NixlConnector, mirroring
the minimaxm2.5-fp8-gb200 reference. Every worker occupies one full 4-GPU node
because the 444 GB MXFP8 checkpoint does not fit on fewer GPUs.

Topologies (1k1k): 1P1D TP4 (low-lat), 1P1D TP4+EP4 (mid), 1P2D TP4+EP4
(decode-scaled), 2P1D TP4+EP4 (prefill-scaled), 1P1D DEP4 (max-tput),
spanning conc 4-2048.

- add 4 disagg recipes; remove 8 orphaned agg recipes (1k1k + 8k1k)
- rewire nvidia-master.yaml search-space to the 5 disagg entries
- perf-changelog: describe disagg sweep; fix stale Image line
  (vllm/vllm-openai:minimax-m3 -> nvcr.io#.../vllm-runtime:1.3.0-minimax-m3-dev.1)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            | 66 ++++++-------
 .../1k1k/agg-gb200-dep4-1n.yaml               | 74 --------------
 .../1k1k/agg-gb200-dep8-2n.yaml               | 74 --------------
 .../1k1k/agg-gb200-tp4-1n.yaml                | 71 --------------
 .../1k1k/agg-gb200-tp4ep4-1n.yaml             | 72 --------------
 .../1k1k/agg-gb200-tp8-2n.yaml                | 71 --------------
 .../1k1k/disagg-gb200-1p1d-dep4-2n.yaml       | 96 +++++++++++++++++++
 .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml        | 89 +++++++++++++++++
 .../1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml     | 92 ++++++++++++++++++
 .../1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml     | 92 ++++++++++++++++++
 .../8k1k/agg-gb200-dep8-2n.yaml               | 74 --------------
 .../8k1k/agg-gb200-tp4-1n.yaml                | 71 --------------
 .../8k1k/agg-gb200-tp4ep4-1n.yaml             | 72 --------------
 perf-changelog.yaml                           | 10 +-
 14 files changed, 401 insertions(+), 623 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 9e3977232..15aee30c5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11781,7 +11781,10 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false.
 # Engine args mirror the canonical recipe (ai-dynamo/dynamo
 # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
-# GB200 full sweep: TP/TP8 (low conc), TEP (mid), disagg (mid), DEP (high).
+# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split,
+# every worker = one 4-GPU node since the 444 GB checkpoint can't fit in
+# fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), 1P2D / 2P1D TP4+EP4
+# (decode- and prefill-scaled), 1P1D DEP4 (max throughput).
 minimaxm3-fp8-gb200-dynamo-vllm:
   image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
   model: MiniMaxAI/MiniMax-M3-MXFP8
@@ -11796,7 +11799,7 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # Low latency: TP=4 aggregated, 1 node (4 GPU).
+      # Low latency: 1P+1D disagg TP4 (pure TP, no EP), 2 nodes (4 GPU each).
       - conc-list: [4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
@@ -11804,86 +11807,71 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 1
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml"
         decode:
-          num-worker: 0
+          num-worker: 1
           tp: 4
           ep: 1
           dp-attn: false
 
-      # Low latency: TP=8 aggregated, 2 nodes (8 GPU).
-      - conc-list: [4, 8, 16, 32, 64]
+      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
+      - conc-list: [64, 128, 256, 512]
         prefill:
           num-worker: 1
-          tp: 8
-          ep: 1
+          tp: 4
+          ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml"
         decode:
-          num-worker: 0
-          tp: 8
-          ep: 1
+          num-worker: 1
+          tp: 4
+          ep: 4
           dp-attn: false
 
-      # Mid curve: TP4+EP4 aggregated, 1 node (4 GPU).
-      - conc-list: [128, 256, 512]
+      # Decode-scaled: 1P+2D disagg TP4+EP4, 3 nodes (4 GPU each).
+      - conc-list: [256, 512, 1024]
         prefill:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml"
         decode:
-          num-worker: 0
+          num-worker: 2
           tp: 4
           ep: 4
           dp-attn: false
 
-      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
-      - conc-list: [64, 128, 256, 512]
+      # Prefill-scaled: 2P+1D disagg TP4+EP4, 3 nodes (4 GPU each).
+      - conc-list: [256, 512, 1024]
         prefill:
-          num-worker: 1
+          num-worker: 2
           tp: 4
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml"
         decode:
           num-worker: 1
           tp: 4
           ep: 4
           dp-attn: false
 
-      # High throughput: DEP=4 aggregated, 1 node (4 GPU).
-      - conc-list: [256, 512, 1024]
+      # Max throughput: 1P+1D disagg DEP4 (DP4+EP), 2 nodes (4 GPU each).
+      - conc-list: [512, 1024, 2048]
         prefill:
           num-worker: 1
           tp: 1
           ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml"
         decode:
-          num-worker: 0
-          tp: 1
-          ep: 4
-          dp-attn: true
-
-      # Max throughput: DEP=8 aggregated, 2 nodes (8 GPU).
-      - conc-list: [512, 1024, 2048]
-        prefill:
           num-worker: 1
           tp: 1
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml"
-        decode:
-          num-worker: 0
-          tp: 1
-          ep: 8
+          ep: 4
           dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
deleted file mode 100644
index 921f99b8e..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep4-1n.yaml
+++ /dev/null
@@ -1,74 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-dep4-1n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 4
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
deleted file mode 100644
index 50eb3ff64..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-dep8-2n.yaml
+++ /dev/null
@@ -1,74 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-dep8-2n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 2
-  agg_workers: 1
-  gpus_per_agg: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "512x1024x2048"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
deleted file mode 100644
index 6115d210c..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4-1n.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp4-1n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 64
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
deleted file mode 100644
index 94df4c8ec..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp4ep4-1n.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
deleted file mode 100644
index 1ac2612bd..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/agg-gb200-tp8-2n.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp8-2n-1k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 2
-  agg_workers: 1
-  gpus_per_agg: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 8
-      pipeline-parallel-size: 1
-      max-model-len: 2304
-      max-num-batched-tokens: 2048
-      max-cudagraph-capture-size: 64
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
new file mode 100644
index 000000000..0749dbc86
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
@@ -0,0 +1,96 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (max-throughput curve).
+# Prefill (DEP4 = DP4 + expert-parallel, 1 node) → NixlConnector → Decode
+# (DEP4, 1 node) = 2 nodes. Data-parallel attention + EP maximizes decode
+# token throughput at high concurrency; the decode engine shape is carried
+# over from the former agg-gb200-dep4-1n recipe (removed in this change).
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  precision: "fp8"
+
+dynamo:
+  install: false
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASH_ATTN
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-batched-tokens: 2048
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASH_ATTN
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x1024x2048"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
new file mode 100644
index 000000000..927066e42
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -0,0 +1,89 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve).
+# Prefill (TP4, 1 node) → NixlConnector → Decode (TP4, 1 node). Pure TP,
+# no expert parallel: lowest TTFT/ITL for small concurrencies.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  precision: "fp8"
+
+dynamo:
+  install: false
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASH_ATTN
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASH_ATTN
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
new file mode 100644
index 000000000..fbb99a3dd
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
@@ -0,0 +1,92 @@
+name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (decode-scaled).
+# Prefill (TP4+EP4, 1 node) → NixlConnector → 2× Decode (TP4+EP4, 1 node
+# each) = 3 nodes. Two decode workers absorb more in-flight sequences for
+# mid/high concurrencies while a single prefill keeps TTFT low.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  precision: "fp8"
+
+dynamo:
+  install: false
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 2
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASH_ATTN
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASH_ATTN
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
new file mode 100644
index 000000000..fb27934cb
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
@@ -0,0 +1,92 @@
+name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k"
+
+# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled).
+# 2× Prefill (TP4+EP4, 1 node each) → NixlConnector → Decode (TP4+EP4,
+# 1 node) = 3 nodes. Two prefill workers sustain the prompt-ingest rate at
+# mid/high concurrencies without starving a single decode worker.
+# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  precision: "fp8"
+
+dynamo:
+  install: false
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 1
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASH_ATTN
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASH_ATTN
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      stream-interval: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
deleted file mode 100644
index adb36f646..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-dep8-2n.yaml
+++ /dev/null
@@ -1,74 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-dep8-2n-8k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 2
-  agg_workers: 1
-  gpus_per_agg: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
deleted file mode 100644
index 8cfbcb616..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4-1n.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp4-1n-8k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      max-model-len: 9472
-      max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 64
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
deleted file mode 100644
index 1567ca57c..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/agg-gb200-tp4ep4-1n.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: "minimax-m3-vllm-agg-gb200-tp4ep4-1n-8k1k"
-
-# MiniMax-M3 day-zero aggregated recipe
-# (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3). Aggregated-only on
-# day zero: NixlConnector KV transfer has never been exercised against
-# M3's MSA index cache, so disagg shapes are deferred until agg
-# baselines exist. model.path uses a staged-model alias — srtctl resolves it via
-# model_paths in srtslurm.yaml to /mnt/lustre01/models/MiniMax-M3-MXFP8.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment); the
-# benchmark is text-only, so language-model-only skips the vision
-# encoder. max-model-len / cudagraph capture / batched tokens are
-# trimmed to the fixed-seq-len scenario instead of the 1M default.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
-  precision: "fp8"
-
-dynamo:
-  install: false
-
-slurm:
-  time_limit: "8:00:00"
-
-# 444 GB of weights off shared FS (cold HF cache on the first run):
-# allow up to 2 h for engine readiness.
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  agg_nodes: 1
-  agg_workers: 1
-  gpus_per_agg: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  aggregated_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-
-  vllm_config:
-    aggregated:
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 256
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "128x256"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8ab05189e..5327dbd02 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3652,12 +3652,12 @@
 - config-keys:
     - minimaxm3-fp8-gb200-dynamo-vllm
   description:
-    - "Initial submission: MiniMax-M3 MXFP8 day-zero vLLM sweep for GB200 via Dynamo"
+    - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo"
     - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
-    - "Image: vllm/vllm-openai:minimax-m3 (multi-arch arm64+amd64 from m3_release branch, vllm-project/vllm#45381)"
-    - "Dynamo orchestration with NixlConnector for disaggregated prefill/decode"
-    - "6 topologies: TP4 (1n), TP8 (2n), TP4+EP4 (1n), 1P+1D disagg TP4+EP4 (2n), DEP4 (1n), DEP8 (2n)"
-    - "Concurrency sweep: TP 4-64, TEP 128-512, disagg 64-512, DEP4 256-1024, DEP8 512-2048"
+    - "Image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 (Dynamo 1.3.0 pre-installed; dynamo.install=false)"
+    - "Disaggregated prefill/decode over NixlConnector; every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)"
+    - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)"
+    - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048"
     - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 

From 803cd20f243bb841b2013364af932e6aa9690850 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 14:19:36 -0700
Subject: [PATCH 06/33] fix: restore NIXL-bearing image for M3 GB200 disagg +
 enable MNNVL KV transfer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Run 27478698552 failed: every disagg worker crashed at NixlConnector init
with "NIXL is not available" (RuntimeError, vllm .../nixl/worker.py:248).
The ai-dynamo vllm-runtime:1.3.0-minimax-m3-dev.1 image ships dynamo but
NOT the nixl bindings (cupy missing too), so kv_connector=NixlConnector
cannot initialize and the engine core never becomes healthy.

Revert to the pre-ed63c1e0 runtime path that pulls NIXL in via the dynamo
wheel (same as the working minimaxm2.5-gb200 disagg recipes):
- image/container: vllm/vllm-openai:minimax-m3 (the m3_release build all
  other m3 entries already use)
- dynamo.install=true + wheel 1.2.0.dev20260526 (nixl is a dynamo dep)
- keep attention-backend FLASH_ATTN (added in the image-switch commit)

Also enable NVLink (MNNVL) KV transfer so NIXL doesn't fall back to TCP.
Mirroring the deepseek-v4 gb200 disagg recipes, set the following in every
prefill/decode environment block:
  UCX_TLS=cuda_copy,cuda_ipc,tcp
  UCX_CUDA_IPC_ENABLE_MNNVL=y
  UCX_MEMTYPE_CACHE=n
  UCX_MEMTYPE_REG_WHOLE=n
  NCCL_CUMEM_ENABLE=1   (cuMem-allocate buffers so they are IPC-exportable)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            | 18 +++++++++-------
 .../1k1k/disagg-gb200-1p1d-dep4-2n.yaml       | 21 +++++++++++++++++--
 .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml        | 21 +++++++++++++++++--
 .../1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     | 21 +++++++++++++++++--
 .../1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml     | 21 +++++++++++++++++--
 .../1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml     | 21 +++++++++++++++++--
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     | 21 +++++++++++++++++--
 perf-changelog.yaml                           |  4 ++--
 8 files changed, 126 insertions(+), 22 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b0b99d53f..f246f518a 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11776,17 +11776,19 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
-# tensor cores on Blackwell. Image is the NVIDIA Dynamo vLLM runtime
-# (nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1) with
-# dynamo 1.3.0 pre-installed, so recipes set dynamo.install=false.
+# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build
+# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set
+# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND
+# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image
+# shipped without NIXL, so disagg workers crashed at NixlConnector init).
 # Engine args mirror the canonical recipe (ai-dynamo/dynamo
 # recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
-# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split,
-# every worker = one 4-GPU node since the 444 GB checkpoint can't fit in
-# fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid), 1P2D / 2P1D TP4+EP4
-# (decode- and prefill-scaled), 1P1D DEP4 (max throughput).
+# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split over
+# the NVL72 NVLink fabric; every worker = one 4-GPU node since the 444 GB
+# checkpoint can't fit in fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid),
+# 1P2D / 2P1D TP4+EP4 (decode- and prefill-scaled), 1P1D DEP4 (max tput).
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1
+  image: vllm/vllm-openai:minimax-m3
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
index 0749dbc86..4b56e9e6f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
@@ -9,11 +9,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -44,10 +45,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
index 927066e42..558c5d894 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -7,11 +7,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -42,10 +43,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 4f9c01c6b..eeefc68c1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,11 +6,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -41,10 +42,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
index fbb99a3dd..02d9bd98e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
@@ -8,11 +8,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -43,10 +44,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
index fb27934cb..4a440766a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
@@ -8,11 +8,12 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -43,10 +44,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
index 86d48468a..c14b9fb3b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
@@ -6,11 +6,12 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1"
+  container: "vllm/vllm-openai:minimax-m3"
   precision: "fp8"
 
 dynamo:
-  install: false
+  install: true
+  wheel: "1.2.0.dev20260526"
 
 slurm:
   time_limit: "8:00:00"
@@ -41,10 +42,26 @@ backend:
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
+    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
+    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
 
   vllm_config:
     prefill:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index be638f5f1..627ed5bb1 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3654,8 +3654,8 @@
   description:
     - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo"
     - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
-    - "Image: nvcr.io#nvidia/ai-dynamo/vllm-runtime:1.3.0-minimax-m3-dev.1 (Dynamo 1.3.0 pre-installed; dynamo.install=false)"
-    - "Disaggregated prefill/decode over NixlConnector; every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)"
+    - "Image: vllm/vllm-openai:minimax-m3 (multi-arch m3_release build, vllm-project/vllm#45381); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL"
+    - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y + NCCL_CUMEM_ENABLE=1, cuda_ipc UCX_TLS); every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)"
     - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)"
     - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048"
     - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"

From 1320056380a6f095211fbbb016a9fcc57fdbfbb6 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 20:04:13 -0700
Subject: [PATCH 07/33] feat: rack-scale wide-EP (DeepSeek megamoe) M3 GB200
 disagg + FLASHINFER

The narrow DEP8-max sweep showed no GB200 advantage over B200 because its
widest worker was only 8 GPUs, the same size as B200's 8-GPU NVLink island.
Exploit NVL72's rack-scale NVLink with wide expert parallelism spanning
multiple nodes, mirroring the deepseek-v4 "megamoe" ladder (DEP =
data-parallel attention + expert-parallel):

- 1P1D TP4 (2n)            low-latency, conc 4-64
- 1P1D DEP8 (4n)           mid, EP8/16-experts-per-rank, conc 128-512
- 1P1D DEP8->DEP16 (6n)    wide decode (EP16), conc 512-2048
- 2P1D DEP8->DEP16 (8n)    prefill-scaled, conc 2048-4096
- 4P1D DEP8->DEP16 (12n)   max throughput, conc 4096-8192

M3 has 128 routed experts (top-4), so EP8 and EP16 shard cleanly (16 and 8
experts per rank, respectively). EP16 across 16 GPUs / 4 nodes is the regime
B200 physically can't reach.

Attention: FLASH_ATTN -> FLASHINFER (trtllm-gen) on all GB200 recipes to
exploit Blackwell. Requires the :minimax-m3 image rebuilt from m3_release
HEAD 022448dd (vllm-project/vllm#45381), which gates trtllm-gen page>=128.

Also add GB200 perf/NVLink-KV knobs from the deepseek-v4 reference:
numa-bind (Grace) and enable-sleep-mode (cuMem allocator so the KV cache is
IPC-exportable over the MNNVL fabric), alongside the existing UCX MNNVL env.

Replaces the four narrow EP4 recipes; keeps 1P1D TP4 for low latency.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            |  95 +++++++++-------
 ...3n.yaml => disagg-gb200-1p1d-dep8-4n.yaml} |  45 +++++---
 ...l => disagg-gb200-1p1d-dep8-dep16-6n.yaml} |  36 +++---
 .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml        |  17 ++-
 ...l => disagg-gb200-2p1d-dep8-dep16-8n.yaml} |  42 ++++---
 ... => disagg-gb200-4p1d-dep8-dep16-12n.yaml} |  44 ++++---
 .../8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml     | 107 ------------------
 perf-changelog.yaml                           |  15 +--
 8 files changed, 170 insertions(+), 231 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p2d-tp4ep4-3n.yaml => disagg-gb200-1p1d-dep8-4n.yaml} (69%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-dep4-2n.yaml => disagg-gb200-1p1d-dep8-dep16-6n.yaml} (77%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p1d-tp4ep4-3n.yaml => disagg-gb200-2p1d-dep8-dep16-8n.yaml} (74%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-tp4ep4-2n.yaml => disagg-gb200-4p1d-dep8-dep16-12n.yaml} (71%)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f246f518a..70ec293af 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11781,12 +11781,16 @@ minimaxm2.5-fp8-gb300-dynamo-vllm:
 # dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND
 # NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image
 # shipped without NIXL, so disagg workers crashed at NixlConnector init).
-# Engine args mirror the canonical recipe (ai-dynamo/dynamo
-# recipes/minimax-m3/vllm/disagg/MXFP8): block-size 128 + FLASH_ATTN.
-# Fully disaggregated GB200 sweep (NixlConnector prefill/decode split over
-# the NVL72 NVLink fabric; every worker = one 4-GPU node since the 444 GB
-# checkpoint can't fit in fewer): 1P1D TP4 (low conc), 1P1D TP4+EP4 (mid),
-# 1P2D / 2P1D TP4+EP4 (decode- and prefill-scaled), 1P1D DEP4 (max tput).
+# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER
+# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd
+# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image
+# from m3_release before running. Fully disaggregated, rack-scale wide-EP
+# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors
+# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel
+# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers
+# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge.
+# 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode),
+# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts.
 minimaxm3-fp8-gb200-dynamo-vllm:
   image: vllm/vllm-openai:minimax-m3
   model: MiniMaxAI/MiniMax-M3-MXFP8
@@ -11801,7 +11805,8 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # Low latency: 1P+1D disagg TP4 (pure TP, no EP), 2 nodes (4 GPU each).
+      # Low latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP
+      # would idle DP ranks at small concurrencies, so stay narrow here.
       - conc-list: [4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
@@ -11816,64 +11821,68 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 1
           dp-attn: false
 
-      # Mid curve: 1P+1D disagg TP4+EP4, 2 nodes (4 GPU each).
-      - conc-list: [64, 128, 256, 512]
+      # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes
+      # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank).
+      - conc-list: [128, 256, 512]
         prefill:
           num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
         decode:
           num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
 
-      # Decode-scaled: 1P+2D disagg TP4+EP4, 3 nodes (4 GPU each).
-      - conc-list: [256, 512, 1024]
+      # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU /
+      # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink.
+      - conc-list: [512, 1024, 2048]
         prefill:
           num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
         decode:
-          num-worker: 2
-          tp: 4
-          ep: 4
-          dp-attn: false
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
 
-      # Prefill-scaled: 2P+1D disagg TP4+EP4, 3 nodes (4 GPU each).
-      - conc-list: [256, 512, 1024]
+      # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode
+      # (4 nodes) = 8 nodes.
+      - conc-list: [2048, 4096]
         prefill:
           num-worker: 2
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
         decode:
           num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
+          tp: 16
+          ep: 16
+          dp-attn: true
 
-      # Max throughput: 1P+1D disagg DEP4 (DP4+EP), 2 nodes (4 GPU each).
-      - conc-list: [512, 1024, 2048]
+      # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode
+      # (4 nodes) = 12 nodes within one NVL72 rack.
+      - conc-list: [4096, 8192]
         prefill:
-          num-worker: 1
-          tp: 1
-          ep: 4
+          num-worker: 4
+          tp: 8
+          ep: 8
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
         decode:
           num-worker: 1
-          tp: 1
-          ep: 4
+          tp: 16
+          ep: 16
           dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
similarity index 69%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
index 02d9bd98e..efc5d5740 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4ep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -1,10 +1,11 @@
-name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4ep4-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k"
 
-# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (decode-scaled).
-# Prefill (TP4+EP4, 1 node) → NixlConnector → 2× Decode (TP4+EP4, 1 node
-# each) = 3 nodes. Two decode workers absorb more in-flight sequences for
-# mid/high concurrencies while a single prefill keeps TTFT low.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP).
+# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
+# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
+# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of
+# B200 (capped at an 8-GPU NVLink island). M3 has 128 routed experts so
+# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,12 +27,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
+  prefill_nodes: 2
   decode_nodes: 2
   prefill_workers: 1
-  decode_workers: 2
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -68,42 +69,50 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
       max-model-len: 2304
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      stream-interval: 32
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "256x512x1024"
+  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
similarity index 77%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
index 4b56e9e6f..5ca08a06d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -1,11 +1,10 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k"
 
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (max-throughput curve).
-# Prefill (DEP4 = DP4 + expert-parallel, 1 node) → NixlConnector → Decode
-# (DEP4, 1 node) = 2 nodes. Data-parallel attention + EP maximizes decode
-# token throughput at high concurrency; engine shape mirrors the proven
-# agg-gb200-dep4-1n recipe. --block-size 128 is mandatory (MSA sparse/index
-# cache alignment).
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide-decode curve).
+# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 (DP-attn
+# + EP across 16 GPU / 4 nodes) = 6 nodes. EP16 (8 experts/rank of 128)
+# spans the NVL72 fabric to maximize decode token throughput. FLASHINFER
+# attention, block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -27,12 +26,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
+  prefill_nodes: 2
+  decode_nodes: 4
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
 
 frontend:
   type: dynamo
@@ -71,7 +70,7 @@ backend:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
-      data-parallel-size: 4
+      data-parallel-size: 8
       data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
@@ -79,31 +78,36 @@ backend:
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
-      data-parallel-size: 4
+      data-parallel-size: 16
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 2304
-      max-num-batched-tokens: 2048
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 128
 
 benchmark:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
index 558c5d894..b60b17515 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -1,9 +1,10 @@
 name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
 
 # MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve).
-# Prefill (TP4, 1 node) → NixlConnector → Decode (TP4, 1 node). Pure TP,
-# no expert parallel: lowest TTFT/ITL for small concurrencies.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure
+# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where
+# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention,
+# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd).
 
 model:
   path: "minimax-m3-mxfp8"
@@ -74,12 +75,14 @@ backend:
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
@@ -91,13 +94,15 @@ backend:
       max-num-batched-tokens: 256
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      stream-interval: 32
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
 
 benchmark:
   type: "sa-bench"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
similarity index 74%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
index 4a440766a..853095727 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-tp4ep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -1,10 +1,10 @@
-name: "minimax-m3-vllm-disagg-gb200-2p1d-tp4ep4-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k"
 
 # MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled).
-# 2× Prefill (TP4+EP4, 1 node each) → NixlConnector → Decode (TP4+EP4,
-# 1 node) = 3 nodes. Two prefill workers sustain the prompt-ingest rate at
-# mid/high concurrencies without starving a single decode worker.
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# 2x Prefill DEP8 (8 GPU / 2 nodes each) -> NixlConnector -> Decode DEP16
+# (16 GPU / 4 nodes) = 8 nodes. Two wide prefill workers sustain prompt
+# ingest into a single wide decode at high concurrency. FLASHINFER
+# attention, block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,12 +26,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 1
+  prefill_nodes: 4
+  decode_nodes: 4
   prefill_workers: 2
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
 
 frontend:
   type: dynamo
@@ -68,42 +68,50 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
       max-model-len: 2304
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      stream-interval: 32
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "256x512x1024"
+  concurrencies: "2048x4096"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
similarity index 71%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
index eeefc68c1..4a6aa5d0f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -1,8 +1,10 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k"
 
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200.
-# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
+# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput).
+# 4x Prefill DEP8 (8 GPU / 2 nodes each = 8 nodes) -> NixlConnector ->
+# Decode DEP16 (16 GPU / 4 nodes) = 12 nodes within one NVL72 rack. Max
+# prefill fan-in for the highest-concurrency points. FLASHINFER attention,
+# block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -24,12 +26,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
+  prefill_nodes: 8
+  decode_nodes: 4
+  prefill_workers: 4
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
 
 frontend:
   type: dynamo
@@ -66,42 +68,50 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
       max-model-len: 2304
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
       stream-interval: 32
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
       max-cudagraph-capture-size: 512
       block-size: 128
-      attention-backend: FLASH_ATTN
+      attention-backend: FLASHINFER
       language-model-only: true
       gpu-memory-utilization: 0.9
       safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      stream-interval: 32
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "64x128x256x512"
+  concurrencies: "4096x8192"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
deleted file mode 100644
index c14b9fb3b..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep4-2n.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4ep4-8k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200, 8k1k.
-# Prefill (TP4+EP4, 1 node) → NixlConnector → Decode (TP4+EP4, 1 node).
-# --block-size 128 is mandatory (MSA sparse/index cache alignment).
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 9472
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASH_ATTN
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      stream-interval: 32
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "64x128x256"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 627ed5bb1..295a8e694 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3652,13 +3652,14 @@
 - config-keys:
     - minimaxm3-fp8-gb200-dynamo-vllm
   description:
-    - "Initial submission: MiniMax-M3 MXFP8 fully-disaggregated vLLM sweep for GB200 via Dynamo"
-    - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
-    - "Image: vllm/vllm-openai:minimax-m3 (multi-arch m3_release build, vllm-project/vllm#45381); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL"
-    - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y + NCCL_CUMEM_ENABLE=1, cuda_ipc UCX_TLS); every worker = one 4-GPU node (444 GB checkpoint can't fit in fewer)"
-    - "5 disagg topologies: 1P1D TP4 (2n), 1P1D TP4+EP4 (2n), 1P2D TP4+EP4 (3n), 2P1D TP4+EP4 (3n), 1P1D DEP4 (2n)"
-    - "Concurrency sweep: TP4 4-64, TP4EP4 64-512, 1P2D/2P1D 256-1024, DEP4 512-2048"
-    - "--block-size 128 mandatory everywhere (MSA sparse/index cache alignment); --language-model-only for text-only benchmarks"
+    - "Initial submission: MiniMax-M3 MXFP8 disaggregated rack-scale wide-EP vLLM sweep for GB200 via Dynamo"
+    - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, 128 routed experts top-4, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
+    - "Image: vllm/vllm-openai:minimax-m3, rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381, gates trtllm-gen page>=128); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL"
+    - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache alignment); --language-model-only for text-only benchmarks"
+    - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
+    - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
+    - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
+    - "Concurrency sweep: TP4 4-64, DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From c8cd5670cc1878c9d9109c8b212c2e02adb7eb98 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 13 Jun 2026 23:00:11 -0700
Subject: [PATCH 08/33] feat: tune 1k1k low-conc latency + add 8k1k sweep for
 M3 GB200
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1k1k TP4 low-conc tuning: stream-interval 1 (was 128 decode / 32
prefill), cudagraph cap 128 (was 512), conc range extended to 1-64
(was 4-64) to match B200 coverage.

8k1k sweep: 5 disagg recipes mirroring the 1k1k megamoe ladder
(TP4, DEP8, DEP8→DEP16, 2P1D, 4P1D) with max-model-len 9472
(74 blocks × 128 tokens = ISL + OSL + 256 tokens of headroom).
Concurrencies are shifted ~4x lower than 1k1k to offset the 8x longer
prefill (ISL 8192 vs 1024): TP4 1-16, DEP8 32-128,
DEP8→DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            |  81 ++++++++++++-
 .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml        |   8 +-
 .../8k1k/disagg-gb200-1p1d-dep8-4n.yaml       | 111 ++++++++++++++++++
 .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 111 ++++++++++++++++++
 .../8k1k/disagg-gb200-1p1d-tp4-2n.yaml        | 106 +++++++++++++++++
 .../8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 110 +++++++++++++++++
 .../disagg-gb200-4p1d-dep8-dep16-12n.yaml     | 110 +++++++++++++++++
 perf-changelog.yaml                           |   3 +-
 8 files changed, 634 insertions(+), 6 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 70ec293af..32957e282 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11807,7 +11807,7 @@ minimaxm3-fp8-gb200-dynamo-vllm:
       search-space:
       # Low latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP
       # would idle DP ranks at small concurrencies, so stay narrow here.
-      - conc-list: [4, 8, 16, 32, 64]
+      - conc-list: [1, 2, 4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
           tp: 4
@@ -11885,6 +11885,85 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 16
           dp-attn: true
 
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # Low latency 8k1k: 1P+1D TP4, 2 nodes. stream-interval 1 + cudagraph
+      # cap 128 for best interactivity at small concurrencies.
+      - conc-list: [1, 2, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+
+      # Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
+      - conc-list: [32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL.
+      - conc-list: [512, 1024]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput 8k1k: 4P+1D, 12 nodes.
+      - conc-list: [1024, 2048]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
index b60b17515..f3e79340a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -83,7 +83,7 @@ backend:
       no-enable-prefix-caching: true
       numa-bind: true
       enable-sleep-mode: true
-      stream-interval: 32
+      stream-interval: 1
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
@@ -92,7 +92,7 @@ backend:
       max-model-len: 2304
       max-num-seqs: 256
       max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 512
+      max-cudagraph-capture-size: 128
       block-size: 128
       attention-backend: FLASHINFER
       language-model-only: true
@@ -102,10 +102,10 @@ backend:
       no-enable-prefix-caching: true
       numa-bind: true
       enable-sleep-mode: true
-      stream-interval: 128
+      stream-interval: 1
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "4x8x16x32x64"
+  concurrencies: "1x2x4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
new file mode 100644
index 000000000..f6f2c7874
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -0,0 +1,111 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, 8k1k).
+# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
+# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
+# over the NVL72 NVLink fabric. M3 has 128 routed experts so EP8 shards
+# 16 experts/rank. FLASHINFER attention, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "32x64x128"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
new file mode 100644
index 000000000..0d7d44843
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -0,0 +1,111 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide decode, 8k1k).
+# Prefill DEP8 (8 GPUs / 2 nodes) -> NixlConnector -> Decode DEP16
+# (16 GPUs / 4 nodes) = 6 nodes. Rack-scale decode over NVL72 NVLink:
+# EP16 spans 4 nodes, beyond B200's 8-GPU NVLink island. M3 has 128
+# routed experts: EP16 = 8 experts/rank. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 4
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
new file mode 100644
index 000000000..b0602354c
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -0,0 +1,106 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k).
+# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure
+# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where
+# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention,
+# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd).
+# Low-conc tuned: stream-interval 1, cudagraph cap 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 9472
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x2x4x8x16"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
new file mode 100644
index 000000000..6a0765c60
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -0,0 +1,110 @@
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k"
+
+# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled, 8k1k).
+# 2x Prefill DEP8 (4 nodes) -> NixlConnector -> 1x Decode DEP16
+# (4 nodes) = 8 nodes. Double prefill workers absorb 8k ISL compute;
+# rack-scale DEP16 decode across NVL72. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 4
+  decode_nodes: 4
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
new file mode 100644
index 000000000..9e4ff3c2b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -0,0 +1,110 @@
+name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k"
+
+# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput, 8k1k).
+# 4x Prefill DEP8 (8 nodes) -> NixlConnector -> 1x Decode DEP16
+# (4 nodes) = 12 nodes within one NVL72 rack. Maximises prefill
+# bandwidth for 8k ISL; rack-scale DEP16 decode. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 8
+  decode_nodes: 4
+  prefill_workers: 4
+  decode_workers: 1
+  gpus_per_prefill: 8
+  gpus_per_decode: 16
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 16
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1024x2048"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 8329ac1da..46ac06a08 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3659,7 +3659,8 @@
     - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
     - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
     - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
-    - "Concurrency sweep: TP4 4-64, DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192"
+    - "1k1k concurrency sweep: TP4 1-64 (low-conc latency tuned: stream-interval 1, cudagraph cap 128), DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192"
+    - "8k1k concurrency sweep (same 5 topologies, shifted ~4x lower for 8x heavier prefill): TP4 1-16, DEP8 32-128, DEP8->DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048; max-model-len 9472 (74*128)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From b819a7a7dca04ea74b28b81bcd008a9059784144 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 14 Jun 2026 15:33:46 -0700
Subject: [PATCH 09/33] =?UTF-8?q?feat:=20low-conc=20focus=20=E2=80=94=20wi?=
 =?UTF-8?q?der=20decode=20+=20more=20decode=20workers=20for=20M3=20GB200?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Comment out every DEP8/DEP16/2P1D/4P1D entry (1k1k conc 128-8192 and
8k1k conc 32-2048) to focus sweep budget on low-concurrency tuning.

Add two new 1k1k experiments at conc 1-64 alongside the existing
1P1D TP4 baseline:
  - 1P2D TP4 (3 nodes): 2 decode workers halve per-worker batch
  - 1P1D TP4→TP8 (3 nodes): wider decode TP spreads forward pass
    across 8 GPU over NVL72

All three share the low-conc tuning (stream-interval 1, cudagraph
cap 128, FLASHINFER, block-size 128).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            | 264 ++++++++++--------
 .../1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml    | 105 +++++++
 .../1k1k/disagg-gb200-1p2d-tp4-3n.yaml        | 104 +++++++
 perf-changelog.yaml                           |   4 +-
 4 files changed, 361 insertions(+), 116 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 32957e282..483cf4dcd 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11821,69 +11821,103 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 1
           dp-attn: false
 
-      # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes
-      # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank).
-      - conc-list: [128, 256, 512]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-
-      # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU /
-      # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink.
-      - conc-list: [512, 1024, 2048]
+      # Low latency: 1P+2D TP4, 3 nodes. Two decode workers halve
+      # the per-worker batch, reducing ITL at low concurrency.
+      - conc-list: [1, 2, 4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
+          tp: 4
+          ep: 1
+          dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml"
         decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-      # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode
-      # (4 nodes) = 8 nodes.
-      - conc-list: [2048, 4096]
-        prefill:
           num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
+          tp: 4
+          ep: 1
+          dp-attn: false
 
-      # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode
-      # (4 nodes) = 12 nodes within one NVL72 rack.
-      - conc-list: [4096, 8192]
+      # Low latency: 1P+1D TP4 prefill -> TP8 decode (wider decode),
+      # 3 nodes. Wider decode TP spreads forward pass across 8 GPU.
+      - conc-list: [1, 2, 4, 8, 16, 32, 64]
         prefill:
-          num-worker: 4
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-        decode:
           num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+
+      ## --- High-conc entries commented out to focus on low-conc tuning ---
+      #
+      ## Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes
+      ## decode = 4 nodes. First rung of rack-scale EP (16 experts/rank).
+      #- conc-list: [128, 256, 512]
+      #  prefill:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #
+      ## Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU /
+      ## 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink.
+      #- conc-list: [512, 1024, 2048]
+      #  prefill:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 16
+      #    ep: 16
+      #    dp-attn: true
+      #
+      ## Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode
+      ## (4 nodes) = 8 nodes.
+      #- conc-list: [2048, 4096]
+      #  prefill:
+      #    num-worker: 2
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 16
+      #    ep: 16
+      #    dp-attn: true
+      #
+      ## Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode
+      ## (4 nodes) = 12 nodes within one NVL72 rack.
+      #- conc-list: [4096, 8192]
+      #  prefill:
+      #    num-worker: 4
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 16
+      #    ep: 16
+      #    dp-attn: true
 
     - isl: 8192
       osl: 1024
@@ -11904,65 +11938,67 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 1
           dp-attn: false
 
-      # Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
-      - conc-list: [32, 64, 128]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-
-      # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
-      - conc-list: [128, 256, 512]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-      # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL.
-      - conc-list: [512, 1024]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-      # Max throughput 8k1k: 4P+1D, 12 nodes.
-      - conc-list: [1024, 2048]
-        prefill:
-          num-worker: 4
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
+      ## --- 8k1k wide-EP entries (conc 32-2048) commented out for low-conc focus ---
+      #
+      ## Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
+      #- conc-list: [32, 64, 128]
+      #  prefill:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #
+      ## Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
+      #- conc-list: [128, 256, 512]
+      #  prefill:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 16
+      #    ep: 16
+      #    dp-attn: true
+      #
+      ## Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL.
+      #- conc-list: [512, 1024]
+      #  prefill:
+      #    num-worker: 2
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 16
+      #    ep: 16
+      #    dp-attn: true
+      #
+      ## Max throughput 8k1k: 4P+1D, 12 nodes.
+      #- conc-list: [1024, 2048]
+      #  prefill:
+      #    num-worker: 4
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 16
+      #    ep: 16
+      #    dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
new file mode 100644
index 000000000..6923c645e
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -0,0 +1,105 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, wider
+# decode, 1k1k). Prefill TP4 (1 node) -> NixlConnector -> Decode TP8
+# (2 nodes) = 3 nodes. Wider decode TP spreads each forward pass across
+# 8 GPUs over NVL72 NVLink to cut per-step latency. FLASHINFER,
+# block-size 128. Low-conc tuned: stream-interval 1, cudagraph cap 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x2x4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
new file mode 100644
index 000000000..1d1591198
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
@@ -0,0 +1,104 @@
+name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (low-latency, more
+# decode workers). Prefill TP4 (1 node) -> NixlConnector -> 2x Decode
+# TP4 (2 nodes) = 3 nodes. Two decode workers halve the per-worker
+# batch, reducing ITL at low concurrency. FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 2
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x2x4x8x16x32x64"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 688493e05..d884e4a2d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3659,8 +3659,8 @@
     - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
     - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
     - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
-    - "1k1k concurrency sweep: TP4 1-64 (low-conc latency tuned: stream-interval 1, cudagraph cap 128), DEP8 128-512, DEP8->DEP16 512-2048, 2P1D 2048-4096, 4P1D 4096-8192"
-    - "8k1k concurrency sweep (same 5 topologies, shifted ~4x lower for 8x heavier prefill): TP4 1-16, DEP8 32-128, DEP8->DEP16 128-512, 2P1D 512-1024, 4P1D 1024-2048; max-model-len 9472 (74*128)"
+    - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TP8 (3n, wider decode). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out."
+    - "8k1k: 1P1D TP4 (2n) at conc 1-16; high-conc 8k1k entries temporarily commented out"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From 29eaaeb821e2993a6cf463da462f204408318fc0 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 14 Jun 2026 18:56:30 -0700
Subject: [PATCH 10/33] feat: enable expert-parallel on GB200 TEP8 decode to
 close B200 low-conc gap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

B200 TEP8 (TP8+EP8) achieves 11.68ms TPOT at conc 1 vs GB200 TP8's
15.29ms — the gap is entirely from expert parallelism splitting 128
MoE experts across 8 ranks.  Add enable-expert-parallel: true to the
TP8 decode recipe and update nvidia-master.yaml decode ep: 1→8 so
result JSON reflects TEP8.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml                       | 6 +++---
 .../1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml               | 9 +++++----
 perf-changelog.yaml                                      | 2 +-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 483cf4dcd..ab8060e5d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11837,8 +11837,8 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 1
           dp-attn: false
 
-      # Low latency: 1P+1D TP4 prefill -> TP8 decode (wider decode),
-      # 3 nodes. Wider decode TP spreads forward pass across 8 GPU.
+      # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8, wider
+      # decode + expert parallel), 3 nodes. Matches B200 TEP8 topology.
       - conc-list: [1, 2, 4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
@@ -11850,7 +11850,7 @@ minimaxm3-fp8-gb200-dynamo-vllm:
         decode:
           num-worker: 1
           tp: 8
-          ep: 1
+          ep: 8
           dp-attn: false
 
       ## --- High-conc entries commented out to focus on low-conc tuning ---
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
index 6923c645e..199699212 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -1,10 +1,10 @@
 name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k"
 
 # MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, wider
-# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TP8
-# (2 nodes) = 3 nodes. Wider decode TP reduces per-step latency by
-# spreading the forward pass across 8 GPU over NVL72 NVLink.
-# FLASHINFER, block-size 128.
+# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8
+# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism
+# reduces per-step latency by spreading both attention and MoE across
+# 8 GPU over NVL72 NVLink.  FLASHINFER, block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -83,6 +83,7 @@ backend:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 8
       pipeline-parallel-size: 1
+      enable-expert-parallel: true
       max-model-len: 2304
       max-num-seqs: 256
       max-num-batched-tokens: 256
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5b03f66eb..2ab37b008 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3659,7 +3659,7 @@
     - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
     - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
     - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
-    - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TP8 (3n, wider decode). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out."
+    - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TEP8 (3n, decode TP8+EP8 to match B200 TEP8 topology). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out."
     - "8k1k: 1P1D TP4 (2n) at conc 1-16; high-conc 8k1k entries temporarily commented out"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 

From 56e61cfd6f54339a11eae9aa315bd757fc5b2b5b Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 14 Jun 2026 23:38:44 -0700
Subject: [PATCH 11/33] feat: add 8k1k TEP8 decode recipe for GB200 to close
 B200 gap at long ISL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GB200 8k1k only had TP4 (2n), giving 18.50ms TPOT at conc 1 vs B200
TEP8's 11.57ms.  Add a 1P1D TP4→TEP8 (3n) 8k1k recipe mirroring the
1k1k TEP8 config, which already narrowed the gap there to 12.34ms
(GB200) vs 11.68ms (B200).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            |  16 +++
 .../8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml    | 106 ++++++++++++++++++
 perf-changelog.yaml                           |   2 +-
 3 files changed, 123 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index c5f5ee67b..6707785ab 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12758,6 +12758,22 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 1
           dp-attn: false
 
+      # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8),
+      # 3 nodes. Matches B200 TEP8 topology for 8k ISL.
+      - conc-list: [1, 2, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+
       ## --- 8k1k high-conc entries commented out ---
       #
       ## Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
new file mode 100644
index 000000000..165e9d338
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -0,0 +1,106 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k,
+# wider decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8
+# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism
+# reduces per-step latency by spreading both attention and MoE across
+# 8 GPU over NVL72 NVLink.  FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x2x4x8x16"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4876032f7..2bd69870f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3676,7 +3676,7 @@
     - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
     - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
     - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TEP8 (3n, decode TP8+EP8 to match B200 TEP8 topology). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out."
-    - "8k1k: 1P1D TP4 (2n) at conc 1-16; high-conc 8k1k entries temporarily commented out"
+    - "8k1k: 1P1D TP4 (2n) baseline + 1P1D TP4->TEP8 (3n, decode TP8+EP8) at conc 1-16; high-conc entries temporarily commented out"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From 4a83c75eb89e244061cd1f0c3ef5dc50587a7c1b Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 14 Jun 2026 23:41:14 -0700
Subject: [PATCH 12/33] refactor: remove unoptimized TP4/1P2D baselines, keep
 TEP8-only sweep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

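Drop the 1P1D-TP4 (2n) entries from both 1k1k and 8k1k, and the
1P2D-TP4 (3n) entry from 1k1k (it had no 8k1k counterpart).  Only the
nvidia-master.yaml sweep entries are removed; the recipe YAMLs stay.
TEP8 dominates at every concurrency — the TP4 baseline is 50% slower at
conc 1 and 1P2D gave <2% TPOT improvement for 50% more GPUs.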

Active sweep is now TEP8-only: 1k1k conc 1-64, 8k1k conc 1-16.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 53 ++----------------------------
 perf-changelog.yaml                |  3 +-
 2 files changed, 4 insertions(+), 52 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6707785ab..0e47ece77 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12625,40 +12625,9 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # Low latency: 1P+1D TP4 (pure TP, no EP), 2 nodes (4 GPU each). Wide EP
-      # would idle DP ranks at small concurrencies, so stay narrow here.
-      - conc-list: [1, 2, 4, 8, 16, 32, 64]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-
-      # Low latency: 1P+2D TP4, 3 nodes. Two decode workers halve
-      # the per-worker batch, reducing ITL at low concurrency.
-      - conc-list: [1, 2, 4, 8, 16, 32, 64]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml"
-        decode:
-          num-worker: 2
-          tp: 4
-          ep: 1
-          dp-attn: false
-
-      # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8, wider
-      # decode + expert parallel), 3 nodes. Matches B200 TEP8 topology.
+      # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
+      # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting
+      # per-step latency ~19% vs pure TP8.  Matches B200 TEP8 topology.
       - conc-list: [1, 2, 4, 8, 16, 32, 64]
         prefill:
           num-worker: 1
@@ -12742,22 +12711,6 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 8192
       osl: 1024
       search-space:
-      # Low latency 8k1k: 1P+1D TP4, 2 nodes. stream-interval 1 + cudagraph
-      # cap 128 for best interactivity at small concurrencies.
-      - conc-list: [1, 2, 4, 8, 16]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-
       # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8),
       # 3 nodes. Matches B200 TEP8 topology for 8k ISL.
       - conc-list: [1, 2, 4, 8, 16]
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2bd69870f..abd45e97b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3675,8 +3675,7 @@
     - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
     - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
     - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
-    - "Low-conc focus (conc 1-64): 1P1D TP4 (2n baseline), 1P2D TP4 (3n, 2 decode workers), 1P1D TP4->TEP8 (3n, decode TP8+EP8 to match B200 TEP8 topology). High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out."
-    - "8k1k: 1P1D TP4 (2n) baseline + 1P1D TP4->TEP8 (3n, decode TP8+EP8) at conc 1-16; high-conc entries temporarily commented out"
+    - "Optimized low-conc: TEP8-only (1P1D TP4 prefill -> TEP8 decode, 3n). Unoptimized TP4 and 1P2D baselines removed. 1k1k conc 1-64, 8k1k conc 1-16. High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From 2a237f7d639a58d11a2025f8273f35848ea8a0d6 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 14 Jun 2026 23:45:01 -0700
Subject: [PATCH 13/33] feat: uncomment all high-conc entries for full M3 GB200
 sweep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable DEP8 (4n), DEP8→DEP16 (6n), 2P1D (8n), 4P1D (12n) for both
1k1k and 8k1k alongside the optimized TEP8 low-conc configs.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 248 ++++++++++++++---------------
 perf-changelog.yaml                |   2 +-
 2 files changed, 123 insertions(+), 127 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0e47ece77..731d9cfe7 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12642,71 +12642,69 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 8
           dp-attn: false
 
-      ## --- High-conc entries commented out to focus on low-conc tuning ---
-      #
-      ## Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes
-      ## decode = 4 nodes. First rung of rack-scale EP (16 experts/rank).
-      #- conc-list: [128, 256, 512]
-      #  prefill:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #
-      ## Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU /
-      ## 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink.
-      #- conc-list: [512, 1024, 2048]
-      #  prefill:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 16
-      #    ep: 16
-      #    dp-attn: true
-      #
-      ## Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode
-      ## (4 nodes) = 8 nodes.
-      #- conc-list: [2048, 4096]
-      #  prefill:
-      #    num-worker: 2
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 16
-      #    ep: 16
-      #    dp-attn: true
-      #
-      ## Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode
-      ## (4 nodes) = 12 nodes within one NVL72 rack.
-      #- conc-list: [4096, 8192]
-      #  prefill:
-      #    num-worker: 4
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 16
-      #    ep: 16
-      #    dp-attn: true
+      # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes
+      # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank).
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU /
+      # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink.
+      - conc-list: [512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode
+      # (4 nodes) = 8 nodes.
+      - conc-list: [2048, 4096]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode
+      # (4 nodes) = 12 nodes within one NVL72 rack.
+      - conc-list: [4096, 8192]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
 
     - isl: 8192
       osl: 1024
@@ -12727,67 +12725,65 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 8
           dp-attn: false
 
-      ## --- 8k1k high-conc entries commented out ---
-      #
-      ## Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
-      #- conc-list: [32, 64, 128]
-      #  prefill:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #
-      ## Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
-      #- conc-list: [128, 256, 512]
-      #  prefill:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 16
-      #    ep: 16
-      #    dp-attn: true
-      #
-      ## Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL.
-      #- conc-list: [512, 1024]
-      #  prefill:
-      #    num-worker: 2
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 16
-      #    ep: 16
-      #    dp-attn: true
-      #
-      ## Max throughput 8k1k: 4P+1D, 12 nodes.
-      #- conc-list: [1024, 2048]
-      #  prefill:
-      #    num-worker: 4
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 16
-      #    ep: 16
-      #    dp-attn: true
+      # Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
+      - conc-list: [32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL.
+      - conc-list: [512, 1024]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput 8k1k: 4P+1D, 12 nodes.
+      - conc-list: [1024, 2048]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index abd45e97b..853fbdb7d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3675,7 +3675,7 @@
     - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
     - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
     - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
-    - "Optimized low-conc: TEP8-only (1P1D TP4 prefill -> TEP8 decode, 3n). Unoptimized TP4 and 1P2D baselines removed. 1k1k conc 1-64, 8k1k conc 1-16. High-conc entries (DEP8/DEP16/2P1D/4P1D) temporarily commented out."
+    - "Full sweep enabled: TEP8 (3n) for low-conc + DEP8 (4n), DEP8->DEP16 (6n), 2P1D (8n), 4P1D (12n) for high-conc. Both 1k1k and 8k1k."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From 5e2c2f948d44426298e8526e5f3d696a0c36bee6 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 15 Jun 2026 08:59:51 -0700
Subject: [PATCH 14/33] =?UTF-8?q?feat:=20test=201P1D=20TEP4=20decode=20(TP?=
 =?UTF-8?q?4+EP4,=202n)=20=E2=80=94=20conc=201-32=20only?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Test whether EP4 on 4 decode GPUs (2 nodes total) improves TPOT over
pure TP4 on GB200's NVL72 NVLink.  B200 showed TEP4 slightly worse
than TP4 intra-node; NVL72 all-to-all may differ.  All other entries
commented out for this isolated test.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            | 327 +++++++++---------
 .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml   | 106 ++++++
 perf-changelog.yaml                           |   2 +-
 3 files changed, 275 insertions(+), 160 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 731d9cfe7..0a8ae64fe 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12625,165 +12625,174 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
-      # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting
-      # per-step latency ~19% vs pure TP8.  Matches B200 TEP8 topology.
-      - conc-list: [1, 2, 4, 8, 16, 32, 64]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: false
-
-      # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 2 nodes prefill + 2 nodes
-      # decode = 4 nodes. First rung of rack-scale EP (16 experts/rank).
-      - conc-list: [128, 256, 512]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-
-      # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode (EP16 across 16 GPU /
-      # 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72 NVLink.
-      - conc-list: [512, 1024, 2048]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-      # Prefill-scaled: 2P+1D, 2x DEP8 prefill (4 nodes) -> DEP16 decode
-      # (4 nodes) = 8 nodes.
-      - conc-list: [2048, 4096]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-      # Max throughput: 4P+1D, 4x DEP8 prefill (8 nodes) -> DEP16 decode
-      # (4 nodes) = 12 nodes within one NVL72 rack.
-      - conc-list: [4096, 8192]
-        prefill:
-          num-worker: 4
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8),
-      # 3 nodes. Matches B200 TEP8 topology for 8k ISL.
-      - conc-list: [1, 2, 4, 8, 16]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: false
-
-      # Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
-      - conc-list: [32, 64, 128]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-
-      # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
-      - conc-list: [128, 256, 512]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-      # Prefill-scaled 8k1k: 2P+1D, 8 nodes. Double prefill absorbs 8k ISL.
-      - conc-list: [512, 1024]
-        prefill:
-          num-worker: 2
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-      # Max throughput 8k1k: 4P+1D, 12 nodes.
-      - conc-list: [1024, 2048]
-        prefill:
-          num-worker: 4
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
+      # Test: 1P+1D TP4 prefill -> TEP4 decode (TP4+EP4), 2 nodes.
+      # EP on 4 GPU splits 128 experts into 32/rank — cheaper than TEP8
+      # (3 nodes) if the all-to-all overhead is small on NVL72.
+      - conc-list: [1, 2, 4, 8, 16, 32]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: false
+
+      ## --- All other entries commented out for TEP4 test run ---
+      #
+      ## Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
+      #- conc-list: [1, 2, 4, 8, 16, 32, 64]
+      #  prefill:
+      #    num-worker: 1
+      #    tp: 4
+      #    ep: 1
+      #    dp-attn: false
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: false
+      #
+      ## Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes.
+      #- conc-list: [128, 256, 512]
+      #  prefill:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #
+      ## Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes.
+      #- conc-list: [512, 1024, 2048]
+      #  prefill:
+      #    num-worker: 1
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 16
+      #    ep: 16
+      #    dp-attn: true
+      #
+      ## Prefill-scaled: 2P+1D, 8 nodes.
+      #- conc-list: [2048, 4096]
+      #  prefill:
+      #    num-worker: 2
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 16
+      #    ep: 16
+      #    dp-attn: true
+      #
+      ## Max throughput: 4P+1D, 12 nodes.
+      #- conc-list: [4096, 8192]
+      #  prefill:
+      #    num-worker: 4
+      #    tp: 8
+      #    ep: 8
+      #    dp-attn: true
+      #    additional-settings:
+      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+      #  decode:
+      #    num-worker: 1
+      #    tp: 16
+      #    ep: 16
+      #    dp-attn: true
+      #
+      ## --- 8k1k entries commented out for TEP4 test run ---
+      #
+      #- isl: 8192
+      #  osl: 1024
+      #  search-space:
+      #  - conc-list: [1, 2, 4, 8, 16]
+      #    prefill:
+      #      num-worker: 1
+      #      tp: 4
+      #      ep: 1
+      #      dp-attn: false
+      #      additional-settings:
+      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+      #    decode:
+      #      num-worker: 1
+      #      tp: 8
+      #      ep: 8
+      #      dp-attn: false
+      #
+      #  - conc-list: [32, 64, 128]
+      #    prefill:
+      #      num-worker: 1
+      #      tp: 8
+      #      ep: 8
+      #      dp-attn: true
+      #      additional-settings:
+      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+      #    decode:
+      #      num-worker: 1
+      #      tp: 8
+      #      ep: 8
+      #      dp-attn: true
+      #
+      #  - conc-list: [128, 256, 512]
+      #    prefill:
+      #      num-worker: 1
+      #      tp: 8
+      #      ep: 8
+      #      dp-attn: true
+      #      additional-settings:
+      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+      #    decode:
+      #      num-worker: 1
+      #      tp: 16
+      #      ep: 16
+      #      dp-attn: true
+      #
+      #  - conc-list: [512, 1024]
+      #    prefill:
+      #      num-worker: 2
+      #      tp: 8
+      #      ep: 8
+      #      dp-attn: true
+      #      additional-settings:
+      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+      #    decode:
+      #      num-worker: 1
+      #      tp: 16
+      #      ep: 16
+      #      dp-attn: true
+      #
+      #  - conc-list: [1024, 2048]
+      #    prefill:
+      #      num-worker: 4
+      #      tp: 8
+      #      ep: 8
+      #      dp-attn: true
+      #      additional-settings:
+      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+      #    decode:
+      #      num-worker: 1
+      #      tp: 16
+      #      ep: 16
+      #      dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
new file mode 100644
index 000000000..147803c78
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
@@ -0,0 +1,106 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k"
+
+# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, decode
+# TEP4). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP4
+# (TP4+EP4, 1 node) = 2 nodes. Expert parallelism on decode splits 128
+# MoE experts across 4 ranks (32 each), reducing per-step MoE compute.
+# FLASHINFER, block-size 128.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:minimax-m3"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x2x4x8x16x32"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 853fbdb7d..173cba607 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3675,7 +3675,7 @@
     - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
     - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
     - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
-    - "Full sweep enabled: TEP8 (3n) for low-conc + DEP8 (4n), DEP8->DEP16 (6n), 2P1D (8n), 4P1D (12n) for high-conc. Both 1k1k and 8k1k."
+    - "TEP4 decode test: 1P1D TP4 prefill -> TEP4 decode (TP4+EP4, 2n) at conc 1-32. All other entries temporarily commented out."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From 055aa2c6c491f18d5190fa4ad536053c5c9ff417 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 15 Jun 2026 10:03:37 -0700
Subject: [PATCH 15/33] feat: restore full TEP8 + wide-EP sweep for M3 GB200
 disagg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Revert from TEP4 decode test back to the full sweep config:
- 1k1k: TEP8 3n (conc 1-64), DEP8 4n, DEP8→DEP16 6n, 2P1D 8n, 4P1D 12n
- 8k1k: TEP8 3n (conc 1-16), DEP8 4n, DEP8→DEP16 6n, 2P1D 8n, 4P1D 12n

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 316 ++++++++++++++---------------
 perf-changelog.yaml                |   4 +-
 2 files changed, 153 insertions(+), 167 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0a8ae64fe..6ae32670d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12625,171 +12625,157 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # Test: 1P+1D TP4 prefill -> TEP4 decode (TP4+EP4), 2 nodes.
-      # EP on 4 GPU splits 128 experts into 32/rank — cheaper than TEP8
-      # (3 nodes) if the all-to-all overhead is small on NVL72.
-      - conc-list: [1, 2, 4, 8, 16, 32]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml"
-        decode:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: false
-
-      ## --- All other entries commented out for TEP4 test run ---
-      #
-      ## Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
-      #- conc-list: [1, 2, 4, 8, 16, 32, 64]
-      #  prefill:
-      #    num-worker: 1
-      #    tp: 4
-      #    ep: 1
-      #    dp-attn: false
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: false
-      #
-      ## Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes.
-      #- conc-list: [128, 256, 512]
-      #  prefill:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #
-      ## Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes.
-      #- conc-list: [512, 1024, 2048]
-      #  prefill:
-      #    num-worker: 1
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 16
-      #    ep: 16
-      #    dp-attn: true
-      #
-      ## Prefill-scaled: 2P+1D, 8 nodes.
-      #- conc-list: [2048, 4096]
-      #  prefill:
-      #    num-worker: 2
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 16
-      #    ep: 16
-      #    dp-attn: true
-      #
-      ## Max throughput: 4P+1D, 12 nodes.
-      #- conc-list: [4096, 8192]
-      #  prefill:
-      #    num-worker: 4
-      #    tp: 8
-      #    ep: 8
-      #    dp-attn: true
-      #    additional-settings:
-      #    - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-      #  decode:
-      #    num-worker: 1
-      #    tp: 16
-      #    ep: 16
-      #    dp-attn: true
-      #
-      ## --- 8k1k entries commented out for TEP4 test run ---
-      #
-      #- isl: 8192
-      #  osl: 1024
-      #  search-space:
-      #  - conc-list: [1, 2, 4, 8, 16]
-      #    prefill:
-      #      num-worker: 1
-      #      tp: 4
-      #      ep: 1
-      #      dp-attn: false
-      #      additional-settings:
-      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
-      #    decode:
-      #      num-worker: 1
-      #      tp: 8
-      #      ep: 8
-      #      dp-attn: false
-      #
-      #  - conc-list: [32, 64, 128]
-      #    prefill:
-      #      num-worker: 1
-      #      tp: 8
-      #      ep: 8
-      #      dp-attn: true
-      #      additional-settings:
-      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
-      #    decode:
-      #      num-worker: 1
-      #      tp: 8
-      #      ep: 8
-      #      dp-attn: true
-      #
-      #  - conc-list: [128, 256, 512]
-      #    prefill:
-      #      num-worker: 1
-      #      tp: 8
-      #      ep: 8
-      #      dp-attn: true
-      #      additional-settings:
-      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
-      #    decode:
-      #      num-worker: 1
-      #      tp: 16
-      #      ep: 16
-      #      dp-attn: true
-      #
-      #  - conc-list: [512, 1024]
-      #    prefill:
-      #      num-worker: 2
-      #      tp: 8
-      #      ep: 8
-      #      dp-attn: true
-      #      additional-settings:
-      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
-      #    decode:
-      #      num-worker: 1
-      #      tp: 16
-      #      ep: 16
-      #      dp-attn: true
-      #
-      #  - conc-list: [1024, 2048]
-      #    prefill:
-      #      num-worker: 4
-      #      tp: 8
-      #      ep: 8
-      #      dp-attn: true
-      #      additional-settings:
-      #      - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-      #    decode:
-      #      num-worker: 1
+      # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
+      # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting
+      # per-step latency ~19% vs pure TP8.  Matches B200 TEP8 topology.
+      - conc-list: [1, 2, 4, 8, 16, 32, 64]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes.
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes.
+      - conc-list: [512, 1024, 2048]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled: 2P+1D, 8 nodes.
+      - conc-list: [2048, 4096]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput: 4P+1D, 12 nodes.
+      - conc-list: [4096, 8192]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
+      - conc-list: [1, 2, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
+      - conc-list: [32, 64, 128]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
+      - conc-list: [128, 256, 512]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Prefill-scaled 8k1k: 2P+1D, 8 nodes.
+      - conc-list: [512, 1024]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+        decode:
+          num-worker: 1
+          tp: 16
+          ep: 16
+          dp-attn: true
+
+      # Max throughput 8k1k: 4P+1D, 12 nodes.
+      - conc-list: [1024, 2048]
+        prefill:
+          num-worker: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
+        decode:
+          num-worker: 1
       #      tp: 16
       #      ep: 16
       #      dp-attn: true
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 173cba607..0f6df9b66 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3674,8 +3674,8 @@
     - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache alignment); --language-model-only for text-only benchmarks"
     - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
     - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
-    - "5 topologies: 1P1D TP4 (2n, low-lat), 1P1D DEP8 (4n), 1P1D DEP8->DEP16 (6n, wide decode), 2P1D DEP8->DEP16 (8n), 4P1D DEP8->DEP16 (12n, max tput)"
-    - "TEP4 decode test: 1P1D TP4 prefill -> TEP4 decode (TP4+EP4, 2n) at conc 1-32. All other entries temporarily commented out."
+    - "5 topologies, 1k1k + 8k1k: 1P1D TEP8 decode (3n, low-lat conc 1-64), 1P1D DEP8 (4n, conc 128-512), 1P1D DEP8->DEP16 (6n, conc 512-2048), 2P1D (8n, conc 2048-4096), 4P1D (12n, conc 4096-8192)"
+    - "TEP8 decode (enable-expert-parallel on TP8): 128 experts / 8 ranks = 16 experts/rank, ~19% lower ITL than pure TP8 at low conc; stream-interval 1 + max-cudagraph-capture-size 128 for interactivity"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From e0ae36c3840c994336a408e3f6fb75165391a809 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 15 Jun 2026 10:10:32 -0700
Subject: [PATCH 16/33] fix: uncomment trailing 4P1D 8k1k decode lines in M3
 GB200 sweep

Three lines (tp/ep/dp-attn) for the 4P1D 8k1k decode were still
commented out from the TEP4 test.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 6ae32670d..876bf7fb6 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12776,9 +12776,9 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
         decode:
           num-worker: 1
-      #      tp: 16
-      #      ep: 16
-      #      dp-attn: true
+          tp: 16
+          ep: 16
+          dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint

From b6926e36314105d8dd0c72c7ac0bc0f133a522c7 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 15 Jun 2026 12:08:30 -0700
Subject: [PATCH 17/33] fix: retrigger M3 GB200 sweep to validate MNNVL fused
 allreduce fix

The TEP8 multi-node eval (TP8+EP8 decode spanning 2 GB200 nodes on the
NVL72 fabric) was producing gsm8k=0.0000 due to incorrect buffer
aliasing in the eager fused_allreduce_gemma_rms_norm path.  Fixed in
vLLM M3 branch commit 66a43ba (cleanup/m3-mi300x-mxfp8): pass
norm_out=None to match the compile-time
AllReduceFusedAddGemmaRMSNormPattern aliasing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 0f6df9b66..cc5112d0d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3855,3 +3855,11 @@
     - "Recipes sourced from NVIDIA/srt-slurm branch sa-submission-q2-2026"
     - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689
+
+- config-keys:
+    - minimaxm3-fp8-gb200-dynamo-vllm
+  description:
+    - "Fix MNNVL fused allreduce correctness bug for TEP8 multi-node decode (gsm8k=0.0000 → expect passing)"
+    - "Root cause: fused_allreduce_gemma_rms_norm passed norm_out=separate buffer, triggering the residual_out←allreduce_in aliasing path which produces wrong results on the MNNVL backend across NVL72 node boundaries"
+    - "Fix: pass norm_out=None to match the compile-time AllReduceFusedAddGemmaRMSNormPattern aliasing (norm_out←allreduce_in, residual_out←residual) — commit 66a43ba on cleanup/m3-mi300x-mxfp8"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734

From 0e5ba19ee027728578676107ef06efce09278be5 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Mon, 15 Jun 2026 18:53:17 -0700
Subject: [PATCH 18/33] fix: point TEP8 recipe at ghcr image with NixlConnector
 head_ratio fix

Update the 8k1k TEP8 recipe (disagg-gb200-1p1d-tp4-tp8-3n.yaml)
container to ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73,
which includes both the NixlConnector head_ratio fix for block_len
validation (78ef73b) and the norm_out=None MNNVL aliasing fix
(66a43ba).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml                | 2 +-
 perf-changelog.yaml                                       | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
index 165e9d338..453df782b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73"
   precision: "fp8"
 
 dynamo:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index f2e21f015..a4c6b97b1 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3859,9 +3859,11 @@
 - config-keys:
     - minimaxm3-fp8-gb200-dynamo-vllm
   description:
-    - "Fix MNNVL fused allreduce correctness bug for TEP8 multi-node decode (gsm8k=0.0000 → expect passing)"
-    - "Root cause: fused_allreduce_gemma_rms_norm passed norm_out=separate buffer, triggering the residual_out←allreduce_in aliasing path which produces wrong results on the MNNVL backend across NVL72 node boundaries"
-    - "Fix: pass norm_out=None to match the compile-time AllReduceFusedAddGemmaRMSNormPattern aliasing (norm_out←allreduce_in, residual_out←residual) — commit 66a43ba on cleanup/m3-mi300x-mxfp8"
+    - "Fix NixlConnector handshake failure for hetero-TP disagg when num_kv_heads < decode TP (M3 TEP8: TP4 prefill → TP8 decode)"
+    - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)."
+    - "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8"
+    - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)"
+    - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From 0531a4262ee8b1e0ceb6af2243f8c6ee1130680b Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 16 Jun 2026 11:02:45 -0700
Subject: [PATCH 19/33] fix: retrigger M3 GB200 sweep after making ghcr image
 public
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous run failed with 401 Unauthorized pulling
ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 —
the package is now public.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 43f4e074e..d56081d5d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3870,7 +3870,7 @@
     - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)."
     - "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8"
     - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)"
-    - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4)"
+    - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4, public)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From 6db15b052b460537407c0fcaa5370f4ab30cb1de Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 16 Jun 2026 12:18:38 -0700
Subject: [PATCH 20/33] =?UTF-8?q?fix:=20retrigger=20M3=20GB200=20sweep=20?=
 =?UTF-8?q?=E2=80=94=20previous=20run=20hit=20NATS=20infra=20failure?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous run (19006) failed with "NATS failed to start" on
watchtower-navy-cn01, a transient cluster infra issue unrelated to the
NixlConnector fix.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 400b314ff..5b355060d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3871,7 +3871,7 @@
     - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)."
     - "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8"
     - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)"
-    - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (built from cleanup/m3-mi300x-mxfp8 HEAD 78ef73bc4, public)"
+    - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (public, from cleanup/m3-mi300x-mxfp8@78ef73bc4)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
 
 - config-keys:

From 67f452159dbab67237f17a910b1fa041c3b98d57 Mon Sep 17 00:00:00 2001
From: Bryan Shan <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 16 Jun 2026 12:50:26 -0700
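Subject: [PATCH 21/33] Remove minimaxm3-fp8-gb200-dynamo-vllm initial entry
 from changelog

Removed the initial submission entry for minimaxm3-fp8-gb200-dynamo-vllm from
the changelog and marked the remaining NixlConnector fix entry for the same
config key as evals-only.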
---
 perf-changelog.yaml | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ba635f2da..5b1f07c9f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3665,19 +3665,6 @@
     - "Serves from the launch_b300-nv.sh MODEL/MODEL_PATH split (model not in the SRE-staged /scratch/models list -> writable /data/models)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1724
 
-- config-keys:
-    - minimaxm3-fp8-gb200-dynamo-vllm
-  description:
-    - "Initial submission: MiniMax-M3 MXFP8 disaggregated rack-scale wide-EP vLLM sweep for GB200 via Dynamo"
-    - "Model: MiniMaxAI/MiniMax-M3-MXFP8 (427B total / 26B active MoE, 128 routed experts top-4, MSA sparse attention, ~444 GB MXFP8 checkpoint)"
-    - "Image: vllm/vllm-openai:minimax-m3, rebuilt from m3_release HEAD 022448dd (vllm-project/vllm#45381, gates trtllm-gen page>=128); dynamo.install=true + wheel 1.2.0.dev20260526 layers in the dynamo runtime + NIXL"
-    - "FLASHINFER (trtllm-gen) attention on Blackwell + block-size 128 (MSA index-cache alignment); --language-model-only for text-only benchmarks"
-    - "Disaggregated prefill/decode over NixlConnector on the NVL72 NVLink fabric (UCX_CUDA_IPC_ENABLE_MNNVL=y, cuda_ipc UCX_TLS, NCCL_CUMEM_ENABLE=1 + enable-sleep-mode cuMem allocator, numa-bind)"
-    - "Rack-scale wide expert-parallel (deepseek-v4 megamoe style; DEP = DP-attn + EP): EP8/EP16 spans multiple GB200 nodes vs B200's 8-GPU NVLink island"
-    - "5 topologies, 1k1k + 8k1k: 1P1D TEP8 decode (3n, low-lat conc 1-64), 1P1D DEP8 (4n, conc 128-512), 1P1D DEP8->DEP16 (6n, conc 512-2048), 2P1D (8n, conc 2048-4096), 4P1D (12n, conc 4096-8192)"
-    - "TEP8 decode (enable-expert-parallel on TP8): 128 experts / 8 ranks = 16 experts/rank, ~19% lower ITL than pure TP8 at low conc; stream-interval 1 + max-cudagraph-capture-size 128 for interactivity"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
-
 - config-keys:
     - minimaxm3-fp8-b200-vllm
   description:
@@ -3873,6 +3860,7 @@
     - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)"
     - "Image: ghcr.io/semianalysisai/vllm-openai:minimax-m3-78ef73 (public, from cleanup/m3-mi300x-mxfp8@78ef73bc4)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
+  evals-only: true
 
 - config-keys:
     - minimaxm3-fp8-b200-vllm

From 211488bed966e39c3c369ef1373a8436278e6363 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 05:00:20 +0800
Subject: [PATCH 22/33] fix: switch M3 GB200 disagg to nightly, drop
 extra_mount workaround

Head_ratio fix merged upstream (vllm-project/vllm#45879).
nightly-aarch64 contains both M3 model support (#45381) and the
NixlConnector handshake fix (#45330).
---
 .github/configs/nvidia-master.yaml                         | 2 +-
 .../8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml                 | 6 +-----
 perf-changelog.yaml                                        | 7 ++-----
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index fa5b6fa04..6caf1bc11 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12623,7 +12623,7 @@ qwen3.5-fp4-b200-trt:
 # 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode),
 # 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts.
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:minimax-m3
+  image: vllm/vllm-openai:nightly-aarch64
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
index a0db43086..43838147a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -8,13 +8,9 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
-extra_mount:
-  - "/mnt/lustre01/users-public/sa-shared/patches/m3/worker.py:/usr/local/lib/python3.12/dist-packages/vllm/distributed/kv_transfer/kv_connector/v1/nixl/worker.py"
-  - "/mnt/lustre01/users-public/sa-shared/patches/m3/fused_allreduce_gemma_rms_norm.py:/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_allreduce_gemma_rms_norm.py"
-
 dynamo:
   install: true
   wheel: "1.2.0.dev20260526"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 01af177f5..2954594b3 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3854,11 +3854,8 @@
 - config-keys:
     - minimaxm3-fp8-gb200-dynamo-vllm
   description:
-    - "Fix NixlConnector handshake failure for hetero-TP disagg when num_kv_heads < decode TP (M3 TEP8: TP4 prefill → TP8 decode)"
-    - "Root cause: _validate_remote_agent_handshake used raw tp_ratio (8/4=2) for block_len validation, but M3 has only 4 KV heads — both sides have max(1,4//tp)=1 head/rank → same block_len. The assertion expected remote_len=131072 but got 65536, failing every handshake (0 KV transfers, gsm8k=0.0000)."
-    - "Fix: replace tp_ratio with head_ratio (remote_heads_per_rank // local_heads_per_rank) which correctly accounts for GQA replication — commit 78ef73b on cleanup/m3-mi300x-mxfp8"
-    - "Also includes norm_out=None MNNVL aliasing fix (commit 66a43ba)"
-    - "Runtime-patched via extra_mount: ARM64 base image vllm/vllm-openai:minimax-m3 + bind-mounted worker.py (head_ratio) and fused_allreduce_gemma_rms_norm.py (norm_out=None) from shared lustre"
+    - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround"
+    - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
   evals-only: true
 

From 492db6131560394fa619cb14242302e84c8de230 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 05:13:19 +0800
Subject: [PATCH 23/33] fix: append changelog entry at end to pass immutability
 gate

The validator requires new entries appended at the file end (byte prefix
must match origin/main exactly). The previous commit inserted mid-file,
shifting entry indices and triggering the immutability check.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 perf-changelog.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 75cb04b4d..093d2ee79 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3851,14 +3851,6 @@
     - "Runner script updated to support dsv4 model prefix with dynamo-trt framework on GB300"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1689
 
-- config-keys:
-    - minimaxm3-fp8-gb200-dynamo-vllm
-  description:
-    - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround"
-    - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
-  evals-only: true
-
 - config-keys:
     - minimaxm3-fp8-b200-vllm
   description:
@@ -3958,3 +3950,11 @@
     - "Update ISL=8192 search-space: TP8-only from conc=4-64, DPA from conc=128-1024 (previously conc=1-64 and DPA conc=64-512)"
     - "Update Applied TBO on high concurrencies"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1717
+
+- config-keys:
+    - minimaxm3-fp8-gb200-dynamo-vllm
+  description:
+    - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround"
+    - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
+  evals-only: true

From 37ab79b658fdf33efb977a093b9dec37643a65c3 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 06:17:17 +0800
Subject: [PATCH 24/33] feat: switch GB200 M3 to
 vllm/vllm-openai:minimax-m3-0618, run full sweep

Multi-arch image (arm64+amd64) with upstream head_ratio fix baked in.
Update all 14 GB200 disagg recipes (1k1k + 8k1k), nvidia-master.yaml,
and changelog entry (no longer evals-only).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml                             | 2 +-
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml   | 2 +-
 .../1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml                  | 2 +-
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml    | 2 +-
 .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml                    | 2 +-
 .../1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml                     | 2 +-
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml    | 2 +-
 .../1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml                  | 2 +-
 .../1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml                 | 2 +-
 .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml   | 2 +-
 .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml                  | 2 +-
 .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml    | 2 +-
 .../8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml                     | 2 +-
 .../8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml                  | 2 +-
 .../8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml                 | 2 +-
 perf-changelog.yaml                                            | 3 +--
 16 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 9b3477ec5..0d8ef5c87 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12923,7 +12923,7 @@ qwen3.5-fp4-b200-trt:
 # 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode),
 # 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts.
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:nightly-aarch64
+  image: vllm/vllm-openai:minimax-m3-0618
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
index efc5d5740..79a2004bc 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -9,7 +9,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
index 5ca08a06d..3d1dde14f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
index f3e79340a..0e8d58332 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
index 147803c78..ffa7ec15a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
index 199699212..b7cbc4ded 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
index 1d1591198..0000a9648 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
@@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
index 853095727..877b2e235 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
index 4a6aa5d0f..c2aaaae8a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
index f6f2c7874..85b8ecc9a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
index 0d7d44843..94049a5b8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
index b0602354c..3476bf62f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -9,7 +9,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
index 43838147a..138a8ff71 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
index 6a0765c60..dd8ea9dc8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
index 9e4ff3c2b..dead061e8 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3"
+  container: "vllm/vllm-openai:minimax-m3-0618"
   precision: "fp8"
 
 dynamo:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 093d2ee79..5c4362e04 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3954,7 +3954,6 @@
 - config-keys:
     - minimaxm3-fp8-gb200-dynamo-vllm
   description:
-    - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround"
+    - "Switch to vllm/vllm-openai:minimax-m3-0618 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround"
     - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
-  evals-only: true

From 4723ca2cc7def4a81db0367682b9305f9b3a60c9 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 06:52:30 +0800
Subject: [PATCH 25/33] revert: switch GB200 M3 back to nightly-aarch64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

minimax-m3-0618 likely cherry-picks vLLM PR #45723 (gemm1_alpha for
FP8 TRT-LLM MoE) but ships flashinfer ≤0.6.13 which lacks that kwarg
(flashinfer PR #3504), causing TypeError at runtime.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml                              | 2 +-
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml    | 2 +-
 .../1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml                   | 2 +-
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml     | 2 +-
 .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml                     | 2 +-
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 2 +-
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml     | 2 +-
 .../1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml                   | 2 +-
 .../1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml                  | 2 +-
 .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml    | 2 +-
 .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml                   | 2 +-
 .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml     | 2 +-
 .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 2 +-
 .../8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml                   | 2 +-
 .../8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml                  | 2 +-
 perf-changelog.yaml                                             | 2 +-
 16 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 0d8ef5c87..9b3477ec5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12923,7 +12923,7 @@ qwen3.5-fp4-b200-trt:
 # 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode),
 # 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts.
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:minimax-m3-0618
+  image: vllm/vllm-openai:nightly-aarch64
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
index 79a2004bc..c930ca92b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -9,7 +9,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
index 3d1dde14f..d0f92214b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
index 0e8d58332..1c1a8f5ca 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
index ffa7ec15a..0662dd338 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
index b7cbc4ded..256f1f723 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
index 0000a9648..4552dfb01 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
@@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
index 877b2e235..de4f3ce22 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
index c2aaaae8a..cd978be55 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
index 85b8ecc9a..b0edec3a1 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
index 94049a5b8..d326ed74b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
index 3476bf62f..63bea1a22 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -9,7 +9,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
index 138a8ff71..43838147a 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -8,7 +8,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
index dd8ea9dc8..b8d2944a2 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
@@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
index dead061e8..3cc56e088 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
@@ -7,7 +7,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:minimax-m3-0618"
+  container: "vllm/vllm-openai:nightly-aarch64"
   precision: "fp8"
 
 dynamo:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 5c4362e04..b48027110 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3954,6 +3954,6 @@
 - config-keys:
     - minimaxm3-fp8-gb200-dynamo-vllm
   description:
-    - "Switch to vllm/vllm-openai:minimax-m3-0618 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround"
+    - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround"
     - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734

From c3030aaee9b6406d62fe6dba53a307b05ea172d7 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 13:27:02 +0800
Subject: [PATCH 26/33] feat: add --moe-backend marlin for TP-only GB200 M3
 disagg workers

Follow the PR #1809 pattern: use the Marlin MoE backend on TP-only
workers (no EP, no DP-attention). This touches 9 worker sections
(prefill and/or decode) across 6 recipes. Workers that use EP or
DP-attention keep the default MoE backend.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml     | 2 ++
 .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml                     | 1 +
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 1 +
 .../minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml     | 2 ++
 .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml     | 2 ++
 .../minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml | 1 +
 perf-changelog.yaml                                             | 1 +
 7 files changed, 10 insertions(+)

diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
index 1c1a8f5ca..049af1fa7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -69,6 +69,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      moe-backend: marlin
       pipeline-parallel-size: 1
       enforce-eager: true
       max-model-len: 2304
@@ -88,6 +89,7 @@ backend:
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      moe-backend: marlin
       pipeline-parallel-size: 1
       max-model-len: 2304
       max-num-seqs: 256
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
index 0662dd338..890b5a590 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
@@ -63,6 +63,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      moe-backend: marlin
       pipeline-parallel-size: 1
       enforce-eager: true
       max-model-len: 2304
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
index 256f1f723..8d63df4ab 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -63,6 +63,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      moe-backend: marlin
       pipeline-parallel-size: 1
       enforce-eager: true
       max-model-len: 2304
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
index 4552dfb01..de1488514 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
@@ -62,6 +62,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      moe-backend: marlin
       pipeline-parallel-size: 1
       enforce-eager: true
       max-model-len: 2304
@@ -81,6 +82,7 @@ backend:
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      moe-backend: marlin
       pipeline-parallel-size: 1
       max-model-len: 2304
       max-num-seqs: 256
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
index 63bea1a22..3b1ba4032 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
@@ -64,6 +64,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      moe-backend: marlin
       pipeline-parallel-size: 1
       enforce-eager: true
       max-model-len: 9472
@@ -83,6 +84,7 @@ backend:
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      moe-backend: marlin
       pipeline-parallel-size: 1
       max-model-len: 9472
       max-num-seqs: 256
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
index 43838147a..7074eaf13 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
@@ -63,6 +63,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      moe-backend: marlin
       pipeline-parallel-size: 1
       enforce-eager: true
       max-model-len: 9472
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index b48027110..97dec6ca7 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3956,4 +3956,5 @@
   description:
     - "Switch to vllm/vllm-openai:nightly-aarch64 — contains upstream head_ratio fix (vllm-project/vllm#45879), drop extra_mount workaround"
     - "NixlConnector handshake block_len validation now uses per-rank KV head ratio instead of tp_ratio, fixing GQA-replicated disagg (M3 TEP8: TP4→TP8, 4 KV heads)"
+    - "Add --moe-backend marlin for TP-only prefill/decode workers (no EP, no DP-attention) per PR #1809 pattern"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734

From 2ea30b385c363163e735e4b11ef1042c836c6828 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 15:40:36 +0800
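Subject: [PATCH 27/33] fix: scale MiniMax-M3 prefill to DEP8

Switch the 8k1k low-latency recipe from a TP4 prefill with Marlin MoE
(1 node) to a DEP8 prefill (TP1 + DP8 + EP, 8 GPUs / 2 nodes), growing
the deployment from 3 to 4 nodes. DEP8 shards the prefill MoE weights
across 8 GPUs so the model fits without Marlin repacking. The recipe is
renamed to disagg-gb200-1p1d-dep8-tp8-4n.yaml and nvidia-master.yaml is
updated to match.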
---
 .github/configs/nvidia-master.yaml            | 10 +++++-----
 ...aml => disagg-gb200-1p1d-dep8-tp8-4n.yaml} | 20 ++++++++++---------
 2 files changed, 16 insertions(+), 14 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-tp4-tp8-3n.yaml => disagg-gb200-1p1d-dep8-tp8-4n.yaml} (82%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index d57819de0..b2091b3d8 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -13016,15 +13016,15 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 8192
       osl: 1024
       search-space:
-      # Low latency 8k1k: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
+      # Low latency 8k1k: DEP8 prefill -> TEP8 decode (TP8+EP8), 4 nodes.
       - conc-list: [1, 2, 4, 8, 16]
         prefill:
           num-worker: 1
-          tp: 4
-          ep: 1
-          dp-attn: false
+          tp: 8
+          ep: 8
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml"
         decode:
           num-worker: 1
           tp: 8
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml
similarity index 82%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml
index 7074eaf13..d217c0ed9 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml
@@ -1,10 +1,10 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tp8-8k1k"
 
 # MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k,
-# wider decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8
-# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism
-# reduces per-step latency by spreading both attention and MoE across
-# 8 GPU over NVL72 NVLink.  FLASHINFER, block-size 128.
+# wider decode). Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) ->
+# NixlConnector -> Decode TEP8 (TP8+EP8, 2 nodes) = 4 nodes. DEP8
+# shards the prefill MoE weights across 8 GPUs so the model fits without
+# Marlin repacking. FLASHINFER attention, block-size 128.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,11 +26,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
+  prefill_nodes: 2
   decode_nodes: 2
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 4
+  gpus_per_prefill: 8
   gpus_per_decode: 8
 
 frontend:
@@ -62,9 +62,11 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      moe-backend: marlin
+      tensor-parallel-size: 1
       pipeline-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13346
+      enable-expert-parallel: true
       enforce-eager: true
       max-model-len: 9472
       max-num-seqs: 16

From fc4af8b5717a0721e498cbf0e1f9c293fb253c0d Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 16:00:06 +0800
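Subject: [PATCH 28/33] feat: redesign MiniMax-M3 GB200 decode tiers

Collapse the 8k1k search space from five tiers to three, all with DEP8
(TP1 + DP-attn + EP8) prefill:
- conc 8-64: 1P DEP8 + 1D TEP8 (TP8+EP8), 4 nodes
- conc 8-64: 1P DEP8 + 1D TP8 with Marlin MoE, 4 nodes
- conc 128-1024: 2P DEP8 + 7D DEP8, 18 nodes / 72 GPUs

Rename the recipes reused for the new tiers and delete the 8k1k
1P1D TP4, 1P1D DEP8->DEP16 and 4P1D DEP8->DEP16 recipes.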
---
 .github/configs/nvidia-master.yaml            |  73 ++++--------
 .../8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 111 ------------------
 ...ml => disagg-gb200-1p1d-dep8-tep8-4n.yaml} |  26 ++--
 .../8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml   |  14 +--
 .../8k1k/disagg-gb200-1p1d-tp4-2n.yaml        | 108 -----------------
 ...n.yaml => disagg-gb200-2p7d-dep8-18n.yaml} |  19 ++-
 .../disagg-gb200-4p1d-dep8-dep16-12n.yaml     | 110 -----------------
 7 files changed, 46 insertions(+), 415 deletions(-)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-dep8-4n.yaml => disagg-gb200-1p1d-dep8-tep8-4n.yaml} (77%)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p1d-dep8-dep16-8n.yaml => disagg-gb200-2p7d-dep8-18n.yaml} (84%)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b2091b3d8..21af1ea8d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12915,13 +12915,10 @@ qwen3.5-fp4-b200-trt:
 # block-size 128 mandatory (MSA index-cache alignment); FLASHINFER
 # (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd
 # (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image
-# from m3_release before running. Fully disaggregated, rack-scale wide-EP
-# GB200 sweep (NixlConnector P/D split over the NVL72 NVLink fabric). Mirrors
-# the deepseek-v4 "megamoe" ladder: DEP unit = DP-attn + expert-parallel
-# (DEP8 = 8 GPU / 2 nodes, DEP16 = 16 GPU / 4 nodes), with prefill workers
-# scaled 1P->4P. EP8/EP16 vs B200's 8-GPU NVLink island is the GB200 edge.
-# 1P1D TP4 (low conc), 1P1D DEP8 (mid), 1P1D DEP8->DEP16 (wide decode),
-# 2P1D / 4P1D DEP8->DEP16 (prefill-scaled max throughput). M3 = 128 experts.
+# from m3_release before running. The active 8k1k matrix compares DEP8
+# prefill with TEP8 and TP8+Marlin decode at low concurrency, then fills the
+# NVL72 rack with 2P+7D DEP8 for throughput. DEP8 is TP1 + DP-attn + EP8
+# across 8 GPU / 2 nodes. M3 has 128 routed experts.
 minimaxm3-fp8-gb200-dynamo-vllm:
   image: vllm/vllm-openai:nightly-aarch64
   model: MiniMaxAI/MiniMax-M3-MXFP8
@@ -13016,80 +13013,50 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 8192
       osl: 1024
       search-space:
-      # Low latency 8k1k: DEP8 prefill -> TEP8 decode (TP8+EP8), 4 nodes.
-      - conc-list: [1, 2, 4, 8, 16]
+      # Low conc: 1P DEP8 + 1D TEP8 (TP8+EP8), 4 nodes.
+      - conc-list: [8, 16, 32, 64]
         prefill:
           num-worker: 1
-          tp: 8
+          tp: 1
           ep: 8
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
 
-      # Mid curve 8k1k: 1P+1D DEP8, 4 nodes.
-      - conc-list: [32, 64, 128]
+      # Low conc: 1P DEP8 + 1D TP8 Marlin, 4 nodes.
+      - conc-list: [8, 16, 32, 64]
         prefill:
           num-worker: 1
-          tp: 8
+          tp: 1
           ep: 8
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml"
         decode:
           num-worker: 1
           tp: 8
-          ep: 8
-          dp-attn: true
-
-      # Wide decode 8k1k: DEP8 prefill -> DEP16 decode, 6 nodes.
-      - conc-list: [128, 256, 512]
-        prefill:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
+          ep: 1
+          dp-attn: false
 
-      # Prefill-scaled 8k1k: 2P+1D, 8 nodes.
-      - conc-list: [512, 1024]
+      # High conc: 2P DEP8 + 7D DEP8, 18 nodes / 72 GPU.
+      - conc-list: [128, 256, 512, 1024]
         prefill:
           num-worker: 2
-          tp: 8
+          tp: 1
           ep: 8
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml"
         decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
-
-      # Max throughput 8k1k: 4P+1D, 12 nodes.
-      - conc-list: [1024, 2048]
-        prefill:
-          num-worker: 4
-          tp: 8
+          num-worker: 7
+          tp: 1
           ep: 8
           dp-attn: true
-          additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-        decode:
-          num-worker: 1
-          tp: 16
-          ep: 16
-          dp-attn: true
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
deleted file mode 100644
index d326ed74b..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
+++ /dev/null
@@ -1,111 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-8k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide decode, 8k1k).
-# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16
-# (16 GPU / 4 nodes) = 6 nodes. Rack-scale decode throughput over NVL72
-# NVLink -- EP16 across 4 nodes is the regime B200 can't reach. M3 has
-# 128 routed experts: EP16 = 8 experts/rank. FLASHINFER, block-size 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 4
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 16
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13346
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 9472
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 16
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml
similarity index 77%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml
index b0edec3a1..3859ae520 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml
@@ -1,10 +1,8 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tep8-4n-8k1k"
 
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, 8k1k).
-# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
-# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
-# over the NVL72 NVLink fabric. M3 has 128 routed experts so EP8 shards
-# 16 experts/rank. FLASHINFER attention, block-size 128.
+# 1P DEP8 prefill (TP1 DP8 EP, 2 nodes) + 1D TEP8 decode
+# (TP8+EP8, 2 nodes) = 4 nodes. Low-concurrency latency config.
+# Marlin is intentionally not used on the DEP8 prefill.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -80,19 +78,17 @@ backend:
       no-enable-prefix-caching: true
       numa-bind: true
       enable-sleep-mode: true
-      stream-interval: 32
+      stream-interval: 1
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
+      tensor-parallel-size: 8
       pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 9472
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      max-cudagraph-capture-size: 512
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
       block-size: 128
       attention-backend: FLASHINFER
       language-model-only: true
@@ -102,10 +98,10 @@ backend:
       no-enable-prefix-caching: true
       numa-bind: true
       enable-sleep-mode: true
-      stream-interval: 128
+      stream-interval: 1
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "32x64x128"
+  concurrencies: "8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml
index d217c0ed9..528b90da9 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml
@@ -1,10 +1,8 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tp8-4n-8k1k"
 
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k,
-# wider decode). Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) ->
-# NixlConnector -> Decode TEP8 (TP8+EP8, 2 nodes) = 4 nodes. DEP8
-# shards the prefill MoE weights across 8 GPUs so the model fits without
-# Marlin repacking. FLASHINFER attention, block-size 128.
+# 1P DEP8 prefill (TP1 DP8 EP, 2 nodes) + 1D TP8 decode
+# (Marlin MoE, 2 nodes) = 4 nodes. Low-concurrency latency config.
+# Marlin is restricted to the pure TP8 decode worker.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -85,8 +83,8 @@ backend:
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 8
+      moe-backend: marlin
       pipeline-parallel-size: 1
-      enable-expert-parallel: true
       max-model-len: 9472
       max-num-seqs: 256
       max-num-batched-tokens: 256
@@ -106,4 +104,4 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1x2x4x8x16"
+  concurrencies: "8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
deleted file mode 100644
index 3b1ba4032..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ /dev/null
@@ -1,108 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-8k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, 8k1k).
-# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure
-# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where
-# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention,
-# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd).
-# Low-conc tuned: stream-interval 1, cudagraph cap 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      moe-backend: marlin
-      pipeline-parallel-size: 1
-      enforce-eager: true
-      max-model-len: 9472
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      moe-backend: marlin
-      pipeline-parallel-size: 1
-      max-model-len: 9472
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1x2x4x8x16"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml
similarity index 84%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml
index b8d2944a2..23900d1c0 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml
@@ -1,9 +1,8 @@
-name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p7d-dep8-18n-8k1k"
 
-# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled, 8k1k).
-# 2x Prefill DEP8 (4 nodes) -> NixlConnector -> 1x Decode DEP16
-# (4 nodes) = 8 nodes. Double prefill workers absorb 8k ISL compute;
-# rack-scale DEP16 decode across NVL72. FLASHINFER, block-size 128.
+# 2P DEP8 prefill (4 nodes) + 7D DEP8 decode (14 nodes) =
+# 18 nodes / 72 GPU. High-concurrency throughput config.
+# DEP8 is TP1 DP8 EP on both sides; Marlin is not used.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,11 +25,11 @@ resources:
   gpu_type: "gb200"
   gpus_per_node: 4
   prefill_nodes: 4
-  decode_nodes: 4
+  decode_nodes: 14
   prefill_workers: 2
-  decode_workers: 1
+  decode_workers: 7
   gpus_per_prefill: 8
-  gpus_per_decode: 16
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -85,7 +84,7 @@ backend:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
       pipeline-parallel-size: 1
-      data-parallel-size: 16
+      data-parallel-size: 8
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       max-model-len: 9472
@@ -107,4 +106,4 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "512x1024"
+  concurrencies: "128x256x512x1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
deleted file mode 100644
index 3cc56e088..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
+++ /dev/null
@@ -1,110 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-8k1k"
-
-# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput, 8k1k).
-# 4x Prefill DEP8 (8 nodes) -> NixlConnector -> 1x Decode DEP16
-# (4 nodes) = 12 nodes within one NVL72 rack. Maximises prefill
-# bandwidth for 8k ISL; rack-scale DEP16 decode. FLASHINFER, block-size 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 8
-  decode_nodes: 4
-  prefill_workers: 4
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 16
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13346
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 9472
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 16
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 8192
-  osl: 1024
-  concurrencies: "1024x2048"

From 4431440fdecb6b6519371832ca44742c79db7f51 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 19:00:18 +0800
Subject: [PATCH 29/33] feat: TEP4 prefill + B300-optimal decode for GB200 M3
 disagg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch all prefill from DEP8 (TP1 DP8 EP, 2 nodes) to TEP4
(TP4+EP4, 1 node), halving the per-worker node footprint. Decode
configs follow the optimal points from B300 run 27630519240 (spec=none):
- conc 8-32 (8k1k) / 16-32 (1k1k): TP4+Marlin (no EP)
- conc 64-256 (8k1k) / 64-512 (1k1k): TEP4 (TP4+EP4)
- conc 512 (8k1k): TEP8; conc 1024 (1k1k): DEP8. Both use 2 prefill
  + 8 decode workers on 18 nodes (2 prefill + 16 decode nodes, 72 GPU)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            | 147 +++++++-----------
 .../1k1k/disagg-gb200-1p1d-tep4-2n.yaml       | 105 +++++++++++++
 .../1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml   | 105 +++++++++++++
 .../disagg-gb200-2p8d-tep4-dep8-18n.yaml}     |  29 ++--
 ...4n.yaml => disagg-gb200-1p1d-tep4-2n.yaml} |  24 ++-
 ...aml => disagg-gb200-1p1d-tep4-tp4-2n.yaml} |  24 ++-
 .../8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml | 106 +++++++++++++
 7 files changed, 407 insertions(+), 133 deletions(-)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/{8k1k/disagg-gb200-2p7d-dep8-18n.yaml => 1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml} (81%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-dep8-tep8-4n.yaml => disagg-gb200-1p1d-tep4-2n.yaml} (82%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-dep8-tp8-4n.yaml => disagg-gb200-1p1d-tep4-tp4-2n.yaml} (82%)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 21af1ea8d..f85aeefa9 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12907,18 +12907,13 @@ qwen3.5-fp4-b200-trt:
 # MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
 # (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
-# tensor cores on Blackwell. Image is the multi-arch m3_release vLLM build
-# (vllm/vllm-openai:minimax-m3, vllm-project/vllm#45381); recipes set
-# dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime AND
-# NIXL are layered in at job start (the ai-dynamo vllm-runtime dev image
-# shipped without NIXL, so disagg workers crashed at NixlConnector init).
-# block-size 128 mandatory (MSA index-cache alignment); FLASHINFER
-# (trtllm-gen) attention to exploit Blackwell — needs vllm#45381 @ 022448dd
-# (m3_release HEAD: gates page>=128 on trtllm-gen GQA), so rebuild the image
-# from m3_release before running. The active 8k1k matrix compares DEP8
-# prefill with TEP8 and TP8+Marlin decode at low concurrency, then fills the
-# NVL72 rack with 2P+7D DEP8 for throughput. DEP8 is TP1 + DP-attn + EP8
-# across 8 GPU / 2 nodes. M3 has 128 routed experts.
+# tensor cores on Blackwell. Image is nightly-aarch64 (vLLM main); recipes
+# set dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime
+# AND NIXL are layered in at job start. block-size 128 mandatory (MSA
+# index-cache alignment); FLASHINFER attention. All prefill is TEP4
+# (TP4+EP4, 1 GB200 node). Decode configs mirror B300 single-node optimal
+# points (run 27630519240): TP4+Marlin at low conc, TEP4 at mid conc,
+# TEP8 (8k1k) / DEP8 (1k1k) at high conc with 8 decode workers.
 minimaxm3-fp8-gb200-dynamo-vllm:
   image: vllm/vllm-openai:nightly-aarch64
   model: MiniMaxAI/MiniMax-M3-MXFP8
@@ -12933,130 +12928,98 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     # - isl: 1024
     #   osl: 1024
     #   search-space:
-    #   # Low latency: 1P+1D TP4 prefill -> TEP8 decode (TP8+EP8), 3 nodes.
-    #   # EP splits 128 MoE experts across 8 decode ranks (16 each), cutting
-    #   # per-step latency ~19% vs pure TP8.  Matches B200 TEP8 topology.
-    #   - conc-list: [1, 2, 4, 8, 16, 32, 64]
+    #   # Low conc: 1P TEP4 + 1D TP4 Marlin, 2 nodes.
+    #   - conc-list: [16, 32]
     #     prefill:
     #       num-worker: 1
     #       tp: 4
-    #       ep: 1
+    #       ep: 4
     #       dp-attn: false
     #       additional-settings:
-    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml"
+    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml"
     #     decode:
     #       num-worker: 1
-    #       tp: 8
-    #       ep: 8
+    #       tp: 4
+    #       ep: 1
     #       dp-attn: false
     #
-    #   # Mid curve: 1P+1D DEP8 (DP-attn + EP8), 4 nodes.
-    #   - conc-list: [128, 256, 512]
+    #   # Mid conc: 1P TEP4 + 1D TEP4, 2 nodes.
+    #   - conc-list: [64, 128, 256, 512]
     #     prefill:
     #       num-worker: 1
-    #       tp: 8
-    #       ep: 8
-    #       dp-attn: true
-    #       additional-settings:
-    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml"
-    #     decode:
-    #       num-worker: 1
-    #       tp: 8
-    #       ep: 8
-    #       dp-attn: true
-    #
-    #   # Wide decode: 1P+1D DEP8 prefill -> DEP16 decode, 6 nodes.
-    #   - conc-list: [512, 1024, 2048]
-    #     prefill:
-    #       num-worker: 1
-    #       tp: 8
-    #       ep: 8
-    #       dp-attn: true
+    #       tp: 4
+    #       ep: 4
+    #       dp-attn: false
     #       additional-settings:
-    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml"
+    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml"
     #     decode:
     #       num-worker: 1
-    #       tp: 16
-    #       ep: 16
-    #       dp-attn: true
+    #       tp: 4
+    #       ep: 4
+    #       dp-attn: false
     #
-    #   # Prefill-scaled: 2P+1D, 8 nodes.
-    #   - conc-list: [2048, 4096]
+    #   # High conc: 2P TEP4 + 8D DEP8, 18 nodes / 72 GPU.
+    #   - conc-list: [1024]
     #     prefill:
     #       num-worker: 2
-    #       tp: 8
-    #       ep: 8
-    #       dp-attn: true
+    #       tp: 4
+    #       ep: 4
+    #       dp-attn: false
     #       additional-settings:
-    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml"
+    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml"
     #     decode:
-    #       num-worker: 1
-    #       tp: 16
-    #       ep: 16
-    #       dp-attn: true
-    #
-    #   # Max throughput: 4P+1D, 12 nodes.
-    #   - conc-list: [4096, 8192]
-    #     prefill:
-    #       num-worker: 4
-    #       tp: 8
+    #       num-worker: 8
+    #       tp: 1
     #       ep: 8
     #       dp-attn: true
-    #       additional-settings:
-    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml"
-    #     decode:
-    #       num-worker: 1
-    #       tp: 16
-    #       ep: 16
-    #       dp-attn: true
 
     - isl: 8192
       osl: 1024
       search-space:
-      # Low conc: 1P DEP8 + 1D TEP8 (TP8+EP8), 4 nodes.
-      - conc-list: [8, 16, 32, 64]
+      # Low conc: 1P TEP4 + 1D TP4 Marlin, 2 nodes.
+      - conc-list: [8, 16, 32]
         prefill:
           num-worker: 1
-          tp: 1
-          ep: 8
-          dp-attn: true
+          tp: 4
+          ep: 4
+          dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml"
         decode:
           num-worker: 1
-          tp: 8
-          ep: 8
+          tp: 4
+          ep: 1
           dp-attn: false
 
-      # Low conc: 1P DEP8 + 1D TP8 Marlin, 4 nodes.
-      - conc-list: [8, 16, 32, 64]
+      # Mid conc: 1P TEP4 + 1D TEP4, 2 nodes.
+      - conc-list: [64, 128, 256]
         prefill:
           num-worker: 1
-          tp: 1
-          ep: 8
-          dp-attn: true
+          tp: 4
+          ep: 4
+          dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml"
         decode:
           num-worker: 1
-          tp: 8
-          ep: 1
+          tp: 4
+          ep: 4
           dp-attn: false
 
-      # High conc: 2P DEP8 + 7D DEP8, 18 nodes / 72 GPU.
-      - conc-list: [128, 256, 512, 1024]
+      # High conc: 2P TEP4 + 8D TEP8, 18 nodes / 72 GPU.
+      - conc-list: [512]
         prefill:
           num-worker: 2
-          tp: 1
-          ep: 8
-          dp-attn: true
+          tp: 4
+          ep: 4
+          dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml"
         decode:
-          num-worker: 7
-          tp: 1
+          num-worker: 8
+          tp: 8
           ep: 8
-          dp-attn: true
+          dp-attn: false
 
 # MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
 # 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml
new file mode 100644
index 000000000..938bfe8cb
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml
@@ -0,0 +1,105 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-2n-1k1k"
+
+# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TEP4 decode (TP4+EP4,
+# 1 node) = 2 nodes. Mid-concurrency config. B300 optimal at
+# conc 64-512. EP splits 128 MoE experts across 4 decode ranks.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "64x128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
new file mode 100644
index 000000000..35a358f2b
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
@@ -0,0 +1,105 @@
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-tp4-2n-1k1k"
+
+# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TP4 decode (Marlin MoE,
+# 1 node) = 2 nodes. Low-concurrency latency config. B300 optimal
+# at conc 1-32. Marlin on TP-only decode (no EP).
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 2304
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      moe-backend: marlin
+      pipeline-parallel-size: 1
+      max-model-len: 2304
+      max-num-seqs: 256
+      max-num-batched-tokens: 256
+      max-cudagraph-capture-size: 128
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "16x32"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml
similarity index 81%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml
index 23900d1c0..265b26092 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p7d-dep8-18n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml
@@ -1,8 +1,9 @@
-name: "minimax-m3-vllm-disagg-gb200-2p7d-dep8-18n-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p8d-tep4-dep8-18n-1k1k"
 
-# 2P DEP8 prefill (4 nodes) + 7D DEP8 decode (14 nodes) =
-# 18 nodes / 72 GPU. High-concurrency throughput config.
-# DEP8 is TP1 DP8 EP on both sides; Marlin is not used.
+# 2P TEP4 prefill (2 nodes) + 8D DEP8 decode (16 nodes) = 18 nodes
+# / 72 GPU. High-concurrency throughput config. B300 optimal at
+# conc 1024 is DEP8 (TP1 DP8 EP, dp-attn). Each decode worker
+# spans 2 GB200 nodes (8 GPU) over NVL72 NVLink.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -24,11 +25,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 4
-  decode_nodes: 14
+  prefill_nodes: 2
+  decode_nodes: 16
   prefill_workers: 2
-  decode_workers: 7
-  gpus_per_prefill: 8
+  decode_workers: 8
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -60,13 +61,11 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
+      tensor-parallel-size: 4
       pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
-      max-model-len: 9472
+      max-model-len: 2304
       max-num-seqs: 16
       max-num-batched-tokens: 16384
       block-size: 128
@@ -87,7 +86,7 @@ backend:
       data-parallel-size: 8
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      max-model-len: 9472
+      max-model-len: 2304
       max-num-seqs: 512
       max-num-batched-tokens: 512
       max-cudagraph-capture-size: 512
@@ -104,6 +103,6 @@ backend:
 
 benchmark:
   type: "sa-bench"
-  isl: 8192
+  isl: 1024
   osl: 1024
-  concurrencies: "128x256x512x1024"
+  concurrencies: "1024"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml
similarity index 82%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml
index 3859ae520..405751955 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tep8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml
@@ -1,8 +1,8 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tep8-4n-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-2n-8k1k"
 
-# 1P DEP8 prefill (TP1 DP8 EP, 2 nodes) + 1D TEP8 decode
-# (TP8+EP8, 2 nodes) = 4 nodes. Low-concurrency latency config.
-# Marlin is intentionally not used on the DEP8 prefill.
+# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TEP4 decode (TP4+EP4,
+# 1 node) = 2 nodes. Mid-concurrency config. B300 optimal at
+# conc 64-256. EP splits 128 MoE experts across 4 decode ranks.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -24,12 +24,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
+  prefill_nodes: 1
+  decode_nodes: 1
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -60,10 +60,8 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
+      tensor-parallel-size: 4
       pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
       max-model-len: 9472
@@ -82,7 +80,7 @@ backend:
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 8
+      tensor-parallel-size: 4
       pipeline-parallel-size: 1
       enable-expert-parallel: true
       max-model-len: 9472
@@ -104,4 +102,4 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "8x16x32x64"
+  concurrencies: "64x128x256"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
similarity index 82%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
index 528b90da9..de35075fb 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep8-tp8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
@@ -1,8 +1,8 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-tp8-4n-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-tp4-2n-8k1k"
 
-# 1P DEP8 prefill (TP1 DP8 EP, 2 nodes) + 1D TP8 decode
-# (Marlin MoE, 2 nodes) = 4 nodes. Low-concurrency latency config.
-# Marlin is restricted to the pure TP8 decode worker.
+# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TP4 decode (Marlin MoE,
+# 1 node) = 2 nodes. Low-concurrency latency config. B300 optimal
+# at conc 1-32. Marlin on TP-only decode (no EP).
 
 model:
   path: "minimax-m3-mxfp8"
@@ -24,12 +24,12 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
+  prefill_nodes: 1
+  decode_nodes: 1
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
 
 frontend:
   type: dynamo
@@ -60,10 +60,8 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
+      tensor-parallel-size: 4
       pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13346
       enable-expert-parallel: true
       enforce-eager: true
       max-model-len: 9472
@@ -82,7 +80,7 @@ backend:
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 8
+      tensor-parallel-size: 4
       moe-backend: marlin
       pipeline-parallel-size: 1
       max-model-len: 9472
@@ -104,4 +102,4 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "8x16x32x64"
+  concurrencies: "8x16x32"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml
new file mode 100644
index 000000000..1e3cea453
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml
@@ -0,0 +1,106 @@
+name: "minimax-m3-vllm-disagg-gb200-2p8d-tep4-tep8-18n-8k1k"
+
+# 2P TEP4 prefill (2 nodes) + 8D TEP8 decode (16 nodes) = 18 nodes
+# / 72 GPU. High-concurrency throughput config. B300 optimal at
+# conc 512 is TEP8 (TP8+EP8). Each decode worker spans 2 GB200
+# nodes (8 GPU) over NVL72 NVLink.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 16
+  prefill_workers: 2
+  decode_workers: 8
+  gpus_per_prefill: 4
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      enforce-eager: true
+      max-model-len: 9472
+      max-num-seqs: 16
+      max-num-batched-tokens: 16384
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 32
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8
+      pipeline-parallel-size: 1
+      enable-expert-parallel: true
+      max-model-len: 9472
+      max-num-seqs: 512
+      max-num-batched-tokens: 512
+      max-cudagraph-capture-size: 512
+      block-size: 128
+      attention-backend: FLASHINFER
+      language-model-only: true
+      gpu-memory-utilization: 0.9
+      safetensors-load-strategy: "prefetch"
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      numa-bind: true
+      enable-sleep-mode: true
+      stream-interval: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512"

From 2ac0def1c4c38c5cf83652462ca0917054a8d948 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Fri, 19 Jun 2026 19:02:00 +0800
Subject: [PATCH 30/33] fix: cap decode workers at 2 for high-conc GB200 M3
 recipes

Rename the 2p8d/18n recipes to 2p2d/6n, cutting decode_workers from 8
to 2 and decode_nodes from 16 to 4: 2 prefill workers (2 nodes) +
2 decode workers (4 nodes) = 6 nodes total.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml                 | 14 +++++++-------
 ...8n.yaml => disagg-gb200-2p2d-tep4-dep8-6n.yaml} | 10 +++++-----
 ...8n.yaml => disagg-gb200-2p2d-tep4-tep8-6n.yaml} | 10 +++++-----
 3 files changed, 17 insertions(+), 17 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p8d-tep4-dep8-18n.yaml => disagg-gb200-2p2d-tep4-dep8-6n.yaml} (91%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p8d-tep4-tep8-18n.yaml => disagg-gb200-2p2d-tep4-tep8-6n.yaml} (91%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f85aeefa9..c073feb21 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12913,7 +12913,7 @@ qwen3.5-fp4-b200-trt:
 # index-cache alignment); FLASHINFER attention. All prefill is TEP4
 # (TP4+EP4, 1 GB200 node). Decode configs mirror B300 single-node optimal
 # points (run 27630519240): TP4+Marlin at low conc, TEP4 at mid conc,
-# TEP8 (8k1k) / DEP8 (1k1k) at high conc with 8 decode workers.
+# TEP8 (8k1k) / DEP8 (1k1k) at high conc with 2 decode workers.
 minimaxm3-fp8-gb200-dynamo-vllm:
   image: vllm/vllm-openai:nightly-aarch64
   model: MiniMaxAI/MiniMax-M3-MXFP8
@@ -12958,7 +12958,7 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     #       ep: 4
     #       dp-attn: false
     #
-    #   # High conc: 2P TEP4 + 8D DEP8, 18 nodes / 72 GPU.
+    #   # High conc: 2P TEP4 + 2D DEP8, 6 nodes / 24 GPU.
     #   - conc-list: [1024]
     #     prefill:
     #       num-worker: 2
@@ -12966,9 +12966,9 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     #       ep: 4
     #       dp-attn: false
     #       additional-settings:
-    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml"
+    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml"
     #     decode:
-    #       num-worker: 8
+    #       num-worker: 2
     #       tp: 1
     #       ep: 8
     #       dp-attn: true
@@ -13006,7 +13006,7 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 4
           dp-attn: false
 
-      # High conc: 2P TEP4 + 8D TEP8, 18 nodes / 72 GPU.
+      # High conc: 2P TEP4 + 2D TEP8, 6 nodes / 24 GPU.
       - conc-list: [512]
         prefill:
           num-worker: 2
@@ -13014,9 +13014,9 @@ minimaxm3-fp8-gb200-dynamo-vllm:
           ep: 4
           dp-attn: false
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml"
         decode:
-          num-worker: 8
+          num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml
similarity index 91%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml
index 265b26092..f3cd98459 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p8d-tep4-dep8-18n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-2p8d-tep4-dep8-18n-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p2d-tep4-dep8-6n-1k1k"
 
-# 2P TEP4 prefill (2 nodes) + 8D DEP8 decode (16 nodes) = 18 nodes
-# / 72 GPU. High-concurrency throughput config. B300 optimal at
+# 2P TEP4 prefill (2 nodes) + 2D DEP8 decode (4 nodes) = 6 nodes
+# / 24 GPU. High-concurrency throughput config. B300 optimal at
 # conc 1024 is DEP8 (TP1 DP8 EP, dp-attn). Each decode worker
 # spans 2 GB200 nodes (8 GPU) over NVL72 NVLink.
 
@@ -26,9 +26,9 @@ resources:
   gpu_type: "gb200"
   gpus_per_node: 4
   prefill_nodes: 2
-  decode_nodes: 16
+  decode_nodes: 4
   prefill_workers: 2
-  decode_workers: 8
+  decode_workers: 2
   gpus_per_prefill: 4
   gpus_per_decode: 8
 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml
similarity index 91%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml
index 1e3cea453..999ffa26d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p8d-tep4-tep8-18n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-2p8d-tep4-tep8-18n-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p2d-tep4-tep8-6n-8k1k"
 
-# 2P TEP4 prefill (2 nodes) + 8D TEP8 decode (16 nodes) = 18 nodes
-# / 72 GPU. High-concurrency throughput config. B300 optimal at
+# 2P TEP4 prefill (2 nodes) + 2D TEP8 decode (4 nodes) = 6 nodes
+# / 24 GPU. High-concurrency throughput config. B300 optimal at
 # conc 512 is TEP8 (TP8+EP8). Each decode worker spans 2 GB200
 # nodes (8 GPU) over NVL72 NVLink.
 
@@ -26,9 +26,9 @@ resources:
   gpu_type: "gb200"
   gpus_per_node: 4
   prefill_nodes: 2
-  decode_nodes: 16
+  decode_nodes: 4
   prefill_workers: 2
-  decode_workers: 8
+  decode_workers: 2
   gpus_per_prefill: 4
   gpus_per_decode: 8
 

From 748469a061d2c1b96da85c08a3daf4a0801defa2 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 20 Jun 2026 12:15:59 +0800
Subject: [PATCH 31/33] feat: adapt NV B300 PR #1863 disagg configs for GB200
 M3 sweep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace TEP4 prefill + B300-optimal decode recipes with NV's PR #1863
B300 dynamo-vllm disagg search matrix, adapted for GB200 NVL72
(4 GPU/node):

- All prefill switched to DEP2 (TP1 DP2 EP, 2 GPU/worker) — lighter
  per-worker footprint allows more prefill workers
- Decode types: TP4+Marlin, TEP8, DEP8, DEP4
- The 4p3d topology (3 decode workers) from PR #1863 is skipped
- 15 recipe files: 8 for 8k1k, 7 for 1k1k (both ISL scenarios are
  enabled in nvidia-master.yaml)
- vllm_config values follow PR #1863 (max-num-seqs up to 4096,
  max-cudagraph-capture-size up to 8192, max-num-batched-tokens up to
  16384)
- Prefill uses cudagraph (max-cudagraph-capture-size: 2048) instead
  of enforce-eager
- req_rate: inf for all benchmarks
- FLASHINFER attention, GB200 UCX env vars preserved

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml            | 273 +++++++++++++-----
 ...ml => disagg-gb200-1p1d-dep2-tep8-3n.yaml} |  68 ++---
 ...disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml} |  63 ++--
 .../1k1k/disagg-gb200-1p1d-dep8-4n.yaml       | 118 --------
 .../1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml | 117 --------
 .../1k1k/disagg-gb200-1p1d-tp4-2n.yaml        | 113 --------
 .../1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml   | 107 -------
 .../1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml    | 107 -------
 .../1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml  | 101 +++++++
 .../1k1k/disagg-gb200-1p2d-tp4-3n.yaml        | 106 -------
 .../1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml  | 101 +++++++
 .../1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml  |  99 +++++++
 .../1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml | 117 --------
 ...ml => disagg-gb200-2p2d-dep2-tep8-5n.yaml} |  67 ++---
 .../1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml  |  99 +++++++
 .../disagg-gb200-4p1d-dep8-dep16-12n.yaml     | 117 --------
 ...disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml} |  63 ++--
 ...ml => disagg-gb200-1p2d-dep2-dep8-5n.yaml} |  72 +++--
 .../8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml  | 101 +++++++
 ...ml => disagg-gb200-2p2d-dep2-tep8-5n.yaml} |  63 ++--
 .../8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml  | 101 +++++++
 .../8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml  |  99 +++++++
 .../8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml  | 101 +++++++
 .../8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml  |  99 +++++++
 24 files changed, 1282 insertions(+), 1190 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-tep4-2n.yaml => disagg-gb200-1p1d-dep2-tep8-3n.yaml} (65%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-tep4-tp4-2n.yaml => disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml} (68%)
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p2d-tep4-dep8-6n.yaml => disagg-gb200-2p2d-dep2-tep8-5n.yaml} (66%)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
 delete mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-tep4-tp4-2n.yaml => disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml} (68%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-tep4-2n.yaml => disagg-gb200-1p2d-dep2-dep8-5n.yaml} (64%)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p2d-tep4-tep8-6n.yaml => disagg-gb200-2p2d-dep2-tep8-5n.yaml} (67%)
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml
 create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index c073feb21..6d631c8cb 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12904,16 +12904,10 @@ qwen3.5-fp4-b200-trt:
       - { tp: 4, ep: 4, dp-attn: true, conc-list: [1024] }
       - { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] }
 
-# MiniMax-M3 (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
-# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
-# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX
-# tensor cores on Blackwell. Image is nightly-aarch64 (vLLM main); recipes
-# set dynamo.install=true + wheel 1.2.0.dev20260526 so the dynamo runtime
-# AND NIXL are layered in at job start. block-size 128 mandatory (MSA
-# index-cache alignment); FLASHINFER attention. All prefill is TEP4
-# (TP4+EP4, 1 GB200 node). Decode configs mirror B300 single-node optimal
-# points (run 27630519240): TP4+Marlin at low conc, TEP4 at mid conc,
-# TEP8 (8k1k) / DEP8 (1k1k) at high conc with 2 decode workers.
+# MiniMax-M3 GB200 disagg sweep — adapted from NV B300 PR #1863.
+# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8,
+# DEP8, DEP4. 4 GPU/node (GB200 NVL72). 4p3d (3 decode workers) skipped.
+# FLASHINFER attention. No kv-cache-dtype (GB200 default).
 minimaxm3-fp8-gb200-dynamo-vllm:
   image: vllm/vllm-openai:nightly-aarch64
   model: MiniMaxAI/MiniMax-M3-MXFP8
@@ -12925,96 +12919,231 @@ minimaxm3-fp8-gb200-dynamo-vllm:
   disagg: true
   scenarios:
     fixed-seq-len:
-    # - isl: 1024
-    #   osl: 1024
-    #   search-space:
-    #   # Low conc: 1P TEP4 + 1D TP4 Marlin, 2 nodes.
-    #   - conc-list: [16, 32]
-    #     prefill:
-    #       num-worker: 1
-    #       tp: 4
-    #       ep: 4
-    #       dp-attn: false
-    #       additional-settings:
-    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml"
-    #     decode:
-    #       num-worker: 1
-    #       tp: 4
-    #       ep: 1
-    #       dp-attn: false
-    #
-    #   # Mid conc: 1P TEP4 + 1D TEP4, 2 nodes.
-    #   - conc-list: [64, 128, 256, 512]
-    #     prefill:
-    #       num-worker: 1
-    #       tp: 4
-    #       ep: 4
-    #       dp-attn: false
-    #       additional-settings:
-    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml"
-    #     decode:
-    #       num-worker: 1
-    #       tp: 4
-    #       ep: 4
-    #       dp-attn: false
-    #
-    #   # High conc: 2P TEP4 + 2D DEP8, 6 nodes / 24 GPU.
-    #   - conc-list: [1024]
-    #     prefill:
-    #       num-worker: 2
-    #       tp: 4
-    #       ep: 4
-    #       dp-attn: false
-    #       additional-settings:
-    #       - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml"
-    #     decode:
-    #       num-worker: 2
-    #       tp: 1
-    #       ep: 8
-    #       dp-attn: true
-
-    - isl: 8192
+    - isl: 1024
       osl: 1024
       search-space:
-      # Low conc: 1P TEP4 + 1D TP4 Marlin, 2 nodes.
-      - conc-list: [8, 16, 32]
+      # 1p1d DEP2+TEP8, 3n: conc 4,16,64,128,4096
+      - conc-list: [4, 16, 64, 128, 4096]
         prefill:
           num-worker: 1
-          tp: 4
-          ep: 4
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
           dp-attn: false
+
+      # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16
+      - conc-list: [1, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml"
         decode:
           num-worker: 1
           tp: 4
           ep: 1
           dp-attn: false
 
-      # Mid conc: 1P TEP4 + 1D TEP4, 2 nodes.
-      - conc-list: [64, 128, 256]
+      # 1p2d DEP2+DEP4, 3n: conc 2048
+      - conc-list: [2048]
         prefill:
           num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml"
+        decode:
+          num-worker: 2
           tp: 4
           ep: 4
+          dp-attn: true
+
+      # 2p1d DEP2+DEP8, 3n: conc 512,4096
+      - conc-list: [512, 4096]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 2p1d DEP2+TEP8, 3n: conc 32
+      - conc-list: [32]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
           dp-attn: false
+
+      # 2p2d DEP2+TEP8, 5n: conc 16
+      - conc-list: [16]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+      # 3p2d DEP2+TEP8, 6n: conc 4
+      - conc-list: [4]
+        prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+    - isl: 8192
+      osl: 1024
+      search-space:
+      # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16
+      - conc-list: [1, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml"
         decode:
           num-worker: 1
           tp: 4
-          ep: 4
+          ep: 1
+          dp-attn: false
+
+      # 1p2d DEP2+DEP8, 5n: conc 128
+      - conc-list: [128]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 2p2d DEP2+DEP8, 5n: conc 256,512
+      - conc-list: [256, 512]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 2p2d DEP2+TEP8, 5n: conc 16
+      - conc-list: [16]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
           dp-attn: false
 
-      # High conc: 2P TEP4 + 2D TEP8, 6 nodes / 24 GPU.
+      # 3p2d DEP2+DEP8, 6n: conc 512
       - conc-list: [512]
         prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml"
+        decode:
           num-worker: 2
-          tp: 4
-          ep: 4
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 3p2d DEP2+TEP8, 6n: conc 32
+      - conc-list: [32]
+        prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
           dp-attn: false
+
+      # 4p2d DEP2+DEP8, 6n: conc 4096
+      - conc-list: [4096]
+        prefill:
+          num-worker: 4
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+
+      # 5p2d DEP2+TEP8, 7n: conc 4,64
+      - conc-list: [4, 64]
+        prefill:
+          num-worker: 5
+          tp: 2
+          ep: 2
+          dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml"
         decode:
           num-worker: 2
           tp: 8
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml
similarity index 65%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml
index 938bfe8cb..b4484c443 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml
@@ -1,8 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-2n-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tep8-fp8-1k1k"
 
-# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TEP4 decode (TP4+EP4,
-# 1 node) = 2 nodes. Mid-concurrency config. B300 optimal at
-# conc 64-512. EP splits 128 MoE experts across 4 decode ranks.
+# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8)
+# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -20,16 +19,15 @@ health_check:
   max_attempts: 720
   interval_seconds: 10
 
-
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
   prefill_nodes: 1
-  decode_nodes: 1
+  decode_nodes: 2
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -41,6 +39,7 @@ backend:
 
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -50,6 +49,7 @@ backend:
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -60,46 +60,40 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
+      tensor-parallel-size: 8
       enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "64x128x256x512"
+  concurrencies: "4x16x64x128x4096"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
similarity index 68%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
index 35a358f2b..a0f1bda01 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
@@ -1,8 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-tp4-2n-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tp4-marlin-fp8-1k1k"
 
-# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TP4 decode (Marlin MoE,
-# 1 node) = 2 nodes. Low-concurrency latency config. B300 optimal
-# at conc 1-32. Marlin on TP-only decode (no EP).
+# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin
+# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -20,7 +19,6 @@ health_check:
   max_attempts: 720
   interval_seconds: 10
 
-
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
@@ -28,7 +26,7 @@ resources:
   decode_nodes: 1
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 4
+  gpus_per_prefill: 2
   gpus_per_decode: 4
 
 frontend:
@@ -41,6 +39,7 @@ backend:
 
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -50,6 +49,7 @@ backend:
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -60,46 +60,41 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      enable-expert-parallel: false
       moe-backend: marlin
-      pipeline-parallel-size: 1
-      max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 2048
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "16x32"
+  concurrencies: "1x4x8x16"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
deleted file mode 100644
index c930ca92b..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-4n.yaml
+++ /dev/null
@@ -1,118 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-1k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (mid curve, wide EP).
-# Prefill DEP8 (DP-attn + EP across 8 GPU / 2 nodes) -> NixlConnector ->
-# Decode DEP8 (8 GPU / 2 nodes) = 4 nodes. Rack-scale expert parallel
-# over the NVL72 NVLink fabric -- the regime where GB200 pulls ahead of
-# B200 (capped at an 8-GPU NVLink island). M3 has 128 routed experts so
-# EP8 shards 16 experts/rank. FLASHINFER attention, block-size 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 2
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13346
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "128x256x512"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
deleted file mode 100644
index d0f92214b..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep8-dep16-6n.yaml
+++ /dev/null
@@ -1,117 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep8-dep16-1k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (wide-decode curve).
-# Prefill DEP8 (8 GPU / 2 nodes) -> NixlConnector -> Decode DEP16 (DP-attn
-# + EP across 16 GPU / 4 nodes) = 6 nodes. EP16 (8 experts/rank of 128)
-# spans the NVL72 fabric to maximize decode token throughput. FLASHINFER
-# attention, block-size 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 2
-  decode_nodes: 4
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 16
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13346
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 16
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "512x1024x2048"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
deleted file mode 100644
index 049af1fa7..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-2n.yaml
+++ /dev/null
@@ -1,113 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-1k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency curve).
-# Prefill (TP4, 1 node) -> NixlConnector -> Decode (TP4, 1 node). Pure
-# TP, no expert parallel: lowest TTFT/ITL at small concurrencies where
-# wide EP would leave DP ranks idle. FLASHINFER (trtllm-gen) attention,
-# block-size 128 (MSA index-cache alignment, vllm#45381 @ 022448dd).
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      moe-backend: marlin
-      pipeline-parallel-size: 1
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      moe-backend: marlin
-      pipeline-parallel-size: 1
-      max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1x2x4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
deleted file mode 100644
index 890b5a590..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tep4-2n.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tep4-1k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, decode
-# TEP4). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP4
-# (TP4+EP4, 1 node) = 2 nodes. Expert parallelism on decode splits 128
-# MoE experts across 4 ranks (32 each), reducing per-step MoE compute.
-# FLASHINFER, block-size 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 1
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      moe-backend: marlin
-      pipeline-parallel-size: 1
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1x2x4x8x16x32"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
deleted file mode 100644
index 8d63df4ab..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4-tp8-3n.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tp4-tp8-1k1k"
-
-# MiniMax-M3 disaggregated 1P+1D recipe for GB200 (low-latency, wider
-# decode). Prefill TP4 (1 node) -> NixlConnector -> Decode TEP8
-# (TP8+EP8, 2 nodes) = 3 nodes. Wider decode TP + expert parallelism
-# reduces per-step latency by spreading both attention and MoE across
-# 8 GPU over NVL72 NVLink.  FLASHINFER, block-size 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 2
-  prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 8
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      moe-backend: marlin
-      pipeline-parallel-size: 1
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 8
-      pipeline-parallel-size: 1
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1x2x4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml
new file mode 100644
index 000000000..70c71b647
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml
@@ -0,0 +1,101 @@
+name: "minimax-m3-vllm-disagg-gb200-1p2d-dep2-dep4-fp8-1k1k"
+
+# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP4 (TP1 DP4 EP)
+# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2048"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
deleted file mode 100644
index de1488514..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4-3n.yaml
+++ /dev/null
@@ -1,106 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-1p2d-tp4-1k1k"
-
-# MiniMax-M3 disaggregated 1P+2D recipe for GB200 (low-latency, more
-# decode workers). Prefill TP4 (1 node) -> NixlConnector -> 2x Decode
-# TP4 (2 nodes) = 3 nodes. Two decode workers halve the per-worker
-# batch, reducing ITL at low concurrency. FLASHINFER, block-size 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 1
-  decode_nodes: 2
-  prefill_workers: 1
-  decode_workers: 2
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      moe-backend: marlin
-      pipeline-parallel-size: 1
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      moe-backend: marlin
-      pipeline-parallel-size: 1
-      max-model-len: 2304
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "1x2x4x8x16x32x64"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml
new file mode 100644
index 000000000..1785bbe22
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml
@@ -0,0 +1,101 @@
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep2-dep8-fp8-1k1k"
+
+# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 (TP1 DP8 EP) decode
+# = 3 nodes (1 prefill node + 2 decode nodes). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x4096"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml
new file mode 100644
index 000000000..79c4f08f8
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml
@@ -0,0 +1,99 @@
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep2-tep8-fp8-1k1k"
+
+# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8) decode
+# = 3 nodes (1 prefill node + 2 decode nodes). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "32"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
deleted file mode 100644
index de4f3ce22..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8-dep16-8n.yaml
+++ /dev/null
@@ -1,117 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-2p1d-dep8-dep16-1k1k"
-
-# MiniMax-M3 disaggregated 2P+1D recipe for GB200 (prefill-scaled).
-# 2x Prefill DEP8 (8 GPU / 2 nodes each) -> NixlConnector -> Decode DEP16
-# (16 GPU / 4 nodes) = 8 nodes. Two wide prefill workers sustain prompt
-# ingest into a single wide decode at high concurrency. FLASHINFER
-# attention, block-size 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 4
-  decode_nodes: 4
-  prefill_workers: 2
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 16
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13346
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 16
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "2048x4096"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
similarity index 66%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
index f3cd98459..b47576e2b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-tep4-dep8-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
@@ -1,9 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-2p2d-tep4-dep8-6n-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-tep8-fp8-1k1k"
 
-# 2P TEP4 prefill (2 nodes) + 2D DEP8 decode (4 nodes) = 6 nodes
-# / 24 GPU. High-concurrency throughput config. B300 optimal at
-# conc 1024 is DEP8 (TP1 DP8 EP, dp-attn). Each decode worker
-# spans 2 GB200 nodes (8 GPU) over NVL72 NVLink.
+# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
+# decode = 5 nodes (1 prefill + 4 decode). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -21,15 +19,14 @@ health_check:
   max_attempts: 720
   interval_seconds: 10
 
-
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
+  prefill_nodes: 1
   decode_nodes: 4
   prefill_workers: 2
   decode_workers: 2
-  gpus_per_prefill: 4
+  gpus_per_prefill: 2
   gpus_per_decode: 8
 
 frontend:
@@ -42,6 +39,7 @@ backend:
 
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -51,6 +49,7 @@ backend:
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -61,48 +60,40 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
       stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13345
+      tensor-parallel-size: 8
       enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 128
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "16"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
new file mode 100644
index 000000000..91aff7587
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
@@ -0,0 +1,99 @@
+name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-tep8-fp8-1k1k"
+
+# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
+# decode = 6 nodes (2 prefill + 4 decode). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 4
+  prefill_workers: 3
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
deleted file mode 100644
index cd978be55..000000000
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-4p1d-dep8-dep16-12n.yaml
+++ /dev/null
@@ -1,117 +0,0 @@
-name: "minimax-m3-vllm-disagg-gb200-4p1d-dep8-dep16-1k1k"
-
-# MiniMax-M3 disaggregated 4P+1D recipe for GB200 (max throughput).
-# 4x Prefill DEP8 (8 GPU / 2 nodes each = 8 nodes) -> NixlConnector ->
-# Decode DEP16 (16 GPU / 4 nodes) = 12 nodes within one NVL72 rack. Max
-# prefill fan-in for the highest-concurrency points. FLASHINFER attention,
-# block-size 128.
-
-model:
-  path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
-  precision: "fp8"
-
-dynamo:
-  install: true
-  wheel: "1.2.0.dev20260526"
-
-slurm:
-  time_limit: "8:00:00"
-
-health_check:
-  max_attempts: 720
-  interval_seconds: 10
-
-
-resources:
-  gpu_type: "gb200"
-  gpus_per_node: 4
-  prefill_nodes: 8
-  decode_nodes: 4
-  prefill_workers: 4
-  decode_workers: 1
-  gpus_per_prefill: 8
-  gpus_per_decode: 16
-
-frontend:
-  type: dynamo
-  enable_multiple_frontends: false
-
-backend:
-  type: vllm
-  connector: null
-
-  prefill_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  decode_environment:
-    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
-    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
-    # NIXL/UCX KV transfer over the NVL72 NVLink fabric (cuda_ipc) instead
-    # of TCP: UCX_CUDA_IPC_ENABLE_MNNVL=y enables cross-node NVLink IPC and
-    # NCCL_CUMEM_ENABLE=1 cuMem-allocates buffers so they are IPC-exportable.
-    UCX_MEMTYPE_CACHE: "n"
-    UCX_MEMTYPE_REG_WHOLE: "n"
-    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
-    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
-    NCCL_CUMEM_ENABLE: "1"
-
-  vllm_config:
-    prefill:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 8
-      data-parallel-rpc-port: 13346
-      enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 2304
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 32
-
-    decode:
-      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 1
-      pipeline-parallel-size: 1
-      data-parallel-size: 16
-      data-parallel-rpc-port: 13345
-      enable-expert-parallel: true
-      max-model-len: 2304
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
-      trust-remote-code: true
-      no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 128
-
-benchmark:
-  type: "sa-bench"
-  isl: 1024
-  osl: 1024
-  concurrencies: "4096x8192"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
similarity index 68%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
index de35075fb..22614c41e 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-tp4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
@@ -1,8 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-tp4-2n-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tp4-marlin-fp8-8k1k"
 
-# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TP4 decode (Marlin MoE,
-# 1 node) = 2 nodes. Low-concurrency latency config. B300 optimal
-# at conc 1-32. Marlin on TP-only decode (no EP).
+# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin
+# decode = 2 nodes (1 prefill + 1 decode). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -20,7 +19,6 @@ health_check:
   max_attempts: 720
   interval_seconds: 10
 
-
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
@@ -28,7 +26,7 @@ resources:
   decode_nodes: 1
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 4
+  gpus_per_prefill: 2
   gpus_per_decode: 4
 
 frontend:
@@ -41,6 +39,7 @@ backend:
 
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -50,6 +49,7 @@ backend:
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -60,46 +60,41 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 9472
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 4
+      enable-expert-parallel: false
       moe-backend: marlin
-      pipeline-parallel-size: 1
-      max-model-len: 9472
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 2048
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "8x16x32"
+  concurrencies: "1x4x8x16"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml
similarity index 64%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml
index 405751955..0d3339356 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-tep4-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml
@@ -1,8 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-tep4-2n-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p2d-dep2-dep8-fp8-8k1k"
 
-# 1P TEP4 prefill (TP4+EP4, 1 node) + 1D TEP4 decode (TP4+EP4,
-# 1 node) = 2 nodes. Mid-concurrency config. B300 optimal at
-# conc 64-256. EP splits 128 MoE experts across 4 decode ranks.
+# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
+# decode = 5 nodes (1 prefill + 4 decode). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -20,16 +19,15 @@ health_check:
   max_attempts: 720
   interval_seconds: 10
 
-
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
   prefill_nodes: 1
-  decode_nodes: 1
+  decode_nodes: 4
   prefill_workers: 1
-  decode_workers: 1
-  gpus_per_prefill: 4
-  gpus_per_decode: 4
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
 
 frontend:
   type: dynamo
@@ -41,6 +39,7 @@ backend:
 
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -50,6 +49,7 @@ backend:
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -60,46 +60,42 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 9472
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-seqs: 256
-      max-num-batched-tokens: 256
-      max-cudagraph-capture-size: 128
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 1
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "64x128x256"
+  concurrencies: "128"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml
new file mode 100644
index 000000000..c2983a2e5
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml
@@ -0,0 +1,101 @@
+name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-dep8-fp8-8k1k"
+
+# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
+# decode = 5 nodes (1 prefill + 4 decode). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 4
+  prefill_workers: 2
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256x512"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
similarity index 67%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
index 999ffa26d..0b605388f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-tep4-tep8-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
@@ -1,9 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-2p2d-tep4-tep8-6n-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-tep8-fp8-8k1k"
 
-# 2P TEP4 prefill (2 nodes) + 2D TEP8 decode (4 nodes) = 6 nodes
-# / 24 GPU. High-concurrency throughput config. B300 optimal at
-# conc 512 is TEP8 (TP8+EP8). Each decode worker spans 2 GB200
-# nodes (8 GPU) over NVL72 NVLink.
+# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
+# decode = 5 nodes (1 prefill + 4 decode). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -21,15 +19,14 @@ health_check:
   max_attempts: 720
   interval_seconds: 10
 
-
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
+  prefill_nodes: 1
   decode_nodes: 4
   prefill_workers: 2
   decode_workers: 2
-  gpus_per_prefill: 4
+  gpus_per_prefill: 2
   gpus_per_decode: 8
 
 frontend:
@@ -42,6 +39,7 @@ backend:
 
   prefill_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -51,6 +49,7 @@ backend:
 
   decode_environment:
     VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
     UCX_MEMTYPE_CACHE: "n"
     UCX_MEMTYPE_REG_WHOLE: "n"
@@ -61,46 +60,40 @@ backend:
   vllm_config:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
-      tensor-parallel-size: 4
-      pipeline-parallel-size: 1
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
       enable-expert-parallel: true
-      enforce-eager: true
-      max-model-len: 9472
-      max-num-seqs: 16
-      max-num-batched-tokens: 16384
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
       stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
 
     decode:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 8
-      pipeline-parallel-size: 1
       enable-expert-parallel: true
-      max-model-len: 9472
-      max-num-seqs: 512
-      max-num-batched-tokens: 512
-      max-cudagraph-capture-size: 512
-      block-size: 128
-      attention-backend: FLASHINFER
-      language-model-only: true
-      gpu-memory-utilization: 0.9
-      safetensors-load-strategy: "prefetch"
       trust-remote-code: true
       no-enable-prefix-caching: true
-      numa-bind: true
-      enable-sleep-mode: true
-      stream-interval: 128
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
 
 benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "512"
+  concurrencies: "16"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml
new file mode 100644
index 000000000..2010dbf62
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml
@@ -0,0 +1,101 @@
+name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-dep8-fp8-8k1k"
+
+# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
+# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 4
+  prefill_workers: 3
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
new file mode 100644
index 000000000..c0dc7c26f
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
@@ -0,0 +1,99 @@
+name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-tep8-fp8-8k1k"
+
+# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
+# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 4
+  prefill_workers: 3
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "32"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml
new file mode 100644
index 000000000..669cad1d0
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml
@@ -0,0 +1,101 @@
+name: "minimax-m3-vllm-disagg-gb200-4p2d-dep2-dep8-fp8-8k1k"
+
+# 4P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
+# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 2
+  decode_nodes: 4
+  prefill_workers: 4
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml
new file mode 100644
index 000000000..4aec44f74
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml
@@ -0,0 +1,99 @@
+name: "minimax-m3-vllm-disagg-gb200-5p2d-dep2-tep8-fp8-8k1k"
+
+# 5P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
+# decode = 7 nodes (3P + 4D). Adapted from NV B300 PR #1863.
+
+model:
+  path: "minimax-m3-mxfp8"
+  container: "vllm/vllm-openai:nightly-aarch64"
+  precision: "fp8"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260526"
+
+slurm:
+  time_limit: "8:00:00"
+
+health_check:
+  max_attempts: 720
+  interval_seconds: 10
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  prefill_nodes: 3
+  decode_nodes: 4
+  prefill_workers: 5
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    NCCL_CUMEM_ENABLE: "1"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      attention-backend: FLASHINFER
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x64"
+  req_rate: "inf"

From d8e17d461325a7f233b99534cfb0714ba5a59cfa Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sun, 21 Jun 2026 08:47:56 +0800
Subject: [PATCH 32/33] fix: fit MiniMax-M3 sweep on GB200

---
 .github/configs/nvidia-master.yaml            | 124 +++++++++---------
 ...ml => disagg-gb200-1p1d-dep4-tep8-3n.yaml} |  12 +-
 ...disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml} |  12 +-
 ...ml => disagg-gb200-1p2d-dep4-dep4-3n.yaml} |  16 ++-
 ...ml => disagg-gb200-2p1d-dep4-dep8-4n.yaml} |  18 +--
 ...ml => disagg-gb200-2p1d-dep4-tep8-4n.yaml} |  14 +-
 ...ml => disagg-gb200-2p2d-dep4-tep8-6n.yaml} |  14 +-
 ...ml => disagg-gb200-3p2d-dep4-tep8-7n.yaml} |  14 +-
 ...disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml} |  12 +-
 ...ml => disagg-gb200-1p2d-dep4-dep8-5n.yaml} |  16 ++-
 ...ml => disagg-gb200-2p2d-dep4-dep8-6n.yaml} |  18 +--
 ...ml => disagg-gb200-2p2d-dep4-tep8-6n.yaml} |  14 +-
 ...ml => disagg-gb200-3p2d-dep4-dep8-7n.yaml} |  18 +--
 ...ml => disagg-gb200-3p2d-dep4-tep8-7n.yaml} |  14 +-
 ...ml => disagg-gb200-4p2d-dep4-dep8-8n.yaml} |  18 +--
 ...ml => disagg-gb200-5p2d-dep4-tep8-9n.yaml} |  14 +-
 16 files changed, 189 insertions(+), 159 deletions(-)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-dep2-tep8-3n.yaml => disagg-gb200-1p1d-dep4-tep8-3n.yaml} (89%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml => disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml} (89%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-1p2d-dep2-dep4-3n.yaml => disagg-gb200-1p2d-dep4-dep4-3n.yaml} (86%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p1d-dep2-dep8-3n.yaml => disagg-gb200-2p1d-dep4-dep8-4n.yaml} (86%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p1d-dep2-tep8-3n.yaml => disagg-gb200-2p1d-dep4-tep8-4n.yaml} (88%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-2p2d-dep2-tep8-5n.yaml => disagg-gb200-2p2d-dep4-tep8-6n.yaml} (88%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/{disagg-gb200-3p2d-dep2-tep8-6n.yaml => disagg-gb200-3p2d-dep4-tep8-7n.yaml} (88%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml => disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml} (89%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-1p2d-dep2-dep8-5n.yaml => disagg-gb200-1p2d-dep4-dep8-5n.yaml} (86%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p2d-dep2-dep8-5n.yaml => disagg-gb200-2p2d-dep4-dep8-6n.yaml} (86%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-2p2d-dep2-tep8-5n.yaml => disagg-gb200-2p2d-dep4-tep8-6n.yaml} (88%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-3p2d-dep2-dep8-6n.yaml => disagg-gb200-3p2d-dep4-dep8-7n.yaml} (86%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-3p2d-dep2-tep8-6n.yaml => disagg-gb200-3p2d-dep4-tep8-7n.yaml} (88%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-4p2d-dep2-dep8-6n.yaml => disagg-gb200-4p2d-dep4-dep8-8n.yaml} (86%)
 rename benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/{disagg-gb200-5p2d-dep2-tep8-7n.yaml => disagg-gb200-5p2d-dep4-tep8-9n.yaml} (88%)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 19215da03..cc22e03c5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -13305,9 +13305,9 @@ qwen3.5-fp4-b200-trt:
       - { tp: 8, ep: 8, dp-attn: true, conc-list: [256, 512, 1024] }
 
 # MiniMax-M3 GB200 disagg sweep — adapted from NV B300 PR #1863.
-# All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8,
+# All prefill DEP4 (TP1 DP4 EP, 4 GPU/worker). Decode: TP4+Marlin, TEP8,
 # DEP8, DEP4. 4 GPU/node (GB200 NVL72). 4p3d (3 decode workers) skipped.
-# FLASHINFER attention. No kv-cache-dtype (GB200 default).
+# FLASHINFER attention with FP8 KV cache, matching the validated GB300 sweep.
 minimaxm3-fp8-gb200-dynamo-vllm:
   image: vllm/vllm-openai:nightly-aarch64
   model: MiniMaxAI/MiniMax-M3-MXFP8
@@ -13322,105 +13322,105 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # 1p1d DEP2+TEP8, 3n: conc 4,16,64,128,4096
+      # 1p1d DEP4+TEP8, 3n: conc 4,16,64,128,4096
       - conc-list: [4, 16, 64, 128, 4096]
         prefill:
           num-worker: 1
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
 
-      # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16
+      # 1p1d DEP4+TP4 Marlin, 2n: conc 1,4,8,16
       - conc-list: [1, 4, 8, 16]
         prefill:
           num-worker: 1
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml"
         decode:
           num-worker: 1
           tp: 4
           ep: 1
           dp-attn: false
 
-      # 1p2d DEP2+DEP4, 3n: conc 2048
+      # 1p2d DEP4+DEP4, 3n: conc 2048
       - conc-list: [2048]
         prefill:
           num-worker: 1
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml"
         decode:
           num-worker: 2
           tp: 4
           ep: 4
           dp-attn: true
 
-      # 2p1d DEP2+DEP8, 3n: conc 512,4096
+      # 2p1d DEP4+DEP8, 4n: conc 512,4096
       - conc-list: [512, 4096]
         prefill:
           num-worker: 2
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: true
 
-      # 2p1d DEP2+TEP8, 3n: conc 32
+      # 2p1d DEP4+TEP8, 4n: conc 32
       - conc-list: [32]
         prefill:
           num-worker: 2
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml"
         decode:
           num-worker: 1
           tp: 8
           ep: 8
           dp-attn: false
 
-      # 2p2d DEP2+TEP8, 5n: conc 16
+      # 2p2d DEP4+TEP8, 6n: conc 16
       - conc-list: [16]
         prefill:
           num-worker: 2
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml"
         decode:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
 
-      # 3p2d DEP2+TEP8, 6n: conc 4
+      # 3p2d DEP4+TEP8, 7n: conc 4
       - conc-list: [4]
         prefill:
           num-worker: 3
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml"
         decode:
           num-worker: 2
           tp: 8
@@ -13430,120 +13430,120 @@ minimaxm3-fp8-gb200-dynamo-vllm:
     - isl: 8192
       osl: 1024
       search-space:
-      # 1p1d DEP2+TP4 Marlin, 2n: conc 1,4,8,16
+      # 1p1d DEP4+TP4 Marlin, 2n: conc 1,4,8,16
       - conc-list: [1, 4, 8, 16]
         prefill:
           num-worker: 1
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml"
         decode:
           num-worker: 1
           tp: 4
           ep: 1
           dp-attn: false
 
-      # 1p2d DEP2+DEP8, 5n: conc 128
+      # 1p2d DEP4+DEP8, 5n: conc 128
       - conc-list: [128]
         prefill:
           num-worker: 1
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml"
         decode:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: true
 
-      # 2p2d DEP2+DEP8, 5n: conc 256,512
+      # 2p2d DEP4+DEP8, 6n: conc 256,512
       - conc-list: [256, 512]
         prefill:
           num-worker: 2
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml"
         decode:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: true
 
-      # 2p2d DEP2+TEP8, 5n: conc 16
+      # 2p2d DEP4+TEP8, 6n: conc 16
       - conc-list: [16]
         prefill:
           num-worker: 2
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml"
         decode:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
 
-      # 3p2d DEP2+DEP8, 6n: conc 512
+      # 3p2d DEP4+DEP8, 7n: conc 512
       - conc-list: [512]
         prefill:
           num-worker: 3
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml"
         decode:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: true
 
-      # 3p2d DEP2+TEP8, 6n: conc 32
+      # 3p2d DEP4+TEP8, 7n: conc 32
       - conc-list: [32]
         prefill:
           num-worker: 3
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml"
         decode:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: false
 
-      # 4p2d DEP2+DEP8, 6n: conc 4096
+      # 4p2d DEP4+DEP8, 8n: conc 4096
       - conc-list: [4096]
         prefill:
           num-worker: 4
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml"
         decode:
           num-worker: 2
           tp: 8
           ep: 8
           dp-attn: true
 
-      # 5p2d DEP2+TEP8, 7n: conc 4,64
+      # 5p2d DEP4+TEP8, 9n: conc 4,64
       - conc-list: [4, 64]
         prefill:
           num-worker: 5
-          tp: 2
-          ep: 2
+          tp: 4
+          ep: 4
           dp-attn: true
           additional-settings:
-          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml"
+          - "CONFIG_FILE=recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml"
         decode:
           num-worker: 2
           tp: 8
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml
similarity index 89%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml
index b4484c443..38b463e79 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tep8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tep8-fp8-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tep8-fp8-1k1k"
 
-# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8)
-# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863.
+# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TEP8 (TP8+EP8)
+# 3 nodes (1P + 2D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,7 +26,7 @@ resources:
   decode_nodes: 2
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -85,6 +86,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-num-seqs: 4096
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
similarity index 89%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
index a0f1bda01..653683bc4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tp4-marlin-fp8-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-1k1k"
 
-# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin
-# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863.
+# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TP4 Marlin
+# 2 nodes (1P + 1D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,7 +26,7 @@ resources:
   decode_nodes: 1
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 4
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -86,6 +87,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-num-seqs: 4096
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml
similarity index 86%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml
index 70c71b647..ca884ade7 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep2-dep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-1p2d-dep2-dep4-fp8-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep4-fp8-1k1k"
 
-# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP4 (TP1 DP4 EP)
-# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863.
+# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP4 (TP1 DP4 EP)
+# 3 nodes (1P + 2D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,7 +26,7 @@ resources:
   decode_nodes: 2
   prefill_workers: 1
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 4
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -87,11 +88,12 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
-      max-num-seqs: 1024
+      max-num-seqs: 512
       max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 8192
+      max-cudagraph-capture-size: 2048
 
 benchmark:
   type: "sa-bench"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml
similarity index 86%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml
index 1785bbe22..10712e807 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-dep8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-2p1d-dep2-dep8-fp8-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-dep8-fp8-1k1k"
 
-# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D DEP8 (TP1 DP8 EP)
-# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863.
+# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D DEP8 (TP1 DP8 EP)
+# 4 nodes (2P + 2D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
+  prefill_nodes: 2
   decode_nodes: 2
   prefill_workers: 2
   decode_workers: 1
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -87,11 +88,12 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
-      max-num-seqs: 4096
+      max-num-seqs: 512
       max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 8192
+      max-cudagraph-capture-size: 2048
 
 benchmark:
   type: "sa-bench"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml
similarity index 88%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml
index 79c4f08f8..930ec860b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep2-tep8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-2p1d-dep2-tep8-fp8-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-tep8-fp8-1k1k"
 
-# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TEP8 (TP8+EP8)
-# decode = 3 nodes (1P + 2D). Adapted from NV B300 PR #1863.
+# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TEP8 (TP8+EP8)
+# 4 nodes (2P + 2D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
+  prefill_nodes: 2
   decode_nodes: 2
   prefill_workers: 2
   decode_workers: 1
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -85,6 +86,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-num-seqs: 4096
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
similarity index 88%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
index b47576e2b..c422781b4 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-tep8-fp8-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-1k1k"
 
-# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
-# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863.
+# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8)
+# 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
+  prefill_nodes: 2
   decode_nodes: 4
   prefill_workers: 2
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -85,6 +86,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-num-seqs: 4096
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
similarity index 88%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
index 91aff7587..58fb1952f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-tep8-fp8-1k1k"
+name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-1k1k"
 
-# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
-# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
+# 3P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8)
+# 7 nodes (3P + 4D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
+  prefill_nodes: 3
   decode_nodes: 4
   prefill_workers: 3
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -85,6 +86,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 2304
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-num-seqs: 4096
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
similarity index 89%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
index 22614c41e..2c94b75ed 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep2-tp4-marlin-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-1p1d-dep2-tp4-marlin-fp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-8k1k"
 
-# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 1D TP4 Marlin
-# decode = 2 nodes (1P + 1D). Adapted from NV B300 PR #1863.
+# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 1D TP4 Marlin
+# 2 nodes (1P + 1D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,7 +26,7 @@ resources:
   decode_nodes: 1
   prefill_workers: 1
   decode_workers: 1
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 4
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -86,6 +87,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-num-seqs: 1024
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml
similarity index 86%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml
index 0d3339356..236bf112b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep2-dep8-5n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-1p2d-dep2-dep8-fp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep8-fp8-8k1k"
 
-# 1P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
-# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863.
+# 1P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
+# 5 nodes (1P + 4D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -26,7 +26,7 @@ resources:
   decode_nodes: 4
   prefill_workers: 1
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -87,11 +88,12 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
-      max-num-seqs: 1024
+      max-num-seqs: 512
       max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 4096
+      max-cudagraph-capture-size: 2048
 
 benchmark:
   type: "sa-bench"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml
similarity index 86%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml
index c2983a2e5..5c2056418 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-dep8-5n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-dep8-fp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-dep8-fp8-8k1k"
 
-# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
-# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863.
+# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
+# 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
+  prefill_nodes: 2
   decode_nodes: 4
   prefill_workers: 2
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -87,11 +88,12 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
-      max-num-seqs: 1024
+      max-num-seqs: 512
       max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 4096
+      max-cudagraph-capture-size: 2048
 
 benchmark:
   type: "sa-bench"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
similarity index 88%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
index 0b605388f..9d6fea2e9 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep2-tep8-5n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-2p2d-dep2-tep8-fp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-8k1k"
 
-# 2P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
-# decode = 5 nodes (1P + 4D). Adapted from NV B300 PR #1863.
+# 2P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8)
+# 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 1
+  prefill_nodes: 2
   decode_nodes: 4
   prefill_workers: 2
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -85,6 +86,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-num-seqs: 1024
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml
similarity index 86%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml
index 2010dbf62..515c0e48b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-dep8-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-dep8-fp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-dep8-fp8-8k1k"
 
-# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
-# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
+# 3P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
+# 7 nodes (3P + 4D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
+  prefill_nodes: 3
   decode_nodes: 4
   prefill_workers: 3
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -87,11 +88,12 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
-      max-num-seqs: 1024
+      max-num-seqs: 512
       max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 4096
+      max-cudagraph-capture-size: 2048
 
 benchmark:
   type: "sa-bench"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
similarity index 88%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
index c0dc7c26f..ace2e4477 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep2-tep8-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-3p2d-dep2-tep8-fp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-8k1k"
 
-# 3P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
-# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
+# 3P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8)
+# 7 nodes (3P + 4D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
+  prefill_nodes: 3
   decode_nodes: 4
   prefill_workers: 3
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -85,6 +86,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-num-seqs: 1024
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml
similarity index 86%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml
index 669cad1d0..2453fe560 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep2-dep8-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-4p2d-dep2-dep8-fp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-4p2d-dep4-dep8-fp8-8k1k"
 
-# 4P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
-# decode = 6 nodes (2P + 4D). Adapted from NV B300 PR #1863.
+# 4P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D DEP8 (TP1 DP8 EP)
+# 8 nodes (4P + 4D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 2
+  prefill_nodes: 4
   decode_nodes: 4
   prefill_workers: 4
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -87,11 +88,12 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
-      max-num-seqs: 1024
+      max-num-seqs: 512
       max-num-batched-tokens: 16384
-      max-cudagraph-capture-size: 4096
+      max-cudagraph-capture-size: 2048
 
 benchmark:
   type: "sa-bench"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml
similarity index 88%
rename from benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml
rename to benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml
index 4aec44f74..418c65ba2 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep2-tep8-7n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml
@@ -1,7 +1,7 @@
-name: "minimax-m3-vllm-disagg-gb200-5p2d-dep2-tep8-fp8-8k1k"
+name: "minimax-m3-vllm-disagg-gb200-5p2d-dep4-tep8-fp8-8k1k"
 
-# 5P DEP2 prefill (TP1 DP2 EP, 2 GPU/worker) + 2D TEP8 (TP8+EP8)
-# decode = 7 nodes (3P + 4D). Adapted from NV B300 PR #1863.
+# 5P DEP4 prefill (TP1 DP4 EP, 4 GPU/worker) + 2D TEP8 (TP8+EP8)
+# 9 nodes (5P + 4D). Adapted from NV B300 PR #1863.
 
 model:
   path: "minimax-m3-mxfp8"
@@ -22,11 +22,11 @@ health_check:
 resources:
   gpu_type: "gb200"
   gpus_per_node: 4
-  prefill_nodes: 3
+  prefill_nodes: 5
   decode_nodes: 4
   prefill_workers: 5
   decode_workers: 2
-  gpus_per_prefill: 2
+  gpus_per_prefill: 4
   gpus_per_decode: 8
 
 frontend:
@@ -61,7 +61,7 @@ backend:
     prefill:
       kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
       tensor-parallel-size: 1
-      data-parallel-size: 2
+      data-parallel-size: 4
       data-parallel-rpc-port: 13345
       enable-expert-parallel: true
       trust-remote-code: true
@@ -70,6 +70,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-cudagraph-capture-size: 2048
@@ -85,6 +86,7 @@ backend:
       gpu-memory-utilization: 0.90
       max-model-len: 9472
       language-model-only: true
+      kv-cache-dtype: fp8
       attention-backend: FLASHINFER
       stream-interval: 32
       max-num-seqs: 1024

From 0c61503b4f5ed45d6f0ad493c27e7827ac84ec9b Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Tue, 23 Jun 2026 16:25:29 +0800
Subject: [PATCH 33/33] perf(gb200): refresh MiniMax-M3 vLLM image

---
 .github/configs/nvidia-master.yaml            |  2 +-
 .../configs/minimax-m3-gb200-vllm-fixes.sh    | 38 ++++++++++
 .../1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml  |  2 +-
 .../disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml |  2 +-
 .../1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml  |  2 +-
 .../1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml  |  2 +-
 .../1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml  |  2 +-
 .../1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml  |  2 +-
 .../1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml  |  2 +-
 .../disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml |  2 +-
 .../8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml  |  2 +-
 .../8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml  |  2 +-
 .../8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml  |  2 +-
 .../8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml  |  2 +-
 .../8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml  |  2 +-
 .../8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml  |  2 +-
 .../8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml  |  2 +-
 perf-changelog.yaml                           |  8 ++
 runners/launch_gb200-nv.sh                    | 73 +++++++++++++++----
 19 files changed, 121 insertions(+), 30 deletions(-)
 create mode 100755 benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index e0ef11dc5..93a377183 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -12109,7 +12109,7 @@ qwen3.5-fp4-b200-trt:
 # DEP8, DEP4. 4 GPU/node (GB200 NVL72). 4p3d (3 decode workers) skipped.
 # FLASHINFER attention with FP8 KV cache, matching the validated GB300 sweep.
 minimaxm3-fp8-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:nightly-aarch64
+  image: vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: gb200
diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh
new file mode 100755
index 000000000..c0eed0a51
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-gb200-vllm-fixes.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+set -euo pipefail  # abort on any command failure, unset variable, or failed pipeline stage
+
+python3 - <<'PYEOF'
+from importlib.util import find_spec
+from pathlib import Path
+
+spec = find_spec("vllm")  # look up the installed vllm package
+if not spec or not spec.origin:
+    raise RuntimeError("vllm is not installed")
+root = Path(spec.origin).parent  # directory holding the file the spec points at
+patches = {  # maps a file under the vllm package to its list of (old text, new text) edits
+    root / "distributed/device_communicators/flashinfer_all_reduce.py": [  # inserts a force_oneshot_support keyword argument between the two anchor lines
+        (
+            "            comm_backend=comm_backend,\n"
+            "            group=group,\n",
+            "            comm_backend=comm_backend,\n"
+            '            force_oneshot_support=backend == "mnnvl",\n'
+            "            group=group,\n",
+        ),
+    ],
+    root / "models/minimax_m3/nvidia/sparse_attention_msa.py": [  # appends .contiguous() to the prefill_topk slice
+        (
+            "            prefill_topk = topk[:, nd:num_tokens, :]\n",
+            "            prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n",
+        ),
+    ],
+}
+for path, edits in patches.items():
+    source = path.read_text()
+    for old, new in edits:
+        if new in source:  # replacement text already present: skip this edit
+            continue
+        if source.count(old) != 1:  # the anchor must occur exactly once, otherwise fail loudly
+            raise RuntimeError(f"missing or ambiguous patch anchor in {path}")
+        source = source.replace(old, new, 1)
+    path.write_text(source)  # written back unconditionally, even if every edit was skipped
+PYEOF
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml
index 38b463e79..74c2e2668 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tep8-3n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tep8-fp8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
index 653683bc4..324170080 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml
index ca884ade7..43ca4f723 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-1p2d-dep4-dep4-3n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep4-fp8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml
index 10712e807..a8e05c640 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-dep8-4n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-dep8-fp8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml
index 930ec860b..9aea9db19 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p1d-dep4-tep8-4n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p1d-dep4-tep8-fp8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
index c422781b4..9786b2306 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
index 58fb1952f..2d22a2437 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/1k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-1k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
index 2c94b75ed..d10b7866d 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p1d-dep4-tp4-marlin-2n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p1d-dep4-tp4-marlin-fp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml
index 236bf112b..1e386a693 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-1p2d-dep4-dep8-5n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-1p2d-dep4-dep8-fp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml
index 5c2056418..5e77b9e8f 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-dep8-6n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-dep8-fp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
index 9d6fea2e9..cda685755 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-2p2d-dep4-tep8-6n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-2p2d-dep4-tep8-fp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml
index 515c0e48b..55a0cfc58 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-dep8-7n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-dep8-fp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
index ace2e4477..ad5e1da1b 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4-tep8-7n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-3p2d-dep4-tep8-fp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml
index 2453fe560..8b9857c14 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-4p2d-dep4-dep8-8n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-4p2d-dep4-dep8-fp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml
index 418c65ba2..7a39d40dc 100644
--- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8/8k1k/disagg-gb200-5p2d-dep4-tep8-9n.yaml
@@ -5,7 +5,7 @@ name: "minimax-m3-vllm-disagg-gb200-5p2d-dep4-tep8-fp8-8k1k"
 
 model:
   path: "minimax-m3-mxfp8"
-  container: "vllm/vllm-openai:nightly-aarch64"
+  container: "vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
   precision: "fp8"
 
 dynamo:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4265d320b..3b3ad71c0 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4099,3 +4099,11 @@
     - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch."
     - "Backport vllm-project/vllm#45879 so NIXL validates heterogeneous-TP KV block lengths using the GQA KV-head ratio."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1893
+
+- config-keys:
+    - minimaxm3-fp8-gb200-dynamo-vllm
+  description:
+    - "Update the GB200 MiniMax-M3 Dynamo-vLLM image to vllm/vllm-openai:minimax-m3-perf-arm64-13.0.1-7a67223"
+    - "Allocate FlashInfer MNNVL workspace for one-shot all-reduce and materialize the MSA prefill top-k slice before CSR construction"
+    - "Preserve current Qwen3.5 and Kimi-K2.5 GB200 launcher paths while adding MiniMax-M3 shared-FS staging and atomic image import"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1734
diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
index 4017b1fd2..8ab7de40a 100755
--- a/runners/launch_gb200-nv.sh
+++ b/runners/launch_gb200-nv.sh
@@ -63,8 +63,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
     elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then
         export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5"
         export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8"
+    elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
+        export MODEL_PATH="/mnt/lustre01/models/MiniMax-M3-MXFP8"
+        export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8"
     else
-        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8"
+        echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8, minimaxm3/fp8"
         exit 1
     fi
 else
@@ -77,15 +80,22 @@ export SLURM_ACCOUNT="benchmark"
 
 NGINX_IMAGE="nginx:1.27.4"
 
-# === Cluster diagnostic probe (minimax only) ===
+# Exit 0 when MODEL_PREFIX is minimaxm2.5, minimaxm3, or kimik2.5; else exit 1.
+uses_watchtower_shared_fs() {
+    [[ "$MODEL_PREFIX" == "minimaxm2.5" \
+        || "$MODEL_PREFIX" == "minimaxm3" \
+        || "$MODEL_PREFIX" == "kimik2.5" ]]
+}
+
+# === Cluster diagnostic probe for watchtower-hosted sweeps ===
 # The gb200-nv_* runners may be hosted on different physical clusters
 # (e.g., the legacy NVIDIA Lustre cluster vs Oracle Cloud "watchtower").
 # Print enough info to identify the layout, then pick a writable
 # squash dir on a path that's also visible to compute nodes. Falls
 # back to the legacy sa-shared path so other configs are untouched.
 SQUASH_DIR="/mnt/lustre01/users-public/sa-shared"
-if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then
-    echo "=== cluster diagnostic (minimax sweep) ==="
+if uses_watchtower_shared_fs; then
+    echo "=== cluster diagnostic (watchtower sweep) ==="
     echo "USER=$(id -un) UID=$(id -u) GID=$(id -g) GROUPS=$(id -Gn)"
     echo "HOME=$HOME"
     echo "HOSTNAME=$(hostname -f 2>/dev/null || hostname)"
@@ -131,8 +141,27 @@ fi
 SQUASH_FILE="${SQUASH_DIR}/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 NGINX_SQUASH_FILE="${SQUASH_DIR}/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 
-enroot import -o $SQUASH_FILE docker://$IMAGE
-enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE
+# Concurrent matrix jobs import to the same shared-FS squash path, so imports
+# are serialized with flock. Each image is written to a temp file and renamed
+# into place only after enroot succeeds; readers never see a partial squash.
+import_squash() {
+    local squash="$1" image="$2" lock="${1}.lock"
+    (
+        exec 9>"$lock"
+        flock -w 1800 9 || { echo "Failed to acquire lock for $squash" >&2; exit 1; }
+        if unsquashfs -l "$squash" > /dev/null 2>&1; then
+            echo "Squash file already exists and is valid, skipping import: $squash"
+        else
+            # Drop stale temp files only; mv -f replaces any invalid squash.
+            rm -f "$squash".tmp.*
+            enroot import -o "${squash}.tmp.$$" "docker://$image" \
+                && mv -f "${squash}.tmp.$$" "$squash"
+        fi
+    ) || exit 1
+}
+
+import_squash "$SQUASH_FILE" "$IMAGE"
+import_squash "$NGINX_SQUASH_FILE" "$NGINX_IMAGE"
 
 export EVAL_ONLY="${EVAL_ONLY:-false}"
 
@@ -201,11 +230,12 @@ fi
 
 echo "Cloning srt-slurm repository..."
 SRT_REPO_DIR="srt-slurm"
+SRTCTL_SETUP_SCRIPT=""
 # On the watchtower (Oracle) gb200 cluster, /home/slurm-shared is not
 # cross-mounted to compute nodes. Put the srt-slurm workspace and staged
 # InferenceX checkout on a writable shared-FS path that compute can see.
 # Per-run-unique paths avoid races between parallel sweep jobs.
-if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then
+if uses_watchtower_shared_fs; then
     SHARED_BASE=""
     for cand in \
         /mnt/lustre01/users-public/sa-shared/gha-runs \
@@ -277,6 +307,16 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then
         echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2
         exit 1
     fi
+elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1
+    cd "$SRT_REPO_DIR" || exit 1
+    git checkout main || exit 1
+    mkdir -p recipes/vllm/minimax-m3-gb200-fp8 || exit 1
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3-gb200-fp8" recipes/vllm/minimax-m3-gb200-fp8 || exit 1
+    SRTCTL_SETUP_SCRIPT="minimax-m3-gb200-vllm-fixes.sh"
+    cp \
+        "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \
+        "configs/$SRTCTL_SETUP_SCRIPT" || exit 1
 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "kimik2.5" && $PRECISION == "fp4" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1
     cd "$SRT_REPO_DIR" || exit 1
@@ -306,7 +346,7 @@ source $HOME/.local/bin/env
 # under a head-node-only path, .venv/bin/python3 becomes a broken
 # symlink on compute. Pin the venv to /usr/bin/python3 — a system
 # path that exists at the same location on both head and compute.
-if [[ ($MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5") && -x /usr/bin/python3 ]]; then
+if uses_watchtower_shared_fs && [[ -x /usr/bin/python3 ]]; then
     uv venv --seed --python /usr/bin/python3
 else
     uv venv --seed
@@ -323,10 +363,10 @@ echo "Configs available at: $SRT_REPO_DIR/"
 
 # Create srtslurm.yaml for srtctl (used by both frameworks)
 SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm"
-# Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path
+# Watchtower-hosted sweeps: SRT_REPO_DIR was moved to a shared-FS path
 # above so srtctl's outputs/ directory (which lives under
 # SRTCTL_ROOT) is visible to compute nodes.
-if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then
+if uses_watchtower_shared_fs; then
     SRTCTL_ROOT="$SRT_REPO_DIR"
 fi
 echo "Creating srtslurm.yaml configuration..."
@@ -368,7 +408,7 @@ export INFMAX_WORKSPACE="$GITHUB_WORKSPACE"
 # can't see. Stage the relevant subset to shared FS and repoint
 # INFMAX_WORKSPACE there. rsync excludes the srt-slurm clone (already
 # on shared FS) and .git (not needed in container) for speed.
-if [[ $MODEL_PREFIX == "minimaxm2.5" || $MODEL_PREFIX == "kimik2.5" ]]; then
+if uses_watchtower_shared_fs; then
     SHARED_INFMAX_WORKSPACE="${SHARED_BASE}/infmax-workspace-${RUN_KEY}"
     mkdir -p "$SHARED_INFMAX_WORKSPACE" || exit 1
     rsync -a --delete \
@@ -393,11 +433,16 @@ if [[ ! -f "$CONFIG_PATH" ]]; then
 fi
 sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH"
 
+SRTCTL_APPLY_ARGS=(
+    -f "$CONFIG_PATH"
+    --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)"
+)
 if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then
-    SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1)
-else
-    SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
+    SRTCTL_APPLY_ARGS+=(--setup-script install-torchao.sh)
+elif [[ -n "$SRTCTL_SETUP_SCRIPT" ]]; then
+    SRTCTL_APPLY_ARGS+=(--setup-script "$SRTCTL_SETUP_SCRIPT")
 fi
+SRTCTL_OUTPUT=$(srtctl apply "${SRTCTL_APPLY_ARGS[@]}" 2>&1)
 echo "$SRTCTL_OUTPUT"
 
 JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+')