diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ffadf6b48..1b54016a8 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11604,6 +11604,232 @@ qwen3.5-fp8-h100-sglang-agentic:
       - { tp: 8, ep: 8, offloading: none,    conc-list: [1, 2, 4, 8, 12, 14, 16] }
       - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }
 
+minimaxm3-fp8-b300-dynamo-vllm:
+  image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: b300
+  precision: fp8
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - conc-list: [4, 16, 64, 128, 4096]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - conc-list: [1, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+      - conc-list: [2048]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 4
+          ep: 4
+          dp-attn: true
+      - conc-list: [512, 4096]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+      - conc-list: [32]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - conc-list: [16]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - conc-list: [4]
+        prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - conc-list: [128]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+      - conc-list: [256, 512]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+      - conc-list: [16]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - conc-list: [512]
+        prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+      - conc-list: [32]
+        prefill:
+          num-worker: 3
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - conc-list: [4096]
+        prefill:
+          num-worker: 4
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+      - conc-list: [4096]
+        prefill:
+          num-worker: 4
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml"
+        decode:
+          num-worker: 3
+          tp: 4
+          ep: 4
+          dp-attn: true
+      - conc-list: [4, 64]
+        prefill:
+          num-worker: 5
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml"
+        decode:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: false
+      - conc-list: [1, 4, 8, 16]
+        prefill:
+          num-worker: 1
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+
 # MiniMax-M3 GB300 disagg sweep — adapted from NV B300 PR #1863.
 # All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: TP4+Marlin, TEP8,
 # DEP8, DEP4. 4 GPU/node (GB300 NVL72). 4p3d (3 decode workers) skipped.
diff --git a/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh
new file mode 100755
index 000000000..02862bba3
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/configs/minimax-m3-vllm-fixes.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+python3 - <<'PYEOF'
+from importlib.util import find_spec
+from pathlib import Path
+
+spec = find_spec("vllm")
+if not spec or not spec.origin:
+    raise RuntimeError("vllm is not installed")
+root = Path(spec.origin).parent
+patches = {
+    root / "models/minimax_m3/nvidia/sparse_attention_msa.py": [
+        (
+            "            prefill_topk = topk[:, nd:num_tokens, :]\n",
+            "            prefill_topk = topk[:, nd:num_tokens, :].contiguous()\n",
+        ),
+    ],
+    root / "distributed/kv_transfer/kv_connector/v1/nixl/base_worker.py": [
+        (
+            "            for i, local_len in enumerate(self.block_len_per_layer):\n",
+            "            total_kv_heads = self.transfer_topo.total_num_kv_heads\n"
+            "            local_heads = self.transfer_topo.local_physical_heads\n"
+            "            remote_heads = max(1, total_kv_heads // remote_tp_size)\n"
+            "            for i, local_len in enumerate(self.block_len_per_layer):\n",
+        ),
+        (
+            "remote_len == (local_len * tp_ratio) // block_size_ratio,",
+            "remote_len == (local_len * remote_heads // local_heads) "
+            "// block_size_ratio,",
+        ),
+        (
+            "remote_len == local_len // (-tp_ratio),",
+            "remote_len == local_len * remote_heads // local_heads,",
+        ),
+    ],
+}
+for path, edits in patches.items():
+    source = path.read_text()
+    for old, new in edits:
+        if new in source:
+            continue
+        if source.count(old) != 1:
+            raise RuntimeError(f"missing or ambiguous patch anchor in {path}")
+        source = source.replace(old, new, 1)
+    path.write_text(source)
+PYEOF
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml
new file mode 100644
index 000000000..5bbb13362
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tep8-1k1k.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tep8-1k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4x16x64x128x4096"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml
new file mode 100644
index 000000000..49a60981e
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p1d-dep2-tp4-marlin-1k1k.yaml
@@ -0,0 +1,81 @@
+name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tp4-marlin-1k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 2
+  gpus_per_decode: 4
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  allow_prefill_decode_colocation: true
+
+  prefill_environment:
+    UCX_TLS: "cuda_ipc,cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:
+    UCX_TLS: "cuda_ipc,cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      tensor-parallel-size: 4
+      enable-expert-parallel: false
+      moe-backend: marlin
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 2048
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x4x8x16"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml
new file mode 100644
index 000000000..ef7e66d76
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/1p2d-dep2-dep4-1k1k.yaml
@@ -0,0 +1,81 @@
+name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep4-1k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 4
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      tensor-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "2048"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml
new file mode 100644
index 000000000..9f5aa341c
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-dep8-1k1k.yaml
@@ -0,0 +1,81 @@
+name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-dep8-1k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "512x4096"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml
new file mode 100644
index 000000000..42c6e7bbc
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p1d-dep2-tep8-1k1k.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-disagg-b300-2p1d-fp8-dep2-tep8-1k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "32"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml
new file mode 100644
index 000000000..3e701df05
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/2p2d-dep2-tep8-1k1k.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-1k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "16"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml
new file mode 100644
index 000000000..b9a1d1058
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/1k1k/3p2d-dep2-tep8-1k1k.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-1k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 3
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  decode_environment:
+    UCX_TLS: "cuda_copy,rc"
+    VLLM_FLOAT32_MATMUL_PRECISION: "high"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 2048
+
+    decode:
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 2304
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 4096
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 8192
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "4"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml
new file mode 100644
index 000000000..c98ad0b44
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p1d-dep2-tp4-marlin-8k1k.yaml
@@ -0,0 +1,81 @@
+name: "minimax-m3-vllm-disagg-b300-1p1d-fp8-dep2-tp4-marlin-8k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_prefill: 2
+  gpus_per_decode: 4
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  allow_prefill_decode_colocation: true
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_ipc,cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_ipc,cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      tensor-parallel-size: 4
+      enable-expert-parallel: false
+      moe-backend: marlin
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 2048
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x4x8x16"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml
new file mode 100644
index 000000000..bc4c449b2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/1p2d-dep2-dep8-8k1k.yaml
@@ -0,0 +1,81 @@
+name: "minimax-m3-vllm-disagg-b300-1p2d-fp8-dep2-dep8-8k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "128"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml
new file mode 100644
index 000000000..7a8ddd1a1
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-dep8-8k1k.yaml
@@ -0,0 +1,81 @@
+name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-dep8-8k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "256x512"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml
new file mode 100644
index 000000000..d00fb046e
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/2p2d-dep2-tep8-8k1k.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-disagg-b300-2p2d-fp8-dep2-tep8-8k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 2
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "16"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml
new file mode 100644
index 000000000..cf8736e14
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-dep8-8k1k.yaml
@@ -0,0 +1,81 @@
+name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-dep8-8k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 3
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "512"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml
new file mode 100644
index 000000000..9572688b2
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/3p2d-dep2-tep8-8k1k.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-disagg-b300-3p2d-fp8-dep2-tep8-8k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 3
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "32"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml
new file mode 100644
index 000000000..a0e050f1d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p2d-dep2-dep8-8k1k.yaml
@@ -0,0 +1,81 @@
+name: "minimax-m3-vllm-disagg-b300-4p2d-fp8-dep2-dep8-8k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 4
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024  # Per DP rank: 2 workers x DP8 = 16 ranks.
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml
new file mode 100644
index 000000000..6f765ab74
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/4p3d-dep2-dep4-8k1k.yaml
@@ -0,0 +1,81 @@
+name: "minimax-m3-vllm-disagg-b300-4p3d-fp8-dep2-dep4-8k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 2
+  prefill_workers: 4
+  decode_workers: 3
+  gpus_per_prefill: 2
+  gpus_per_decode: 4
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      tensor-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 512  # Per DP rank: 3 workers x DP4 = 12 ranks.
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4096"
+  req_rate: "inf"
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml
new file mode 100644
index 000000000..d40a33582
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8/8k1k/5p2d-dep2-tep8-8k1k.yaml
@@ -0,0 +1,79 @@
+name: "minimax-m3-vllm-disagg-b300-5p2d-fp8-dep2-tep8-8k1k"
+
+model:
+  path: "MiniMaxAI/MiniMax-M3-MXFP8"
+  container: "vllm/vllm-openai:minimax-m3-0618-x86_64-cu130"
+  precision: "fp8"
+
+resources:
+  gpu_type: "b300"
+  gpus_per_node: 8
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 5
+  decode_workers: 2
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-cudagraph-capture-size: 2048
+      max-num-batched-tokens: 16384
+
+    decode:
+      tensor-parallel-size: 8
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x64"
+  req_rate: "inf"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 38203f9f0..0157dfae5 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4034,6 +4034,19 @@
     - "No benchmark configuration change; reuse the exact 25-point fixed-sequence matrix and 2 eval jobs"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1869
 
+- config-keys:
+    - minimaxm3-fp8-b300-dynamo-vllm
+  description:
+    - "Add MiniMax-M3 MXFP8 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP."
+    - "Add local srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp8 and wire the B300 launcher to overlay them into srt-slurm."
+    - "Add right-sized TP4 decode variants with expert parallelism disabled and the Marlin MoE backend for selected low-concurrency 1k1k and 8k1k shapes."
+    - "Colocate the six-GPU TP4 prefill/decode pairs on one B300 node and enable CUDA IPC for NIXL KV transfer."
+    - "Patch the 0618 image's MiniMax M3 MSA prefill top-k slice to be contiguous before CSR construction."
+    - "Align 8k1k expert-parallel settings with the 1k1k recipes and correct the decode CUDA graph capture limit."
+    - "Backport NVIDIA/srt-slurm#38 to sanitize Slurm node-IP discovery output on the pinned submission branch."
+    - "Backport vllm-project/vllm#45879 so NIXL validates heterogeneous-TP KV block lengths using the GQA KV-head ratio."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1863
+
 - config-keys:
     - kimik2.5-fp4-gb300-dynamo-trt
   description:
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index a941860c0..5b92d6d9c 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -3,6 +3,8 @@
 # System-specific configuration for B300 NV Slurm cluster (sa-shared)
 SLURM_PARTITION="batch_1"
 SLURM_ACCOUNT="benchmark"
+# b300-018 repeatedly times out UCX/NIXL transfers; allow an empty override to disable this.
+MINIMAX_M3_SLURM_EXCLUDED_NODELIST="${MINIMAX_M3_SLURM_EXCLUDED_NODELIST-b300-018}"
 
 set -x
 
@@ -45,13 +47,17 @@ elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" && $FRAMEWORK == "
 elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then
     export MODEL_PATH="/data/models/MiniMax-M2.5"
     export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8"
+elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then
+    export MODEL_PATH="/data/models/MiniMax-M3-MXFP8"
+    export SRT_SLURM_MODEL_PREFIX="MiniMaxAI/MiniMax-M3-MXFP8"
 else
-    echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm"
+    echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm, minimaxm3-fp8 with dynamo-vllm"
     exit 1
 fi
 
 echo "Cloning srt-slurm repository..."
 SRT_REPO_DIR="srt-slurm"
+SRTCTL_SETUP_SCRIPT=""
 if [ -d "$SRT_REPO_DIR" ]; then
     echo "Removing existing $SRT_REPO_DIR..."
     rm -rf "$SRT_REPO_DIR"
@@ -79,6 +85,18 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS
     git checkout main
     mkdir -p recipes/vllm/minimax-m2.5-fp8
     cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8" recipes/vllm/minimax-m2.5-fp8
+elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
+    git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+    cd "$SRT_REPO_DIR" || exit 1
+    git checkout sa-submission-q2-2026
+    mkdir -p recipes/vllm/minimax-m3
+    cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3
+    SRTCTL_SETUP_SCRIPT="minimax-m3-vllm-fixes.sh"
+    # NVIDIA/srt-slurm#38
+    git show 22d46ba9971615016d2339c9ffbc7b4597accfad --format= -- src/srtctl/core/ip_utils/get_node_ip.sh | git apply - || exit 1
+    cp \
+        "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \
+        "configs/$SRTCTL_SETUP_SCRIPT"
 else
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR" || exit 1
@@ -161,7 +179,17 @@ fi
 
 # Override the job name in the config file with the runner name
 sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_FILE"
-SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
+if [[ "$MODEL_PREFIX" == "minimaxm3" && -n "$MINIMAX_M3_SLURM_EXCLUDED_NODELIST" ]]; then
+    sed -i "/^name:.*/a sbatch_directives:\n  exclude: \"${MINIMAX_M3_SLURM_EXCLUDED_NODELIST}\"" "$CONFIG_FILE"
+fi
+SRTCTL_APPLY_ARGS=(
+    -f "$CONFIG_FILE"
+    --tags "b300,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)"
+)
+if [[ -n "$SRTCTL_SETUP_SCRIPT" ]]; then
+    SRTCTL_APPLY_ARGS+=(--setup-script "$SRTCTL_SETUP_SCRIPT")
+fi
+SRTCTL_OUTPUT=$(srtctl apply "${SRTCTL_APPLY_ARGS[@]}" 2>&1)
 echo "$SRTCTL_OUTPUT"
 
 # Extract JOB_ID from srtctl output
@@ -174,6 +202,15 @@ if [ -z "$JOB_ID" ]; then
     exit 1
 fi
 
+if [[ "$MODEL_PREFIX" == "minimaxm3" && -n "$MINIMAX_M3_SLURM_EXCLUDED_NODELIST" ]]; then
+    SBATCH_SCRIPT="outputs/$JOB_ID/sbatch_script.sh"
+    if ! grep -Fq "#SBATCH --exclude=${MINIMAX_M3_SLURM_EXCLUDED_NODELIST}" "$SBATCH_SCRIPT"; then
+        echo "Error: Slurm node exclusion was not rendered in $SBATCH_SCRIPT" >&2
+        scancel "$JOB_ID" || true
+        exit 1
+    fi
+fi
+
 echo "Extracted JOB_ID: $JOB_ID"
 
 # Use the JOB_ID to find the logs directory