From 2d76732a8e11f8ca7d391b0e1caf2f567cbc6b5f Mon Sep 17 00:00:00 2001
From: pensieve-intern <pensieve-intern@noreply>
Date: Thu, 11 Jun 2026 13:57:08 +0000
Subject: [PATCH] =?UTF-8?q?[OMNIML-4964]=20cell=5Ft1=5Fd3=20=E2=80=94=20pe?=
 =?UTF-8?q?nsieve-intern=20agent=20draft?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cell_output.log                               | 787 ++++++++++++++++++
 metrics_output.log                            |  16 +
 .../_cells/Qwen3.5-4B_dflash_vllm_t1_d3.yaml  |   4 +
 .../specdec_bench_dflash_vllm_t1_d3.yaml      |  73 ++
 4 files changed, 880 insertions(+)
 create mode 100644 cell_output.log
 create mode 100644 metrics_output.log
 create mode 100644 tools/launcher/common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3.yaml
 create mode 100644 tools/launcher/examples/Qwen3.5/Qwen3.5-4B/specdec_bench_dflash_vllm_t1_d3.yaml

diff --git a/cell_output.log b/cell_output.log
new file mode 100644
index 00000000000..26e748ef80f
--- /dev/null
+++ b/cell_output.log
@@ -0,0 +1,787 @@
+warning: `VIRTUAL_ENV=/tmp/builds/YQxxH4yPp/0/omniml/integration/nmm-sandbox/.venv-intern-agent` does not match the project environment path `.venv` and will be ignored; use `--active` to target the active environment instead
+Using CPython 3.12.13 interpreter at: /usr/local/bin/python
+Creating virtual environment at: .venv
+warning: No `requires-python` value found in the workspace. Defaulting to `>=3.12`.
+   Updating https://github.com/NVIDIA-NeMo/Run (HEAD)
+    Updated https://github.com/NVIDIA-NeMo/Run (1e26b6a98a756575c10a9a0ea9661fac0c7ad776)
+warning: Failed to hardlink files; falling back to full copy. This may lead to degraded performance.
+         If the cache and target directories are on different filesystems, hardlinking may not be supported.
+         If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.
+Installed 149 packages in 2.81s
+Configuring global options
+Dry run for task __main__:cicd
+Resolved Arguments
+┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Argument Name    ┃ Resolved Value                                            ┃
+┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ detach           │ True                                                      │
+│ hf_local         │ None                                                      │
+│ identity         │ '/.ssh/id_ed25519'                                        │
+│ job_dir          │ '/lustre/fsw/portfolios/coreai/users/chenhany/experiment… │
+│ job_name         │ 'Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3'              │
+│ pipeline         │ SandboxPipeline(                                          │
+│                  │   global_vars=GlobalVariables(hf_model='/hf-local/Qwen/Q… │
+│                  │   task_0=SandboxTask0(                                    │
+│                  │     script='common/specdec_bench/run.sh',                 │
+│                  │     slurm_config=SlurmConfig(                             │
+│                  │       host='cw-dfw-cs-001-login-01.nvidia.com',           │
+│                  │       account='coreai_dlalgo_modelopt',                   │
+│                  │       partition='batch',                                  │
+│                  │       container='vllm/vllm-openai:qwen3_5-cu130',         │
+│                  │       modelopt_install_path='/usr/local/lib/python3.12/d… │
+│                  │       container_mounts=['/lustre/fsw/portfolios/coreai/p… │
+│                  │ '/lustre:/lustre', '/cm:/cm',                             │
+│                  │ '/var/run/munge:/var/run/munge'],                         │
+│                  │       srun_args=['--no-container-mount-home'],            │
+│                  │       array=None,                                         │
+│                  │       nodes=1,                                            │
+│                  │       ntasks_per_node=1,                                  │
+│                  │       gpus_per_node=2),                                   │
+│                  │     args=['--dataset speed', '--dataset_path              │
+│                  │ /hf-local/nvidia/SPEED-Bench-Internal/qualitative',       │
+│                  │ '--engine VLLM', '--speculative_algorithm DFlash',        │
+│                  │ '--draft_length 3', '--runtime_params                     │
+│                  │ common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3… │
+│                  │ '--tp_size 2', '--ep_size 1', '--concurrency 32',         │
+│                  │ '--output_length 4096', '--aa_timing', '--show_progress', │
+│                  │ '--save_dir                                               │
+│                  │ /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/qualitative'], │
+│                  │     environment=[{'HF_MODEL_CKPT':                        │
+│                  │ '<<global_vars.hf_model>>'}, {'HF_LOCAL': '/hf-local'}]), │
+│                  │   task_1=SandboxTask1(                                    │
+│                  │     script='common/specdec_bench/run.sh',                 │
+│                  │     slurm_config=SlurmConfig(                             │
+│                  │       host='cw-dfw-cs-001-login-01.nvidia.com',           │
+│                  │       account='coreai_dlalgo_modelopt',                   │
+│                  │       partition='batch',                                  │
+│                  │       container='vllm/vllm-openai:qwen3_5-cu130',         │
+│                  │       modelopt_install_path='/usr/local/lib/python3.12/d… │
+│                  │       container_mounts=['/lustre/fsw/portfolios/coreai/p… │
+│                  │ '/lustre:/lustre', '/cm:/cm',                             │
+│                  │ '/var/run/munge:/var/run/munge'],                         │
+│                  │       srun_args=['--no-container-mount-home'],            │
+│                  │       array=None,                                         │
+│                  │       nodes=1,                                            │
+│                  │       ntasks_per_node=1,                                  │
+│                  │       gpus_per_node=2),                                   │
+│                  │     args=['--dataset speed', '--dataset_path              │
+│                  │ /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k',    │
+│                  │ '--engine VLLM', '--speculative_algorithm DFlash',        │
+│                  │ '--draft_length 3', '--runtime_params                     │
+│                  │ common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3… │
+│                  │ '--tp_size 2', '--ep_size 1', '--concurrency 8',          │
+│                  │ '--num_requests 80', '--output_length 4096',              │
+│                  │ '--aa_timing', '--show_progress', '--save_dir             │
+│                  │ /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/throughput_32… │
+│                  │     environment=[{'HF_MODEL_CKPT':                        │
+│                  │ '<<global_vars.hf_model>>'}, {'HF_LOCAL': '/hf-local'}])) │
+│ task             │ None                                                      │
+│ test_level       │ 0                                                         │
+│ user             │ 'chenhany'                                                │
+└──────────────────┴───────────────────────────────────────────────────────────┘
+Launching cicd...
+============================================================
+Version Report
+============================================================
+  Launcher                       e916b41      (main)
+  Model-Optimizer                16d562a0     (pensieve-intern/OMNIML-4961/cell-t1-d3)
+============================================================
+────────────── Entering Experiment cicd with id: cicd_1781183495 ───────────────
+job Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3 task 0 slurm_config: SlurmConfig(host='cw-dfw-cs-001-login-01.nvidia.com', port=22, account='coreai_dlalgo_modelopt', partition='batch', qos=None, container='vllm/vllm-openai:qwen3_5-cu130', modelopt_install_path='/usr/local/lib/python3.12/dist-packages/modelopt', container_mounts=['/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local:/hf-local', '/lustre:/lustre', '/cm:/cm', '/var/run/munge:/var/run/munge'], srun_args=['--no-container-mount-home'], array=None, nodes=1, ntasks_per_node=1, gpus_per_node=2, time='04:00:00', local=False)
+job Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3 task 1 slurm_config: SlurmConfig(host='cw-dfw-cs-001-login-01.nvidia.com', port=22, account='coreai_dlalgo_modelopt', partition='batch', qos=None, container='vllm/vllm-openai:qwen3_5-cu130', modelopt_install_path='/usr/local/lib/python3.12/dist-packages/modelopt', container_mounts=['/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local:/hf-local', '/lustre:/lustre', '/cm:/cm', '/var/run/munge:/var/run/munge'], srun_args=['--no-container-mount-home'], array=None, nodes=1, ntasks_per_node=1, gpus_per_node=2, time='04:00:00', local=False)
+find: ‘modules/Megatron-LM/megatron/*’: No such file or directory
+find: ‘modules/Megatron-LM/examples/*’: No such file or directory
+find: ‘modules/Megatron-LM/*.py’: No such file or directory
+find: ‘modules/Model-Optimizer-Internal/**’: No such file or directory
+find: ‘modules/Megatron-LM/megatron/*’: No such file or directory
+find: ‘modules/Megatron-LM/examples/*’: No such file or directory
+find: ‘modules/Megatron-LM/*.py’: No such file or directory
+find: ‘modules/Model-Optimizer-Internal/**’: No such file or directory
+[13:11:40] Connecting to                                           client.py:257
+           chenhany@cw-dfw-cs-001-login-01.nvidia.com                           
+[13:11:40] INFO     Connected (version 2.0, client             transport.py:1786
+                    OpenSSH_8.9p1)                                              
+[13:11:41] INFO     Authentication (publickey) successful!     transport.py:1786
+           INFO     rsyncing                                         rsync.py:37
+                    /tmp/pensieve-intern-agent-wsy7i9j7/workspace/ex            
+                    periments/cicd/cicd_1781183495 to                           
+                    /lustre/fsw/portfolios/coreai/users/chenhany/exp            
+                    eriments/cicd ...                                           
+[13:12:05] INFO     Successfully ran `rsync  -pthrvz  --rsh='ssh -i  rsync.py:93
+                    /.ssh/id_ed25519 -p 22 '                                    
+                    /tmp/pensieve-intern-agent-wsy7i9j7/workspace/ex            
+                    periments/cicd/cicd_1781183495                              
+                    chenhany@cw-dfw-cs-001-login-01.nvidia.com:/lust            
+                    re/fsw/portfolios/coreai/users/chenhany/experime            
+                    nts/cicd`                                                   
+[13:12:05] Launching job                                       experiment.py:800
+           Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0 for                     
+           experiment cicd                                                      
+           INFO     Launched app:                                launcher.py:116
+                    slurm_tunnel://nemo_run/12726706                            
+           Launching job                                       experiment.py:800
+           Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1 for                     
+           experiment cicd                                                      
+[SLURM] Job 12726706 - State: PENDING, Estimated start: N/A, Current time: 2026-06-11 13:12:06
+[13:12:06] INFO     Launched app:                                launcher.py:116
+                    slurm_tunnel://nemo_run/12726707                            
+────────────────── Detaching from Experiment cicd_1781183495. ──────────────────
+[13:12:06] Task specific cleanup won't be run.                experiment.py:1212
+           Ephemeral logs and artifacts may be lost.                            
+[SLURM] Job 12726707 - State: PENDING, Estimated start: N/A, Current time: 2026-06-11 13:12:06
+
+Experiment Status for cicd_1781183495
+
+Task 0: Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+- Status: SUBMITTED
+- Executor: SlurmExecutor on chenhany@cw-dfw-cs-001-login-01.nvidia.com
+- Job id: 12726706
+- Local Directory: /tmp/pensieve-intern-agent-wsy7i9j7/workspace/experiments/cicd/cicd_1781183495/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+- Remote Directory: /lustre/fsw/portfolios/coreai/users/chenhany/experiments/cicd/cicd_1781183495/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+
+Task 1: Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+- Status: SUBMITTED
+- Executor: SlurmExecutor on chenhany@cw-dfw-cs-001-login-01.nvidia.com
+- Job id: 12726707
+- Local Directory: /tmp/pensieve-intern-agent-wsy7i9j7/workspace/experiments/cicd/cicd_1781183495/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+- Remote Directory: /lustre/fsw/portfolios/coreai/users/chenhany/experiments/cicd/cicd_1781183495/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+
+                                                                                
+# The experiment was run with the following tasks: ['Qwen3.5-4B_specdec_bench_df
+# You can inspect and reconstruct this experiment at a later point in time using
+experiment = run.Experiment.from_id("cicd_1781183495")                          
+experiment.status() # Gets the overall status                                   
+experiment.logs("Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0") # Gets the log f
+experiment.cancel("Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0") # Cancels the 
+                                                                                
+                                                                                
+# You can inspect this experiment at a later point in time using the CLI as well
+nemo experiment status cicd_1781183495                                          
+nemo experiment logs cicd_1781183495 0                                          
+nemo experiment cancel cicd_1781183495 0                                        
+                                                                                
+Found 1 experiment(s): cicd_1781183495
+
+=== [2026-06-11 13:12:12] Polling iteration 1/320 ===
+  cicd_1781183495 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: RUNNING
+  cicd_1781183495 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: PENDING
+
+  Summary: 0 succeeded, 0 failed, 0 cancelled, 1 running, 1 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:15:15] Polling iteration 2/320 ===
+  cicd_1781183495 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: FAILED
+  cicd_1781183495 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: CANCELLED
+
+  Summary: 0 succeeded, 1 failed, 1 cancelled, 0 running, 0 pending
+
+All experiments complete.
+  SUCCEEDED: 0
+  FAILED: 1
+  CANCELLED: 1
+
+=== Fetching experiment logs ===
+Fetching logs: cicd_1781183495 task 0
+Fetching logs: cicd_1781183495 task 1
+=== Done fetching logs ===
+warning: `VIRTUAL_ENV=/tmp/builds/YQxxH4yPp/0/omniml/integration/nmm-sandbox/.venv-intern-agent` does not match the project environment path `.venv` and will be ignored; use `--active` to target the active environment instead
+warning: No `requires-python` value found in the workspace. Defaulting to `>=3.12`.
+Configuring global options
+Error processing argument 'pipeline.global_vars.hf_draft_model=/hf-local/z-lab/Qwen3.5-4B-DFlash': Invalid argument: No parameter named 'hf_draft_model' exists for <function cicd at 0x7effef6f3b00> (Argument: pipeline.global_vars.hf_draft_model=/hf-local/z-lab/Qwen3.5-4B-DFlash, Context: {'key': 'pipeline.global_vars.hf_draft_model', 'value': '/hf-local/z-lab/Qwen3.5-4B-DFlash'})
+Unexpected error: Invalid argument: No parameter named 'hf_draft_model' exists for <function cicd at 0x7effef6f3b00> (Argument: pipeline.global_vars.hf_draft_model=/hf-local/z-lab/Qwen3.5-4B-DFlash, Context: {'key': 'pipeline.global_vars.hf_draft_model', 'value': '/hf-local/z-lab/Qwen3.5-4B-DFlash'})
+warning: `VIRTUAL_ENV=/tmp/builds/YQxxH4yPp/0/omniml/integration/nmm-sandbox/.venv-intern-agent` does not match the project environment path `.venv` and will be ignored; use `--active` to target the active environment instead
+warning: No `requires-python` value found in the workspace. Defaulting to `>=3.12`.
+Configuring global options
+Dry run for task __main__:cicd
+Resolved Arguments
+┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Argument Name    ┃ Resolved Value                                            ┃
+┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ detach           │ True                                                      │
+│ hf_local         │ None                                                      │
+│ identity         │ '/.ssh/id_ed25519'                                        │
+│ job_dir          │ '/lustre/fsw/portfolios/coreai/users/chenhany/experiment… │
+│ job_name         │ 'Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3'              │
+│ pipeline         │ SandboxPipeline(                                          │
+│                  │   global_vars=GlobalVariables(hf_model='/hf-local/Qwen/Q… │
+│                  │   task_0=SandboxTask0(                                    │
+│                  │     script='common/specdec_bench/run.sh',                 │
+│                  │     slurm_config=SlurmConfig(                             │
+│                  │       host='cw-dfw-cs-001-login-01.nvidia.com',           │
+│                  │       account='coreai_dlalgo_modelopt',                   │
+│                  │       partition='batch',                                  │
+│                  │       container='vllm/vllm-openai:qwen3_5-cu130',         │
+│                  │       modelopt_install_path='/usr/local/lib/python3.12/d… │
+│                  │       container_mounts=['/lustre/fsw/portfolios/coreai/p… │
+│                  │ '/lustre:/lustre', '/cm:/cm',                             │
+│                  │ '/var/run/munge:/var/run/munge'],                         │
+│                  │       srun_args=['--no-container-mount-home'],            │
+│                  │       array=None,                                         │
+│                  │       nodes=1,                                            │
+│                  │       ntasks_per_node=1,                                  │
+│                  │       gpus_per_node=2),                                   │
+│                  │     args=['--dataset speed', '--dataset_path              │
+│                  │ /hf-local/nvidia/SPEED-Bench-Internal/qualitative',       │
+│                  │ '--engine VLLM', '--speculative_algorithm DFLASH',        │
+│                  │ '--draft_length 3', '--block_size 4', '--draft_model_dir  │
+│                  │ /hf-local/z-lab/Qwen3.5-4B-DFlash', '--runtime_params     │
+│                  │ common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3… │
+│                  │ '--tp_size 2', '--ep_size 1', '--concurrency 32',         │
+│                  │ '--output_length 4096', '--aa_timing', '--show_progress', │
+│                  │ '--save_dir                                               │
+│                  │ /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/qualitative'], │
+│                  │     environment=[{'HF_MODEL_CKPT':                        │
+│                  │ '<<global_vars.hf_model>>'}, {'HF_LOCAL': '/hf-local'}]), │
+│                  │   task_1=SandboxTask1(                                    │
+│                  │     script='common/specdec_bench/run.sh',                 │
+│                  │     slurm_config=SlurmConfig(                             │
+│                  │       host='cw-dfw-cs-001-login-01.nvidia.com',           │
+│                  │       account='coreai_dlalgo_modelopt',                   │
+│                  │       partition='batch',                                  │
+│                  │       container='vllm/vllm-openai:qwen3_5-cu130',         │
+│                  │       modelopt_install_path='/usr/local/lib/python3.12/d… │
+│                  │       container_mounts=['/lustre/fsw/portfolios/coreai/p… │
+│                  │ '/lustre:/lustre', '/cm:/cm',                             │
+│                  │ '/var/run/munge:/var/run/munge'],                         │
+│                  │       srun_args=['--no-container-mount-home'],            │
+│                  │       array=None,                                         │
+│                  │       nodes=1,                                            │
+│                  │       ntasks_per_node=1,                                  │
+│                  │       gpus_per_node=2),                                   │
+│                  │     args=['--dataset speed', '--dataset_path              │
+│                  │ /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k',    │
+│                  │ '--engine VLLM', '--speculative_algorithm DFLASH',        │
+│                  │ '--draft_length 3', '--block_size 4', '--draft_model_dir  │
+│                  │ /hf-local/z-lab/Qwen3.5-4B-DFlash', '--runtime_params     │
+│                  │ common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3… │
+│                  │ '--tp_size 2', '--ep_size 1', '--concurrency 8',          │
+│                  │ '--num_requests 80', '--output_length 4096',              │
+│                  │ '--aa_timing', '--show_progress', '--save_dir             │
+│                  │ /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/throughput_32… │
+│                  │     environment=[{'HF_MODEL_CKPT':                        │
+│                  │ '<<global_vars.hf_model>>'}, {'HF_LOCAL': '/hf-local'}])) │
+│ task             │ None                                                      │
+│ test_level       │ 0                                                         │
+│ user             │ 'chenhany'                                                │
+└──────────────────┴───────────────────────────────────────────────────────────┘
+Launching cicd...
+============================================================
+Version Report
+============================================================
+  Launcher                       e916b41      (main)
+  Model-Optimizer                16d562a0     (pensieve-intern/OMNIML-4961/cell-t1-d3)
+============================================================
+────────────── Entering Experiment cicd with id: cicd_1781183791 ───────────────
+job Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3 task 0 slurm_config: SlurmConfig(host='cw-dfw-cs-001-login-01.nvidia.com', port=22, account='coreai_dlalgo_modelopt', partition='batch', qos=None, container='vllm/vllm-openai:qwen3_5-cu130', modelopt_install_path='/usr/local/lib/python3.12/dist-packages/modelopt', container_mounts=['/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local:/hf-local', '/lustre:/lustre', '/cm:/cm', '/var/run/munge:/var/run/munge'], srun_args=['--no-container-mount-home'], array=None, nodes=1, ntasks_per_node=1, gpus_per_node=2, time='04:00:00', local=False)
+job Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3 task 1 slurm_config: SlurmConfig(host='cw-dfw-cs-001-login-01.nvidia.com', port=22, account='coreai_dlalgo_modelopt', partition='batch', qos=None, container='vllm/vllm-openai:qwen3_5-cu130', modelopt_install_path='/usr/local/lib/python3.12/dist-packages/modelopt', container_mounts=['/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local:/hf-local', '/lustre:/lustre', '/cm:/cm', '/var/run/munge:/var/run/munge'], srun_args=['--no-container-mount-home'], array=None, nodes=1, ntasks_per_node=1, gpus_per_node=2, time='04:00:00', local=False)
+find: ‘modules/Megatron-LM/megatron/*’: No such file or directory
+find: ‘modules/Megatron-LM/examples/*’: No such file or directory
+find: ‘modules/Megatron-LM/*.py’: No such file or directory
+find: ‘modules/Model-Optimizer-Internal/**’: No such file or directory
+find: ‘modules/Megatron-LM/megatron/*’: No such file or directory
+find: ‘modules/Megatron-LM/examples/*’: No such file or directory
+find: ‘modules/Megatron-LM/*.py’: No such file or directory
+find: ‘modules/Model-Optimizer-Internal/**’: No such file or directory
+[13:16:37] Connecting to                                           client.py:257
+           chenhany@cw-dfw-cs-001-login-01.nvidia.com                           
+[13:16:37] INFO     Connected (version 2.0, client             transport.py:1786
+                    OpenSSH_8.9p1)                                              
+           INFO     Authentication (publickey) successful!     transport.py:1786
+           INFO     rsyncing                                         rsync.py:37
+                    /tmp/pensieve-intern-agent-wsy7i9j7/workspace/ex            
+                    periments/cicd/cicd_1781183791 to                           
+                    /lustre/fsw/portfolios/coreai/users/chenhany/exp            
+                    eriments/cicd ...                                           
+[13:17:02] INFO     Successfully ran `rsync  -pthrvz  --rsh='ssh -i  rsync.py:93
+                    /.ssh/id_ed25519 -p 22 '                                    
+                    /tmp/pensieve-intern-agent-wsy7i9j7/workspace/ex            
+                    periments/cicd/cicd_1781183791                              
+                    chenhany@cw-dfw-cs-001-login-01.nvidia.com:/lust            
+                    re/fsw/portfolios/coreai/users/chenhany/experime            
+                    nts/cicd`                                                   
+[13:17:02] Launching job                                       experiment.py:800
+           Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0 for                     
+           experiment cicd                                                      
+           INFO     Launched app:                                launcher.py:116
+                    slurm_tunnel://nemo_run/12726736                            
+           Launching job                                       experiment.py:800
+           Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1 for                     
+           experiment cicd                                                      
+[SLURM] Job 12726736 - State: PENDING, Estimated start: N/A, Current time: 2026-06-11 13:17:02
+[13:17:03] INFO     Launched app:                                launcher.py:116
+                    slurm_tunnel://nemo_run/12726737                            
+────────────────── Detaching from Experiment cicd_1781183791. ──────────────────
+[13:17:03] Task specific cleanup won't be run.                experiment.py:1212
+           Ephemeral logs and artifacts may be lost.                            
+[SLURM] Job 12726737 - State: PENDING, Estimated start: N/A, Current time: 2026-06-11 13:17:03
+
+Experiment Status for cicd_1781183791
+
+Task 0: Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+- Status: PENDING
+- Executor: SlurmExecutor on chenhany@cw-dfw-cs-001-login-01.nvidia.com
+- Job id: 12726736
+- Local Directory: /tmp/pensieve-intern-agent-wsy7i9j7/workspace/experiments/cicd/cicd_1781183791/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+- Remote Directory: /lustre/fsw/portfolios/coreai/users/chenhany/experiments/cicd/cicd_1781183791/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+
+Task 1: Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+- Status: SUBMITTED
+- Executor: SlurmExecutor on chenhany@cw-dfw-cs-001-login-01.nvidia.com
+- Job id: 12726737
+- Local Directory: /tmp/pensieve-intern-agent-wsy7i9j7/workspace/experiments/cicd/cicd_1781183791/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+- Remote Directory: /lustre/fsw/portfolios/coreai/users/chenhany/experiments/cicd/cicd_1781183791/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+
+                                                                                
+# The experiment was run with the following tasks: ['Qwen3.5-4B_specdec_bench_df
+# You can inspect and reconstruct this experiment at a later point in time using
+experiment = run.Experiment.from_id("cicd_1781183791")                          
+experiment.status() # Gets the overall status                                   
+experiment.logs("Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0") # Gets the log f
+experiment.cancel("Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0") # Cancels the 
+                                                                                
+                                                                                
+# You can inspect this experiment at a later point in time using the CLI as well
+nemo experiment status cicd_1781183791                                          
+nemo experiment logs cicd_1781183791 0                                          
+nemo experiment cancel cicd_1781183791 0                                        
+                                                                                
+Found 1 experiment(s): cicd_1781183791
+
+=== [2026-06-11 13:17:11] Polling iteration 1/320 ===
+  cicd_1781183791 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: RUNNING
+  cicd_1781183791 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: PENDING
+
+  Summary: 0 succeeded, 0 failed, 0 cancelled, 1 running, 1 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:20:14] Polling iteration 2/320 ===
+  cicd_1781183791 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: FAILED
+  cicd_1781183791 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: CANCELLED
+
+  Summary: 0 succeeded, 1 failed, 1 cancelled, 0 running, 0 pending
+
+All experiments complete.
+  SUCCEEDED: 0
+  FAILED: 1
+  CANCELLED: 1
+
+=== Fetching experiment logs ===
+Fetching logs: cicd_1781183791 task 0
+Fetching logs: cicd_1781183791 task 1
+=== Done fetching logs ===
+warning: `VIRTUAL_ENV=/tmp/builds/YQxxH4yPp/0/omniml/integration/nmm-sandbox/.venv-intern-agent` does not match the project environment path `.venv` and will be ignored; use `--active` to target the active environment instead
+warning: No `requires-python` value found in the workspace. Defaulting to `>=3.12`.
+Configuring global options
+Dry run for task __main__:cicd
+Resolved Arguments
+┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Argument Name    ┃ Resolved Value                                            ┃
+┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ detach           │ True                                                      │
+│ hf_local         │ None                                                      │
+│ identity         │ '/.ssh/id_ed25519'                                        │
+│ job_dir          │ '/lustre/fsw/portfolios/coreai/users/chenhany/experiment… │
+│ job_name         │ 'Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3'              │
+│ pipeline         │ SandboxPipeline(                                          │
+│                  │   global_vars=GlobalVariables(hf_model='/hf-local/Qwen/Q… │
+│                  │   task_0=SandboxTask0(                                    │
+│                  │     script='common/specdec_bench/run.sh',                 │
+│                  │     slurm_config=SlurmConfig(                             │
+│                  │       host='cw-dfw-cs-001-login-01.nvidia.com',           │
+│                  │       account='coreai_dlalgo_modelopt',                   │
+│                  │       partition='batch',                                  │
+│                  │       container='vllm/vllm-openai:nightly',               │
+│                  │       modelopt_install_path='/usr/local/lib/python3.12/d… │
+│                  │       container_mounts=['/lustre/fsw/portfolios/coreai/p… │
+│                  │ '/lustre:/lustre', '/cm:/cm',                             │
+│                  │ '/var/run/munge:/var/run/munge'],                         │
+│                  │       srun_args=['--no-container-mount-home'],            │
+│                  │       array=None,                                         │
+│                  │       nodes=1,                                            │
+│                  │       ntasks_per_node=1,                                  │
+│                  │       gpus_per_node=2),                                   │
+│                  │     args=['--dataset speed', '--dataset_path              │
+│                  │ /hf-local/nvidia/SPEED-Bench-Internal/qualitative',       │
+│                  │ '--engine VLLM', '--speculative_algorithm DFLASH',        │
+│                  │ '--draft_length 3', '--block_size 4', '--draft_model_dir  │
+│                  │ /hf-local/z-lab/Qwen3.5-4B-DFlash', '--runtime_params     │
+│                  │ common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3… │
+│                  │ '--tp_size 2', '--ep_size 1', '--concurrency 32',         │
+│                  │ '--output_length 4096', '--aa_timing', '--show_progress', │
+│                  │ '--save_dir                                               │
+│                  │ /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/qualitative'], │
+│                  │     environment=[{'HF_MODEL_CKPT':                        │
+│                  │ '<<global_vars.hf_model>>'}, {'HF_LOCAL': '/hf-local'}]), │
+│                  │   task_1=SandboxTask1(                                    │
+│                  │     script='common/specdec_bench/run.sh',                 │
+│                  │     slurm_config=SlurmConfig(                             │
+│                  │       host='cw-dfw-cs-001-login-01.nvidia.com',           │
+│                  │       account='coreai_dlalgo_modelopt',                   │
+│                  │       partition='batch',                                  │
+│                  │       container='vllm/vllm-openai:nightly',               │
+│                  │       modelopt_install_path='/usr/local/lib/python3.12/d… │
+│                  │       container_mounts=['/lustre/fsw/portfolios/coreai/p… │
+│                  │ '/lustre:/lustre', '/cm:/cm',                             │
+│                  │ '/var/run/munge:/var/run/munge'],                         │
+│                  │       srun_args=['--no-container-mount-home'],            │
+│                  │       array=None,                                         │
+│                  │       nodes=1,                                            │
+│                  │       ntasks_per_node=1,                                  │
+│                  │       gpus_per_node=2),                                   │
+│                  │     args=['--dataset speed', '--dataset_path              │
+│                  │ /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k',    │
+│                  │ '--engine VLLM', '--speculative_algorithm DFLASH',        │
+│                  │ '--draft_length 3', '--block_size 4', '--draft_model_dir  │
+│                  │ /hf-local/z-lab/Qwen3.5-4B-DFlash', '--runtime_params     │
+│                  │ common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3… │
+│                  │ '--tp_size 2', '--ep_size 1', '--concurrency 8',          │
+│                  │ '--num_requests 80', '--output_length 4096',              │
+│                  │ '--aa_timing', '--show_progress', '--save_dir             │
+│                  │ /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/throughput_32… │
+│                  │     environment=[{'HF_MODEL_CKPT':                        │
+│                  │ '<<global_vars.hf_model>>'}, {'HF_LOCAL': '/hf-local'}])) │
+│ task             │ None                                                      │
+│ test_level       │ 0                                                         │
+│ user             │ 'chenhany'                                                │
+└──────────────────┴───────────────────────────────────────────────────────────┘
+Launching cicd...
+============================================================
+Version Report
+============================================================
+  Launcher                       e916b41      (main)
+  Model-Optimizer                16d562a0     (pensieve-intern/OMNIML-4961/cell-t1-d3)
+============================================================
+────────────── Entering Experiment cicd with id: cicd_1781184058 ───────────────
+job Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3 task 0 slurm_config: SlurmConfig(host='cw-dfw-cs-001-login-01.nvidia.com', port=22, account='coreai_dlalgo_modelopt', partition='batch', qos=None, container='vllm/vllm-openai:nightly', modelopt_install_path='/usr/local/lib/python3.12/dist-packages/modelopt', container_mounts=['/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local:/hf-local', '/lustre:/lustre', '/cm:/cm', '/var/run/munge:/var/run/munge'], srun_args=['--no-container-mount-home'], array=None, nodes=1, ntasks_per_node=1, gpus_per_node=2, time='04:00:00', local=False)
+job Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3 task 1 slurm_config: SlurmConfig(host='cw-dfw-cs-001-login-01.nvidia.com', port=22, account='coreai_dlalgo_modelopt', partition='batch', qos=None, container='vllm/vllm-openai:nightly', modelopt_install_path='/usr/local/lib/python3.12/dist-packages/modelopt', container_mounts=['/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local:/hf-local', '/lustre:/lustre', '/cm:/cm', '/var/run/munge:/var/run/munge'], srun_args=['--no-container-mount-home'], array=None, nodes=1, ntasks_per_node=1, gpus_per_node=2, time='04:00:00', local=False)
+find: ‘modules/Megatron-LM/megatron/*’: No such file or directory
+find: ‘modules/Megatron-LM/examples/*’: No such file or directory
+find: ‘modules/Megatron-LM/*.py’: No such file or directory
+find: ‘modules/Model-Optimizer-Internal/**’: No such file or directory
+find: ‘modules/Megatron-LM/megatron/*’: No such file or directory
+find: ‘modules/Megatron-LM/examples/*’: No such file or directory
+find: ‘modules/Megatron-LM/*.py’: No such file or directory
+find: ‘modules/Model-Optimizer-Internal/**’: No such file or directory
+[13:21:04] Connecting to                                           client.py:257
+           chenhany@cw-dfw-cs-001-login-01.nvidia.com                           
+[13:21:04] INFO     Connected (version 2.0, client             transport.py:1786
+                    OpenSSH_8.9p1)                                              
+           INFO     Authentication (publickey) successful!     transport.py:1786
+           INFO     rsyncing                                         rsync.py:37
+                    /tmp/pensieve-intern-agent-wsy7i9j7/workspace/ex            
+                    periments/cicd/cicd_1781184058 to                           
+                    /lustre/fsw/portfolios/coreai/users/chenhany/exp            
+                    eriments/cicd ...                                           
+[13:21:31] INFO     Successfully ran `rsync  -pthrvz  --rsh='ssh -i  rsync.py:93
+                    /.ssh/id_ed25519 -p 22 '                                    
+                    /tmp/pensieve-intern-agent-wsy7i9j7/workspace/ex            
+                    periments/cicd/cicd_1781184058                              
+                    chenhany@cw-dfw-cs-001-login-01.nvidia.com:/lust            
+                    re/fsw/portfolios/coreai/users/chenhany/experime            
+                    nts/cicd`                                                   
+[13:21:31] Launching job                                       experiment.py:800
+           Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0 for                     
+           experiment cicd                                                      
+           INFO     Launched app:                                launcher.py:116
+                    slurm_tunnel://nemo_run/12726765                            
+           Launching job                                       experiment.py:800
+           Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1 for                     
+           experiment cicd                                                      
+[13:21:32] INFO     Launched app:                                launcher.py:116
+                    slurm_tunnel://nemo_run/12726766                            
+────────────────── Detaching from Experiment cicd_1781184058. ──────────────────
+[13:21:32] Task specific cleanup won't be run.                experiment.py:1212
+           Ephemeral logs and artifacts may be lost.                            
+[SLURM] Job 12726765 - State: PENDING, Estimated start: N/A, Current time: 2026-06-11 13:21:32
+[SLURM] Job 12726766 - State: PENDING, Estimated start: N/A, Current time: 2026-06-11 13:21:32
+
+Experiment Status for cicd_1781184058
+
+Task 0: Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+- Status: PENDING
+- Executor: SlurmExecutor on chenhany@cw-dfw-cs-001-login-01.nvidia.com
+- Job id: 12726765
+- Local Directory: /tmp/pensieve-intern-agent-wsy7i9j7/workspace/experiments/cicd/cicd_1781184058/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+- Remote Directory: /lustre/fsw/portfolios/coreai/users/chenhany/experiments/cicd/cicd_1781184058/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+
+Task 1: Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+- Status: SUBMITTED
+- Executor: SlurmExecutor on chenhany@cw-dfw-cs-001-login-01.nvidia.com
+- Job id: 12726766
+- Local Directory: /tmp/pensieve-intern-agent-wsy7i9j7/workspace/experiments/cicd/cicd_1781184058/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+- Remote Directory: /lustre/fsw/portfolios/coreai/users/chenhany/experiments/cicd/cicd_1781184058/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+
+                                                                                
+# The experiment was run with the following tasks: ['Qwen3.5-4B_specdec_bench_df
+# You can inspect and reconstruct this experiment at a later point in time using
+experiment = run.Experiment.from_id("cicd_1781184058")                          
+experiment.status() # Gets the overall status                                   
+experiment.logs("Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0") # Gets the log f
+experiment.cancel("Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0") # Cancels the 
+                                                                                
+                                                                                
+# You can inspect this experiment at a later point in time using the CLI as well
+nemo experiment status cicd_1781184058                                          
+nemo experiment logs cicd_1781184058 0                                          
+nemo experiment cancel cicd_1781184058 0                                        
+                                                                                
+Found 1 experiment(s): cicd_1781184058
+
+=== [2026-06-11 13:21:39] Polling iteration 1/320 ===
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: RUNNING
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: PENDING
+
+  Summary: 0 succeeded, 0 failed, 0 cancelled, 1 running, 1 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:24:42] Polling iteration 2/320 ===
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: RUNNING
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: PENDING
+
+  Summary: 0 succeeded, 0 failed, 0 cancelled, 1 running, 1 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:27:44] Polling iteration 3/320 ===
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: RUNNING
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: PENDING
+
+  Summary: 0 succeeded, 0 failed, 0 cancelled, 1 running, 1 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:30:47] Polling iteration 4/320 ===
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: SUCCEEDED
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: RUNNING
+
+  Summary: 1 succeeded, 0 failed, 0 cancelled, 1 running, 0 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:33:49] Polling iteration 5/320 ===
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: SUCCEEDED
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: RUNNING
+
+  Summary: 1 succeeded, 0 failed, 0 cancelled, 1 running, 0 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:36:52] Polling iteration 6/320 ===
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: SUCCEEDED
+  cicd_1781184058 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: FAILED
+
+  Summary: 1 succeeded, 1 failed, 0 cancelled, 0 running, 0 pending
+
+All experiments complete.
+  SUCCEEDED: 1
+  FAILED: 1
+  CANCELLED: 0
+
+=== Fetching experiment logs ===
+Fetching logs: cicd_1781184058 task 0
+Fetching logs: cicd_1781184058 task 1
+=== Done fetching logs ===
+warning: `VIRTUAL_ENV=/tmp/builds/YQxxH4yPp/0/omniml/integration/nmm-sandbox/.venv-intern-agent` does not match the project environment path `.venv` and will be ignored; use `--active` to target the active environment instead
+warning: No `requires-python` value found in the workspace. Defaulting to `>=3.12`.
+Configuring global options
+Dry run for task __main__:cicd
+Resolved Arguments
+┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Argument Name    ┃ Resolved Value                                            ┃
+┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+│ detach           │ True                                                      │
+│ hf_local         │ None                                                      │
+│ identity         │ '/.ssh/id_ed25519'                                        │
+│ job_dir          │ '/lustre/fsw/portfolios/coreai/users/chenhany/experiment… │
+│ job_name         │ 'Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3'              │
+│ pipeline         │ SandboxPipeline(                                          │
+│                  │   global_vars=GlobalVariables(hf_model='/hf-local/Qwen/Q… │
+│                  │   task_0=SandboxTask0(                                    │
+│                  │     script='common/specdec_bench/run.sh',                 │
+│                  │     slurm_config=SlurmConfig(                             │
+│                  │       host='cw-dfw-cs-001-login-01.nvidia.com',           │
+│                  │       account='coreai_dlalgo_modelopt',                   │
+│                  │       partition='batch',                                  │
+│                  │       container='vllm/vllm-openai:nightly',               │
+│                  │       modelopt_install_path='/usr/local/lib/python3.12/d… │
+│                  │       container_mounts=['/lustre/fsw/portfolios/coreai/p… │
+│                  │ '/lustre:/lustre', '/cm:/cm',                             │
+│                  │ '/var/run/munge:/var/run/munge'],                         │
+│                  │       srun_args=['--no-container-mount-home'],            │
+│                  │       array=None,                                         │
+│                  │       nodes=1,                                            │
+│                  │       ntasks_per_node=1,                                  │
+│                  │       gpus_per_node=2),                                   │
+│                  │     args=['--dataset speed', '--dataset_path              │
+│                  │ /hf-local/nvidia/SPEED-Bench-Internal/qualitative',       │
+│                  │ '--engine VLLM', '--speculative_algorithm DFLASH',        │
+│                  │ '--draft_length 3', '--block_size 4', '--draft_model_dir  │
+│                  │ /hf-local/z-lab/Qwen3.5-4B-DFlash', '--runtime_params     │
+│                  │ common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3… │
+│                  │ '--tp_size 2', '--ep_size 1', '--concurrency 32',         │
+│                  │ '--output_length 4096', '--aa_timing', '--show_progress', │
+│                  │ '--save_dir                                               │
+│                  │ /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/qualitative'], │
+│                  │     environment=[{'HF_MODEL_CKPT':                        │
+│                  │ '<<global_vars.hf_model>>'}, {'HF_LOCAL': '/hf-local'}]), │
+│                  │   task_1=SandboxTask1(                                    │
+│                  │     script='common/specdec_bench/run.sh',                 │
+│                  │     slurm_config=SlurmConfig(                             │
+│                  │       host='cw-dfw-cs-001-login-01.nvidia.com',           │
+│                  │       account='coreai_dlalgo_modelopt',                   │
+│                  │       partition='batch',                                  │
+│                  │       container='vllm/vllm-openai:nightly',               │
+│                  │       modelopt_install_path='/usr/local/lib/python3.12/d… │
+│                  │       container_mounts=['/lustre/fsw/portfolios/coreai/p… │
+│                  │ '/lustre:/lustre', '/cm:/cm',                             │
+│                  │ '/var/run/munge:/var/run/munge'],                         │
+│                  │       srun_args=['--no-container-mount-home'],            │
+│                  │       array=None,                                         │
+│                  │       nodes=1,                                            │
+│                  │       ntasks_per_node=1,                                  │
+│                  │       gpus_per_node=2),                                   │
+│                  │     args=['--dataset speed', '--dataset_path              │
+│                  │ /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k',    │
+│                  │ '--engine VLLM', '--speculative_algorithm DFLASH',        │
+│                  │ '--draft_length 3', '--block_size 4', '--draft_model_dir  │
+│                  │ /hf-local/z-lab/Qwen3.5-4B-DFlash', '--runtime_params     │
+│                  │ common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3… │
+│                  │ '--tp_size 2', '--ep_size 1', '--concurrency 8',          │
+│                  │ '--num_requests 80', '--output_length 4096',              │
+│                  │ '--aa_timing', '--show_progress', '--save_dir             │
+│                  │ /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/throughput_32… │
+│                  │     environment=[{'HF_MODEL_CKPT':                        │
+│                  │ '<<global_vars.hf_model>>'}, {'HF_LOCAL': '/hf-local'}])) │
+│ task             │ None                                                      │
+│ test_level       │ 0                                                         │
+│ user             │ 'chenhany'                                                │
+└──────────────────┴───────────────────────────────────────────────────────────┘
+Launching cicd...
+============================================================
+Version Report
+============================================================
+  Launcher                       e916b41      (main)
+  Model-Optimizer                16d562a0     (pensieve-intern/OMNIML-4961/cell-t1-d3)
+============================================================
+────────────── Entering Experiment cicd with id: cicd_1781185067 ───────────────
+job Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3 task 0 slurm_config: SlurmConfig(host='cw-dfw-cs-001-login-01.nvidia.com', port=22, account='coreai_dlalgo_modelopt', partition='batch', qos=None, container='vllm/vllm-openai:nightly', modelopt_install_path='/usr/local/lib/python3.12/dist-packages/modelopt', container_mounts=['/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local:/hf-local', '/lustre:/lustre', '/cm:/cm', '/var/run/munge:/var/run/munge'], srun_args=['--no-container-mount-home'], array=None, nodes=1, ntasks_per_node=1, gpus_per_node=2, time='04:00:00', local=False)
+job Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3 task 1 slurm_config: SlurmConfig(host='cw-dfw-cs-001-login-01.nvidia.com', port=22, account='coreai_dlalgo_modelopt', partition='batch', qos=None, container='vllm/vllm-openai:nightly', modelopt_install_path='/usr/local/lib/python3.12/dist-packages/modelopt', container_mounts=['/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_modelopt/hf-local:/hf-local', '/lustre:/lustre', '/cm:/cm', '/var/run/munge:/var/run/munge'], srun_args=['--no-container-mount-home'], array=None, nodes=1, ntasks_per_node=1, gpus_per_node=2, time='04:00:00', local=False)
+find: ‘modules/Megatron-LM/megatron/*’: No such file or directory
+find: ‘modules/Megatron-LM/examples/*’: No such file or directory
+find: ‘modules/Megatron-LM/*.py’: No such file or directory
+find: ‘modules/Model-Optimizer-Internal/**’: No such file or directory
+find: ‘modules/Megatron-LM/megatron/*’: No such file or directory
+find: ‘modules/Megatron-LM/examples/*’: No such file or directory
+find: ‘modules/Megatron-LM/*.py’: No such file or directory
+find: ‘modules/Model-Optimizer-Internal/**’: No such file or directory
+[13:37:53] Connecting to                                           client.py:257
+           chenhany@cw-dfw-cs-001-login-01.nvidia.com                           
+[13:37:53] INFO     Connected (version 2.0, client             transport.py:1786
+                    OpenSSH_8.9p1)                                              
+           INFO     Authentication (publickey) successful!     transport.py:1786
+           INFO     rsyncing                                         rsync.py:37
+                    /tmp/pensieve-intern-agent-wsy7i9j7/workspace/ex            
+                    periments/cicd/cicd_1781185067 to                           
+                    /lustre/fsw/portfolios/coreai/users/chenhany/exp            
+                    eriments/cicd ...                                           
+[13:38:17] INFO     Successfully ran `rsync  -pthrvz  --rsh='ssh -i  rsync.py:93
+                    /.ssh/id_ed25519 -p 22 '                                    
+                    /tmp/pensieve-intern-agent-wsy7i9j7/workspace/ex            
+                    periments/cicd/cicd_1781185067                              
+                    chenhany@cw-dfw-cs-001-login-01.nvidia.com:/lust            
+                    re/fsw/portfolios/coreai/users/chenhany/experime            
+                    nts/cicd`                                                   
+[13:38:17] Launching job                                       experiment.py:800
+           Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0 for                     
+           experiment cicd                                                      
+           INFO     Launched app:                                launcher.py:116
+                    slurm_tunnel://nemo_run/12726980                            
+           Launching job                                       experiment.py:800
+           Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1 for                     
+           experiment cicd                                                      
+[SLURM] Job 12726980 - State: PENDING, Estimated start: N/A, Current time: 2026-06-11 13:38:18
+[13:38:18] INFO     Launched app:                                launcher.py:116
+                    slurm_tunnel://nemo_run/12726981                            
+────────────────── Detaching from Experiment cicd_1781185067. ──────────────────
+[13:38:18] Task specific cleanup won't be run.                experiment.py:1212
+           Ephemeral logs and artifacts may be lost.                            
+[SLURM] Job 12726981 - State: PENDING, Estimated start: N/A, Current time: 2026-06-11 13:38:18
+
+Experiment Status for cicd_1781185067
+
+Task 0: Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+- Status: SUBMITTED
+- Executor: SlurmExecutor on chenhany@cw-dfw-cs-001-login-01.nvidia.com
+- Job id: 12726980
+- Local Directory: /tmp/pensieve-intern-agent-wsy7i9j7/workspace/experiments/cicd/cicd_1781185067/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+- Remote Directory: /lustre/fsw/portfolios/coreai/users/chenhany/experiments/cicd/cicd_1781185067/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0
+
+Task 1: Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+- Status: SUBMITTED
+- Executor: SlurmExecutor on chenhany@cw-dfw-cs-001-login-01.nvidia.com
+- Job id: 12726981
+- Local Directory: /tmp/pensieve-intern-agent-wsy7i9j7/workspace/experiments/cicd/cicd_1781185067/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+- Remote Directory: /lustre/fsw/portfolios/coreai/users/chenhany/experiments/cicd/cicd_1781185067/Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1
+
+                                                                                
+# The experiment was run with the following tasks: ['Qwen3.5-4B_specdec_bench_df
+# You can inspect and reconstruct this experiment at a later point in time using
+experiment = run.Experiment.from_id("cicd_1781185067")                          
+experiment.status() # Gets the overall status                                   
+experiment.logs("Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0") # Gets the log f
+experiment.cancel("Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0") # Cancels the 
+                                                                                
+                                                                                
+# You can inspect this experiment at a later point in time using the CLI as well
+nemo experiment status cicd_1781185067                                          
+nemo experiment logs cicd_1781185067 0                                          
+nemo experiment cancel cicd_1781185067 0                                        
+                                                                                
+Found 1 experiment(s): cicd_1781185067
+
+=== [2026-06-11 13:38:25] Polling iteration 1/320 ===
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: PENDING
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: PENDING
+
+  Summary: 0 succeeded, 0 failed, 0 cancelled, 0 running, 2 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:41:28] Polling iteration 2/320 ===
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: RUNNING
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: PENDING
+
+  Summary: 0 succeeded, 0 failed, 0 cancelled, 1 running, 1 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:44:30] Polling iteration 3/320 ===
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: RUNNING
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: PENDING
+
+  Summary: 0 succeeded, 0 failed, 0 cancelled, 1 running, 1 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:47:33] Polling iteration 4/320 ===
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: SUCCEEDED
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: RUNNING
+
+  Summary: 1 succeeded, 0 failed, 0 cancelled, 1 running, 0 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:50:35] Polling iteration 5/320 ===
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: SUCCEEDED
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: RUNNING
+
+  Summary: 1 succeeded, 0 failed, 0 cancelled, 1 running, 0 pending
+Waiting 180s before next poll...
+
+=== [2026-06-11 13:53:38] Polling iteration 6/320 ===
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_0: SUCCEEDED
+  cicd_1781185067 / Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3_1: SUCCEEDED
+
+  Summary: 2 succeeded, 0 failed, 0 cancelled, 0 running, 0 pending
+
+All experiments complete.
+  SUCCEEDED: 2
+  FAILED: 0
+  CANCELLED: 0
+
+=== Fetching experiment logs ===
+Fetching logs: cicd_1781185067 task 0
+Fetching logs: cicd_1781185067 task 1
+=== Done fetching logs ===
diff --git a/metrics_output.log b/metrics_output.log
new file mode 100644
index 00000000000..12a766c4098
--- /dev/null
+++ b/metrics_output.log
@@ -0,0 +1,16 @@
+qualitative Average_AL= 1.34
+qualitative Category_AL coding = 1.2781
+qualitative Category_AL humanities = 1.3442
+qualitative Category_AL math = 1.4108
+qualitative Category_AL multilingual = 1.3429
+qualitative Category_AL qa = 1.3675
+qualitative Category_AL rag = 1.3815
+qualitative Category_AL reasoning = 1.2566
+qualitative Category_AL roleplay = 1.2802
+qualitative Category_AL stem = 1.3352
+qualitative Category_AL summarization = 1.3883
+qualitative Category_AL writing = 1.3549
+throughput_32k Average_AL= 1.3651
+throughput_32k Category_AL high_entropy = 1.3702
+throughput_32k Category_AL low_entropy = 1.3063
+throughput_32k Category_AL mixed = 1.4167
diff --git a/tools/launcher/common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3.yaml b/tools/launcher/common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3.yaml
new file mode 100644
index 00000000000..b1689e56f62
--- /dev/null
+++ b/tools/launcher/common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3.yaml
@@ -0,0 +1,4 @@
+sampling_kwargs:  # sampling settings for this cell; this file is handed to run.sh via --runtime_params
+  temperature: 1
+engine_args:  # NOTE(review): presumably forwarded to the vLLM engine constructor — confirm in run.sh
+  max_model_len: 65536  # NOTE(review): presumably sized for throughput_32k prompts plus --output_length 4096 — confirm
diff --git a/tools/launcher/examples/Qwen3.5/Qwen3.5-4B/specdec_bench_dflash_vllm_t1_d3.yaml b/tools/launcher/examples/Qwen3.5/Qwen3.5-4B/specdec_bench_dflash_vllm_t1_d3.yaml
new file mode 100644
index 00000000000..7e34d38df7f
--- /dev/null
+++ b/tools/launcher/examples/Qwen3.5/Qwen3.5-4B/specdec_bench_dflash_vllm_t1_d3.yaml
@@ -0,0 +1,73 @@
+# SPEED-Bench DFlash speculative-decoding run for Qwen3.5-4B via vLLM (temperature 1, draft length 3).
+#
+# The qwen3_5 model_type needs recent transformers/vLLM support, and DFlash
+# requires a vLLM build whose speculative_config accepts method=dflash.
+# Use vllm/vllm-openai:nightly for this cell.
+#
+# Slurm run on cw_dfw:
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen3.5/Qwen3.5-4B/specdec_bench_dflash_vllm_t1_d3.yaml --yes detach=true
+
+job_name: Qwen3.5-4B_specdec_bench_dflash_vllm_t1_d3
+
+pipeline:
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3.5-4B  # referenced as <<global_vars.hf_model>> by HF_MODEL_CKPT in both tasks
+
+  # task_0: runs common/specdec_bench/run.sh against the SPEED "qualitative" split.
+  task_0:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
+      - --engine VLLM
+      - --speculative_algorithm DFLASH
+      - --draft_length 3  # presumably the "d3" in the job name
+      - --block_size 4  # NOTE(review): equals draft_length + 1 — presumably a DFlash requirement; confirm
+      - --draft_model_dir /hf-local/z-lab/Qwen3.5-4B-DFlash
+      - --runtime_params common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3.yaml
+      - --tp_size 2  # same value as gpus_per_node below
+      - --ep_size 1
+      - --concurrency 32
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/qualitative
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>  # placeholder for pipeline.global_vars.hf_model
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 2
+      container: vllm/vllm-openai:nightly  # NOTE(review): moving tag; pin a digest for reproducible numbers
+
+  # task_1: runs common/specdec_bench/run.sh against the SPEED "throughput_32k" split.
+  task_1:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
+      - --engine VLLM
+      - --speculative_algorithm DFLASH
+      - --draft_length 3  # presumably the "d3" in the job name
+      - --block_size 4  # NOTE(review): equals draft_length + 1 — presumably a DFlash requirement; confirm
+      - --draft_model_dir /hf-local/z-lab/Qwen3.5-4B-DFlash
+      - --runtime_params common/specdec_bench/_cells/Qwen3.5-4B_dflash_vllm_t1_d3.yaml
+      - --tp_size 2  # same value as gpus_per_node below
+      - --ep_size 1
+      - --concurrency 8  # task_0 uses 32
+      - --num_requests 80  # task_0 passes no --num_requests
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/Qwen3.5-4B_dflash_vllm_t1_d3/throughput_32k
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>  # placeholder for pipeline.global_vars.hf_model
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 2
+      container: vllm/vllm-openai:nightly  # NOTE(review): moving tag; pin a digest for reproducible numbers