From 404a5fb38a94b4862ce2a79c3915009757a7ad56 Mon Sep 17 00:00:00 2001 From: Pensieve Intern Date: Thu, 11 Jun 2026 23:13:29 +0000 Subject: [PATCH 1/5] Add SPEED-bench gemma MTP vLLM t0_d7 cell Signed-off-by: Pensieve Intern --- .../specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml diff --git a/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml b/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml new file mode 100644 index 00000000000..fcf893c989e --- /dev/null +++ b/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml @@ -0,0 +1,4 @@ +sampling_kwargs: + temperature: 0 +engine_args: + max_model_len: 40960 From e6aaec396eefaafbcb544f523700705887792698 Mon Sep 17 00:00:00 2001 From: Pensieve Intern Date: Thu, 11 Jun 2026 23:21:49 +0000 Subject: [PATCH 2/5] Set hybrid KV cache workaround for gemma cell Signed-off-by: Pensieve Intern --- .../specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml b/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml index fcf893c989e..e271ed59308 100644 --- a/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml +++ b/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml @@ -2,3 +2,4 @@ sampling_kwargs: temperature: 0 engine_args: max_model_len: 40960 + disable_hybrid_kv_cache_manager: true From fe757c2f6e3f4f9319da5a3984c26f42f71adb1a Mon Sep 17 00:00:00 2001 From: Pensieve Intern Date: Thu, 11 Jun 2026 23:22:11 +0000 Subject: [PATCH 3/5] Forward vLLM hybrid KV cache option Signed-off-by: Pensieve Intern --- examples/specdec_bench/specdec_bench/models/vllm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/specdec_bench/specdec_bench/models/vllm.py b/examples/specdec_bench/specdec_bench/models/vllm.py index 24062399cb8..5599fe66082 100644 --- a/examples/specdec_bench/specdec_bench/models/vllm.py +++ b/examples/specdec_bench/specdec_bench/models/vllm.py @@ -135,6 +135,7 @@ def __init__(self, model_dir, max_concurrent_requests, sampling_kwargs, **kwargs async_scheduling=kwargs.get("async_scheduling", True), enforce_eager=False, max_model_len=kwargs.get("max_model_len"), + disable_hybrid_kv_cache_manager=kwargs.get("disable_hybrid_kv_cache_manager", False), ) self.engine_args = engine_args self.model = AsyncLLM.from_engine_args(engine_args) From 3b65009f792f7bf7ba1241d15f98da5a595c1eb3 Mon Sep 17 00:00:00 2001 From: Pensieve Intern Date: Thu, 11 Jun 2026 23:30:04 +0000 Subject: [PATCH 4/5] Allow eager mode for vLLM specbench cells Signed-off-by: Pensieve Intern --- examples/specdec_bench/specdec_bench/models/vllm.py | 2 +- .../specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/specdec_bench/specdec_bench/models/vllm.py b/examples/specdec_bench/specdec_bench/models/vllm.py index 5599fe66082..8a0ea797267 100644 --- a/examples/specdec_bench/specdec_bench/models/vllm.py +++ b/examples/specdec_bench/specdec_bench/models/vllm.py @@ -133,7 +133,7 @@ def __init__(self, model_dir, max_concurrent_requests, sampling_kwargs, **kwargs max_num_seqs=max_concurrent_requests * num_speculative_tokens, skip_tokenizer_init=False, async_scheduling=kwargs.get("async_scheduling", True), - enforce_eager=False, + enforce_eager=kwargs.get("enforce_eager", False), max_model_len=kwargs.get("max_model_len"), disable_hybrid_kv_cache_manager=kwargs.get("disable_hybrid_kv_cache_manager", False), ) diff --git a/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml b/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml index e271ed59308..25bd68297fe 100644 --- a/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml +++ b/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml @@ -3,3 +3,4 @@ sampling_kwargs: engine_args: max_model_len: 40960 disable_hybrid_kv_cache_manager: true + enforce_eager: true From f8d942c8eaa6192212a5158a174254f7f133b6fb Mon Sep 17 00:00:00 2001 From: Pensieve Intern Date: Thu, 11 Jun 2026 23:38:06 +0000 Subject: [PATCH 5/5] Restore cell-only SPEED-bench diff Signed-off-by: Pensieve Intern --- examples/specdec_bench/specdec_bench/models/vllm.py | 3 +-- .../specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/specdec_bench/specdec_bench/models/vllm.py b/examples/specdec_bench/specdec_bench/models/vllm.py index 8a0ea797267..24062399cb8 100644 --- a/examples/specdec_bench/specdec_bench/models/vllm.py +++ b/examples/specdec_bench/specdec_bench/models/vllm.py @@ -133,9 +133,8 @@ def __init__(self, model_dir, max_concurrent_requests, sampling_kwargs, **kwargs max_num_seqs=max_concurrent_requests * num_speculative_tokens, skip_tokenizer_init=False, async_scheduling=kwargs.get("async_scheduling", True), - enforce_eager=kwargs.get("enforce_eager", False), + enforce_eager=False, max_model_len=kwargs.get("max_model_len"), - disable_hybrid_kv_cache_manager=kwargs.get("disable_hybrid_kv_cache_manager", False), ) self.engine_args = engine_args self.model = AsyncLLM.from_engine_args(engine_args) diff --git a/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml b/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml index 25bd68297fe..fcf893c989e 100644 --- a/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml +++ b/tools/launcher/common/specdec_bench/_cells/gemma-4-E4B-it_mtp_vllm_t0_d7.yaml @@ -2,5 +2,3 @@ sampling_kwargs: temperature: 0 engine_args: max_model_len: 40960 - disable_hybrid_kv_cache_manager: true - enforce_eager: true