From 9ed8e5e721ccf80cdc6ae5f800bc05b4522c27c0 Mon Sep 17 00:00:00 2001 From: HelloItMeMort <> Date: Thu, 4 Jun 2026 22:30:12 -0500 Subject: [PATCH 1/2] feat: upgrade deprecated speculative decoding runtime flags for llama.cpp Remove the deprecated --draft, --draft-n, --draft-max and --draft-min flags from the allowlist and add the new --spec-draft-* and --spec-ngram-* flags introduced in recent llama.cpp versions. --draft-n-min and --draft-p-min remain allowed as aliases of --spec-draft-n-min and --spec-draft-p-min. Changes: - Remove deprecated --draft, --draft-n, --draft-max, --draft-min - Add --spec-draft-* flags for draft model CPU control (threads, affinity, priority, polling) - Add --spec-draft-* flags for draft model GPU/device control (override-tensor, cpu-moe) - Add --spec-draft-n-max, --spec-draft-n-min for draft token counts - Add --spec-draft-p-split, --spec-draft-p-min for draft probability thresholds - Add --spec-draft-backend-sampling/--no-spec-draft-backend-sampling - Add --spec-draft-device/-devd, --spec-draft-ngl/-ngld for draft GPU offloading - Add --spec-draft-type-k/-ctkd, --spec-draft-type-v/-ctvd for draft KV cache types - Do not allowlist --spec-draft-model/-md: it takes a file path and this section is limited to safe, non-path flags - Add --spec-type for selecting speculative decoding method - Add --spec-ngram-* flags for all ngram-based speculative decoding variants Based on llama.cpp server docs: https://github.com/ggml-org/llama.cpp/tree/master/tools/server --- pkg/inference/runtime_flags_allowlist.go | 45 +++++++++++++++---- pkg/inference/runtime_flags_allowlist_test.go | 11 ++++- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/pkg/inference/runtime_flags_allowlist.go b/pkg/inference/runtime_flags_allowlist.go index 3b47a43ac..77721a95e 100644 --- a/pkg/inference/runtime_flags_allowlist.go +++ b/pkg/inference/runtime_flags_allowlist.go @@ -114,14 +114,43 @@ var LlamaCppAllowedFlags = map[string]bool{ "--spm-infill": true, // Speculative decoding (safe flags only - no file paths) - "--draft": true, "--draft-n": true, "--draft-max": true, - "--draft-min": true, "--draft-n-min": true, - "--draft-p-min": true, - "-cd": true, 
"--ctx-size-draft": true, - "-devd": true, "--device-draft": true, - "-ngld": true, "--gpu-layers-draft": true, "--n-gpu-layers-draft": true, - "-td": true, "--threads-draft": true, - "-tbd": true, "--threads-batch-draft": true, + "--spec-draft-threads": true, "-td": true, "--threads-draft": true, + "--spec-draft-threads-batch": true, "-tbd": true, "--threads-batch-draft": true, + "--spec-draft-cpu-mask": true, "-Cd": true, "--cpu-mask-draft": true, + "--spec-draft-cpu-range": true, "-Crd": true, "--cpu-range-draft": true, + "--spec-draft-cpu-strict": true, "--cpu-strict-draft": true, + "--spec-draft-prio": true, "--prio-draft": true, + "--spec-draft-poll": true, "--poll-draft": true, + "--spec-draft-cpu-mask-batch": true, "-Cbd": true, "--cpu-mask-batch-draft": true, + "--spec-draft-cpu-strict-batch": true, "--cpu-strict-batch-draft": true, + "--spec-draft-prio-batch": true, "--prio-batch-draft": true, + "--spec-draft-poll-batch": true, "--poll-batch-draft": true, + "--spec-draft-override-tensor": true, "-otd": true, "--override-tensor-draft": true, + "--spec-draft-cpu-moe": true, "-cmoed": true, "--cpu-moe-draft": true, + "--spec-draft-n-cpu-moe": true, "--spec-draft-ncmoe": true, "-ncmoed": true, "--n-cpu-moe-draft": true, + "--spec-draft-n-max": true, "--draft-n-max": true, + "--spec-draft-n-min": true, "--draft-n-min": true, + "--spec-draft-p-split": true, "--draft-p-split": true, + "--spec-draft-p-min": true, "--draft-p-min": true, + "--spec-draft-backend-sampling": true, "--no-spec-draft-backend-sampling": true, + "--spec-draft-device": true, "-devd": true, "--device-draft": true, + "--spec-draft-ngl": true, "-ngld": true, "--gpu-layers-draft": true, "--n-gpu-layers-draft": true, + "--spec-type": true, + "--spec-ngram-mod-n-min": true, + "--spec-ngram-mod-n-max": true, + "--spec-ngram-mod-n-match": true, + "--spec-ngram-simple-size-n": true, + "--spec-ngram-simple-size-m": true, + "--spec-ngram-simple-min-hits": true, + "--spec-ngram-map-k-size-n": true, + 
"--spec-ngram-map-k-size-m": true, + "--spec-ngram-map-k-min-hits": true, + "--spec-ngram-map-k4v-size-n": true, + "--spec-ngram-map-k4v-size-m": true, + "--spec-ngram-map-k4v-min-hits": true, + "--spec-draft-type-k": true, "-ctkd": true, "--cache-type-k-draft": true, + "--spec-draft-type-v": true, "-ctvd": true, "--cache-type-v-draft": true, + "-cd": true, "--ctx-size-draft": true, // LoRA (safe flags only - no file paths) "--lora-init-without-apply": true, diff --git a/pkg/inference/runtime_flags_allowlist_test.go b/pkg/inference/runtime_flags_allowlist_test.go index 6384ea433..cfbc4c677 100644 --- a/pkg/inference/runtime_flags_allowlist_test.go +++ b/pkg/inference/runtime_flags_allowlist_test.go @@ -162,9 +162,16 @@ func TestLlamaCppAllowedFlags_Categories(t *testing.T) { "--metrics", "--no-metrics", "--jinja", "--no-jinja", }, "speculative": { - "--draft", "--draft-max", "--draft-min", + "--spec-draft-n-max", "--draft-n-max", "--spec-draft-n-min", "--draft-n-min", + "--spec-draft-p-min", "--draft-p-min", + "--spec-draft-p-split", "--draft-p-split", + "--spec-draft-threads", "-td", "--threads-draft", + "--spec-draft-ngl", "-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft", + "--spec-draft-device", "-devd", "--device-draft", + "--spec-type", "-cd", "--ctx-size-draft", - "-ngld", "--gpu-layers-draft", + "--spec-draft-type-k", "-ctkd", "--cache-type-k-draft", + "--spec-draft-type-v", "-ctvd", "--cache-type-v-draft", }, } From 60e199a863a3529c743f51081f40dd0973c955e3 Mon Sep 17 00:00:00 2001 From: HelloItMeMort <> Date: Fri, 5 Jun 2026 09:51:56 -0500 Subject: [PATCH 2/2] feat: restore legacy backward-compatible speculative decoding flags Restore --draft-max and --draft-min to the allowlist as backward-compatible aliases of the new --spec-draft-n-max and --spec-draft-n-min names. Also switch the llama.cpp backend to emit the canonical --spec-draft-n-max and --spec-draft-p-min flags instead of --draft-max and --draft-p-min. 
--- pkg/inference/backends/llamacpp/llamacpp.go | 4 ++-- pkg/inference/runtime_flags_allowlist.go | 7 +++++-- pkg/inference/runtime_flags_allowlist_test.go | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pkg/inference/backends/llamacpp/llamacpp.go b/pkg/inference/backends/llamacpp/llamacpp.go index 3bccff740..f5ffef771 100644 --- a/pkg/inference/backends/llamacpp/llamacpp.go +++ b/pkg/inference/backends/llamacpp/llamacpp.go @@ -181,10 +181,10 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode if draftPath != "" { args = append(args, "--model-draft", draftPath) if config.Speculative.NumTokens > 0 { - args = append(args, "--draft-max", strconv.Itoa(config.Speculative.NumTokens)) + args = append(args, "--spec-draft-n-max", strconv.Itoa(config.Speculative.NumTokens)) } if config.Speculative.MinAcceptanceRate > 0 { - args = append(args, "--draft-p-min", strconv.FormatFloat(config.Speculative.MinAcceptanceRate, 'f', 2, 64)) + args = append(args, "--spec-draft-p-min", strconv.FormatFloat(config.Speculative.MinAcceptanceRate, 'f', 2, 64)) } } } diff --git a/pkg/inference/runtime_flags_allowlist.go b/pkg/inference/runtime_flags_allowlist.go index 77721a95e..3b7d40ea2 100644 --- a/pkg/inference/runtime_flags_allowlist.go +++ b/pkg/inference/runtime_flags_allowlist.go @@ -114,6 +114,9 @@ var LlamaCppAllowedFlags = map[string]bool{ "--spm-infill": true, // Speculative decoding (safe flags only - no file paths) + // Flags prefixed with --spec-draft-* are the canonical names in recent llama.cpp releases. + // Short aliases (e.g. -td) and legacy unprefixed variants (e.g. --threads-draft) + // are kept for backward compatibility with older llama.cpp versions. 
"--spec-draft-threads": true, "-td": true, "--threads-draft": true, "--spec-draft-threads-batch": true, "-tbd": true, "--threads-batch-draft": true, "--spec-draft-cpu-mask": true, "-Cd": true, "--cpu-mask-draft": true, @@ -128,8 +131,8 @@ var LlamaCppAllowedFlags = map[string]bool{ "--spec-draft-override-tensor": true, "-otd": true, "--override-tensor-draft": true, "--spec-draft-cpu-moe": true, "-cmoed": true, "--cpu-moe-draft": true, "--spec-draft-n-cpu-moe": true, "--spec-draft-ncmoe": true, "-ncmoed": true, "--n-cpu-moe-draft": true, - "--spec-draft-n-max": true, "--draft-n-max": true, - "--spec-draft-n-min": true, "--draft-n-min": true, + "--spec-draft-n-max": true, "--draft-n-max": true, "--draft-max": true, + "--spec-draft-n-min": true, "--draft-n-min": true, "--draft-min": true, "--spec-draft-p-split": true, "--draft-p-split": true, "--spec-draft-p-min": true, "--draft-p-min": true, "--spec-draft-backend-sampling": true, "--no-spec-draft-backend-sampling": true, diff --git a/pkg/inference/runtime_flags_allowlist_test.go b/pkg/inference/runtime_flags_allowlist_test.go index cfbc4c677..a73ec3890 100644 --- a/pkg/inference/runtime_flags_allowlist_test.go +++ b/pkg/inference/runtime_flags_allowlist_test.go @@ -162,7 +162,7 @@ func TestLlamaCppAllowedFlags_Categories(t *testing.T) { "--metrics", "--no-metrics", "--jinja", "--no-jinja", }, "speculative": { - "--spec-draft-n-max", "--draft-n-max", "--spec-draft-n-min", "--draft-n-min", + "--spec-draft-n-max", "--draft-n-max", "--draft-max", "--spec-draft-n-min", "--draft-n-min", "--draft-min", "--spec-draft-p-min", "--draft-p-min", "--spec-draft-p-split", "--draft-p-split", "--spec-draft-threads", "-td", "--threads-draft",