diff --git a/pkg/inference/backends/llamacpp/llamacpp.go b/pkg/inference/backends/llamacpp/llamacpp.go index 3bccff740..f5ffef771 100644 --- a/pkg/inference/backends/llamacpp/llamacpp.go +++ b/pkg/inference/backends/llamacpp/llamacpp.go @@ -181,10 +181,10 @@ func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode if draftPath != "" { args = append(args, "--model-draft", draftPath) if config.Speculative.NumTokens > 0 { - args = append(args, "--draft-max", strconv.Itoa(config.Speculative.NumTokens)) + args = append(args, "--spec-draft-n-max", strconv.Itoa(config.Speculative.NumTokens)) } if config.Speculative.MinAcceptanceRate > 0 { - args = append(args, "--draft-p-min", strconv.FormatFloat(config.Speculative.MinAcceptanceRate, 'f', 2, 64)) + args = append(args, "--spec-draft-p-min", strconv.FormatFloat(config.Speculative.MinAcceptanceRate, 'f', 2, 64)) } } } diff --git a/pkg/inference/runtime_flags_allowlist.go b/pkg/inference/runtime_flags_allowlist.go index 3b47a43ac..3b7d40ea2 100644 --- a/pkg/inference/runtime_flags_allowlist.go +++ b/pkg/inference/runtime_flags_allowlist.go @@ -114,14 +114,46 @@ var LlamaCppAllowedFlags = map[string]bool{ "--spm-infill": true, // Speculative decoding (safe flags only - no file paths) - "--draft": true, "--draft-n": true, "--draft-max": true, - "--draft-min": true, "--draft-n-min": true, - "--draft-p-min": true, - "-cd": true, "--ctx-size-draft": true, - "-devd": true, "--device-draft": true, - "-ngld": true, "--gpu-layers-draft": true, "--n-gpu-layers-draft": true, - "-td": true, "--threads-draft": true, - "-tbd": true, "--threads-batch-draft": true, + // Flags prefixed with --spec-draft-* are the canonical llama.cpp v1.5+ names. + // Short aliases (e.g. -td) and legacy unprefixed variants (e.g. --threads-draft) + // are kept for backward compatibility with older llama.cpp versions. + "--spec-draft-threads": true, "-td": true, "--threads-draft": true, + "--spec-draft-threads-batch": true, "-tbd": true, "--threads-batch-draft": true, + "--spec-draft-cpu-mask": true, "-Cd": true, "--cpu-mask-draft": true, + "--spec-draft-cpu-range": true, "-Crd": true, "--cpu-range-draft": true, + "--spec-draft-cpu-strict": true, "--cpu-strict-draft": true, + "--spec-draft-prio": true, "--prio-draft": true, + "--spec-draft-poll": true, "--poll-draft": true, + "--spec-draft-cpu-mask-batch": true, "-Cbd": true, "--cpu-mask-batch-draft": true, + "--spec-draft-cpu-strict-batch": true, "--cpu-strict-batch-draft": true, + "--spec-draft-prio-batch": true, "--prio-batch-draft": true, + "--spec-draft-poll-batch": true, "--poll-batch-draft": true, + "--spec-draft-override-tensor": true, "-otd": true, "--override-tensor-draft": true, + "--spec-draft-cpu-moe": true, "-cmoed": true, "--cpu-moe-draft": true, + "--spec-draft-n-cpu-moe": true, "--spec-draft-ncmoe": true, "-ncmoed": true, "--n-cpu-moe-draft": true, + "--spec-draft-n-max": true, "--draft-n-max": true, "--draft-max": true, + "--spec-draft-n-min": true, "--draft-n-min": true, "--draft-min": true, + "--spec-draft-p-split": true, "--draft-p-split": true, + "--spec-draft-p-min": true, "--draft-p-min": true, + "--spec-draft-backend-sampling": true, "--no-spec-draft-backend-sampling": true, + "--spec-draft-device": true, "-devd": true, "--device-draft": true, + "--spec-draft-ngl": true, "-ngld": true, "--gpu-layers-draft": true, "--n-gpu-layers-draft": true, + "--spec-type": true, + "--spec-ngram-mod-n-min": true, + "--spec-ngram-mod-n-max": true, + "--spec-ngram-mod-n-match": true, + "--spec-ngram-simple-size-n": true, + "--spec-ngram-simple-size-m": true, + "--spec-ngram-simple-min-hits": true, + "--spec-ngram-map-k-size-n": true, + "--spec-ngram-map-k-size-m": true, + "--spec-ngram-map-k-min-hits": true, + "--spec-ngram-map-k4v-size-n": true, + "--spec-ngram-map-k4v-size-m": true, + "--spec-ngram-map-k4v-min-hits": true, + "--spec-draft-type-k": true, "-ctkd": true, "--cache-type-k-draft": true, + "--spec-draft-type-v": true, "-ctvd": true, "--cache-type-v-draft": true, + "-cd": true, "--ctx-size-draft": true, // LoRA (safe flags only - no file paths) "--lora-init-without-apply": true, diff --git a/pkg/inference/runtime_flags_allowlist_test.go b/pkg/inference/runtime_flags_allowlist_test.go index 6384ea433..a73ec3890 100644 --- a/pkg/inference/runtime_flags_allowlist_test.go +++ b/pkg/inference/runtime_flags_allowlist_test.go @@ -162,9 +162,16 @@ func TestLlamaCppAllowedFlags_Categories(t *testing.T) { "--metrics", "--no-metrics", "--jinja", "--no-jinja", }, "speculative": { - "--draft", "--draft-max", "--draft-min", + "--spec-draft-n-max", "--draft-n-max", "--draft-max", "--spec-draft-n-min", "--draft-n-min", "--draft-min", + "--spec-draft-p-min", "--draft-p-min", + "--spec-draft-p-split", "--draft-p-split", + "--spec-draft-threads", "-td", "--threads-draft", + "--spec-draft-ngl", "-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft", + "--spec-draft-device", "-devd", "--device-draft", + "--spec-type", "-cd", "--ctx-size-draft", - "-ngld", "--gpu-layers-draft", + "--spec-draft-type-k", "-ctkd", "--cache-type-k-draft", + "--spec-draft-type-v", "-ctvd", "--cache-type-v-draft", }, }