LLAMA: add support for grammar-constrained sampling and a configurable seed

Chris Warren-Smith · Chris Warren-Smith · commit 7a268902a1b8 · 2026-02-09T08:59:24.000+10:30
diff --git a/llama/llama-sb.cpp b/llama/llama-sb.cpp
@@ -31,7 +31,8 @@ Llama::Llama() :
   _min_p(0),
   _top_k(0),
   _max_tokens(0),
-  _log_level(GGML_LOG_LEVEL_CONT) {
+  _log_level(GGML_LOG_LEVEL_CONT),
+  _seed(LLAMA_DEFAULT_SEED) {
   llama_log_set([](enum ggml_log_level level, const char * text, void *user_data) {
     Llama *llama = (Llama *)user_data;
     if (level > llama->_log_level) {
@@ -63,6 +64,9 @@ void Llama::reset() {
   _top_p = 1.0f;
   _min_p = 0.0f;
   _max_tokens = 150;
+  _grammar_src.clear();
+  _grammar_root.clear();
+  _seed = LLAMA_DEFAULT_SEED;
   if (_ctx) {
     llama_memory_clear(llama_get_memory(_ctx), true);
   }
@@ -93,36 +97,67 @@ bool Llama::construct(string model_path, int n_ctx, int n_batch, int n_gpu_layer
       _last_error = "Failed to create context";
     } else {
       _vocab = llama_model_get_vocab(_model);
-
-      auto sparams = llama_sampler_chain_default_params();
-      sparams.no_perf = false;
-      _sampler = llama_sampler_chain_init(sparams);
     }
   }
   return _last_error.empty();
 }
 
-void Llama::configure_sampler() {
-  llama_sampler_reset(_sampler);
+//
+// Stores the grammar text and the name of its start rule. Nothing is parsed
+// here: configure_sampler() reads both and skips the grammar when src is empty.
+//
+void Llama::set_grammar(const string &src, const string &root) {
+  _grammar_src = src;
+  _grammar_root = root;
+}
+
+//
+// Builds a new sampler chain from the current settings. On success the new
+// chain replaces _sampler (the previous one is freed) and true is returned.
+// If the grammar sampler cannot be created, _last_error is set, _sampler is
+// left as it was and false is returned.
+//
+bool Llama::configure_sampler() {
+  auto sparams = llama_sampler_chain_default_params();
+  sparams.no_perf = false;
+  llama_sampler *chain = llama_sampler_chain_init(sparams);
+
+  if (!_grammar_src.empty()) {
+    llama_sampler *grammar = llama_sampler_init_grammar(_vocab, _grammar_src.c_str(), _grammar_root.c_str());
+    if (!grammar) {
+      // nothing owns the new chain yet, so free it here or it leaks
+      llama_sampler_free(chain);
+      _last_error = "failed to initialize grammar sampler";
+      return false;
+    }
+    llama_sampler_chain_add(chain, grammar);
+  }
   if (_penalty_last_n != 0 && _penalty_repeat != 1.0f) {
     auto penalties = llama_sampler_init_penalties(_penalty_last_n, _penalty_repeat, 0.0f, 0.0f);
-    llama_sampler_chain_add(_sampler, penalties);
+    llama_sampler_chain_add(chain, penalties);
   }
   if (_temperature <= 0.0f) {
-    llama_sampler_chain_add(_sampler, llama_sampler_init_greedy());
+    llama_sampler_chain_add(chain, llama_sampler_init_greedy());
   } else {
-    llama_sampler_chain_add(_sampler, llama_sampler_init_temp(_temperature));
     if (_top_k > 0) {
-      llama_sampler_chain_add(_sampler, llama_sampler_init_top_k(_top_k));
+      llama_sampler_chain_add(chain, llama_sampler_init_top_k(_top_k));
     }
-    if (_top_p < 1.0f) {
-      llama_sampler_chain_add(_sampler, llama_sampler_init_top_p(_top_p, 1));
+    // NOTE(review): top_p is also added whenever min_p is set, even with _top_p == 1.0f - confirm intent
+    if (_top_p < 1.0f || _min_p > 0.0f) {
+      llama_sampler_chain_add(chain, llama_sampler_init_top_p(_top_p, 1));
     }
     if (_min_p > 0.0f) {
-      llama_sampler_chain_add(_sampler, llama_sampler_init_min_p(_min_p, 1));
+      llama_sampler_chain_add(chain, llama_sampler_init_min_p(_min_p, 1));
     }
-    llama_sampler_chain_add(_sampler, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+    // temperature is added after the top-k/top-p/min-p filters, just before the final draw
+    llama_sampler_chain_add(chain, llama_sampler_init_temp(_temperature));
+    llama_sampler_chain_add(chain, llama_sampler_init_dist(_seed));
   }
+  if (_sampler) {
+    llama_sampler_free(_sampler);
+  }
+  _sampler = chain;
+  return true;
 }
 
 vector<llama_token> Llama::tokenize(const string &prompt) {
@@ -201,7 +222,9 @@ bool Llama::make_space_for_tokens(int n_tokens, int keep_min) {
 }
 
 bool Llama::generate(LlamaIter &iter, const string &prompt) {
-  configure_sampler();
+  if (!configure_sampler()) {
+    return false;
+  }
 
   vector<llama_token> prompt_tokens = tokenize(prompt);
   if (prompt_tokens.size() == 0) {
diff --git a/llama/llama-sb.h b/llama/llama-sb.h
@@ -50,6 +50,8 @@ struct Llama {
   void set_temperature(float temperature) { _temperature = temperature; }
   void set_top_k(int top_k) { _top_k = top_k; }
   void set_top_p(float top_p) { _top_p = top_p; }
+  void set_grammar(const string &src, const string &root); // grammar text and start rule; read by configure_sampler()
+  void set_seed(unsigned int seed) { _seed = seed; } // seed handed to the dist sampler in configure_sampler()
 
   // error handling
   const char *last_error() { return _last_error.c_str(); }
@@ -58,7 +60,7 @@ struct Llama {
 
   private:
   bool ends_with_sentence_boundary(const string &out);
-  void configure_sampler();
+  bool configure_sampler();
   bool make_space_for_tokens(int n_tokens, int keep_min);
   vector<llama_token> tokenize(const string &prompt);
   string token_to_string(LlamaIter &iter, llama_token tok);
@@ -68,6 +70,8 @@ struct Llama {
   llama_sampler *_sampler;
   const llama_vocab *_vocab;
   vector<string> _stop_sequences;
+  string _grammar_src;
+  string _grammar_root;
   string _last_error;
   int32_t _penalty_last_n;
   float _penalty_repeat;
@@ -77,4 +81,5 @@ struct Llama {
   int _top_k;
   int _max_tokens;
   int _log_level;
+  unsigned int _seed;
 };
diff --git a/llama/main.cpp b/llama/main.cpp
@@ -211,6 +211,42 @@ static int cmd_llama_set_top_p(var_s *self, int argc, slib_par_t *arg, var_s *re
   return result;
 }
 
+//
+// llama.set_grammar("text") - store "text" as the sampling grammar (start rule "root") for later generation
+//
+static int cmd_llama_set_grammar(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
+  int result = 0; // stays 0 unless the grammar was stored
+  if (argc != 1) {
+    error(retval, "llama.set_grammar", 1, 1);
+  } else {
+    int id = get_llama_class_id(self, retval);
+    if (id != -1) {
+      Llama &llama = g_llama.at(id);
+      llama.set_grammar(get_param_str(argc, arg, 0, 0), "root"); // start rule is fixed to "root"
+      result = 1;
+    }
+  }
+  return result;
+}
+
+//
+// llama.set_seed(123) - seed used by the dist sampler, i.e. only when temperature > 0
+//
+static int cmd_llama_set_seed(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
+  int result = 0; // stays 0 unless the seed was stored
+  if (argc != 1) {
+    error(retval, "llama.set_seed", 1, 1);
+  } else {
+    int id = get_llama_class_id(self, retval);
+    if (id != -1) {
+      Llama &llama = g_llama.at(id);
+      // NOTE(review): the value is implicitly converted to unsigned int by set_seed - confirm behaviour for negative input
+      llama.set_seed(get_param_num(argc, arg, 0, 0));
+      result = 1;
+    }
+  }
+  return result;
+}
+
 //
 // llama.reset() - make the model forget everything
 //
@@ -355,6 +391,8 @@ static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
     v_create_callback(retval, "set_temperature", cmd_llama_set_temperature);
     v_create_callback(retval, "set_top_k", cmd_llama_set_top_k);
     v_create_callback(retval, "set_top_p", cmd_llama_set_top_p);
+    v_create_callback(retval, "set_grammar", cmd_llama_set_grammar);
+    v_create_callback(retval, "set_seed", cmd_llama_set_seed);
     result = 1;
   } else {
     error(retval, llama.last_error());
@@ -388,7 +426,7 @@ int sblib_init(const char *sourceFile) {
 //
 // Release variables falling out of scope
 //
-SBLIB_API void sblib_free(int cls_id, int id) {
+SBLIB_API int sblib_free(int cls_id, int id) {
   if (id != -1) {
     switch (cls_id) {
     case CLASS_ID_LLAMA:
@@ -403,6 +441,7 @@ SBLIB_API void sblib_free(int cls_id, int id) {
       break;
     }
   }
+  return 0;
 }
 
 //