Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions LLama.Unittest/MtmdWeightsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public void BasicPropertyChecks()
Assert.True(_mtmdWeights.SupportsVision);
Assert.False(_mtmdWeights.UsesMRope);
Assert.True(_mtmdWeights.UsesNonCausalAttention);
Assert.Equal(-1, _mtmdWeights.AudioBitrate);
Assert.Equal(-1, _mtmdWeights.SampleRate);
}

[Fact,Trait("Category", "NoCI")]
Expand Down Expand Up @@ -143,8 +143,8 @@ public void TokenizeProvidesChunkMetadata()
Assert.True(_mtmdWeights.SupportsVision);
Assert.False(_mtmdWeights.SupportsAudio);

var audioBitrate = _mtmdWeights.AudioBitrate;
Assert.True(audioBitrate <= 0);
var audioSampleRate = _mtmdWeights.SampleRate;
Assert.True(audioSampleRate <= 0);
}
}
}
17 changes: 8 additions & 9 deletions LLama/Batched/BatchedExecutor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,12 @@ public sealed class BatchedExecutor
/// The <see cref="LLamaWeights"/> this executor is using
/// </summary>
public LLamaWeights Model { get; }


/// <summary>
/// The optional <see cref="MtmdWeights"/> this executor is using
/// </summary>
public MtmdWeights? ClipModel { get; }

/// <summary>
/// Get the number of tokens in the batch, waiting for <see cref="Infer"/> to be called
/// </summary>
Expand Down Expand Up @@ -79,21 +84,15 @@ public int BatchedTokenCount
/// </summary>
/// <param name="model">The model to use</param>
/// <param name="contextParams">Parameters to create a new context</param>
public BatchedExecutor(LLamaWeights model, IContextParams contextParams)
: this(model, contextParams, null)
{
}

public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? clipModel)
/// <param name="clipModel">Clip model to use for multimodal capabilities</param>
public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? clipModel = null)
{
Model = model;
Context = model.CreateContext(contextParams);
ClipModel = clipModel;
Epoch = 1;
}

public MtmdWeights? ClipModel { get; }

/// <summary>
/// Start a new <see cref="Conversation"/>
/// </summary>
Expand Down
5 changes: 1 addition & 4 deletions LLama/LLamaEmbedder.cs
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using LLama.Abstractions;
using LLama.Exceptions;
using LLama.Native;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.Logging;
using static System.Net.Mime.MediaTypeNames;

namespace LLama;

Expand Down Expand Up @@ -79,7 +76,7 @@ public async Task<IReadOnlyList<float[]>> GetEmbeddings(string input, Cancellati
Context.Dispose();

Context = _weights.CreateContext(_params, _logger);
NativeApi.llama_set_embeddings(Context.NativeHandle, true);
Context.NativeHandle.SetEmbeddings(true);

// Add all of the tokens to the batch
var tokens = Context.Tokenize(input, special: true);
Expand Down
6 changes: 1 addition & 5 deletions LLama/LLamaReranker.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Xml.Linq;
using LLama.Abstractions;
using LLama.Exceptions;
using LLama.Native;
Expand Down Expand Up @@ -44,7 +40,7 @@ public LLamaReranker(LLamaWeights weights, IContextParams @params, ILogger? logg
if (@params.PoolingType != LLamaPoolingType.Rank)
throw new NotSupportedException("Computing rank score, PoolingType must be equal to LLamaPoolingType.Rank");
Context = weights.CreateContext(@params, logger);
NativeApi.llama_set_embeddings(Context.NativeHandle, true);
Context.NativeHandle.SetEmbeddings(true);
}

/// <inheritdoc />
Expand Down
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
</ItemGroup>

<PropertyGroup>
<BinaryReleaseId>ff4affb4c1aa7eb4_v3</BinaryReleaseId>
<BinaryReleaseId>3f7c29d318e317b6</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
Expand Down
4 changes: 2 additions & 2 deletions LLama/MtmdWeights.cs
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,10 @@
public int EvaluateChunks(SafeMtmdInputChunks chunks, SafeLLamaContextHandle llamaContext, ref int nPast, int seqId, int nBatch, bool logitsLast)
=> NativeHandle.EvaluateChunks(chunks, llamaContext, ref nPast, seqId, nBatch, logitsLast);

public int EvaluateChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext, ref int nPast, int seqId, int nBatch, bool logitsLast)

Check warning on line 113 in LLama/MtmdWeights.cs

View workflow job for this annotation

GitHub Actions / Linux ARM64 CPU

Missing XML comment for publicly visible type or member 'MtmdWeights.EvaluateChunk(IntPtr, SafeLLamaContextHandle, ref int, int, int, bool)'

Check warning on line 113 in LLama/MtmdWeights.cs

View workflow job for this annotation

GitHub Actions / Windows x64 CPU

Missing XML comment for publicly visible type or member 'MtmdWeights.EvaluateChunk(IntPtr, SafeLLamaContextHandle, ref int, int, int, bool)'

Check warning on line 113 in LLama/MtmdWeights.cs

View workflow job for this annotation

GitHub Actions / macOS ARM64 Metal

Missing XML comment for publicly visible type or member 'MtmdWeights.EvaluateChunk(IntPtr, SafeLLamaContextHandle, ref int, int, int, bool)'

Check warning on line 113 in LLama/MtmdWeights.cs

View workflow job for this annotation

GitHub Actions / Linux x64 CPU

Missing XML comment for publicly visible type or member 'MtmdWeights.EvaluateChunk(IntPtr, SafeLLamaContextHandle, ref int, int, int, bool)'
=> NativeHandle.EvaluateChunk(chunkPtr, llamaContext, ref nPast, seqId, nBatch, logitsLast);

public int DecodeImageChunk(IntPtr chunkPtr, SafeLLamaContextHandle llamaContext, IntPtr encodedEmbeddings, ref int nPast, int seqId, int nBatch)

Check warning on line 116 in LLama/MtmdWeights.cs

View workflow job for this annotation

GitHub Actions / Linux ARM64 CPU

Missing XML comment for publicly visible type or member 'MtmdWeights.DecodeImageChunk(IntPtr, SafeLLamaContextHandle, IntPtr, ref int, int, int)'

Check warning on line 116 in LLama/MtmdWeights.cs

View workflow job for this annotation

GitHub Actions / Windows x64 CPU

Missing XML comment for publicly visible type or member 'MtmdWeights.DecodeImageChunk(IntPtr, SafeLLamaContextHandle, IntPtr, ref int, int, int)'

Check warning on line 116 in LLama/MtmdWeights.cs

View workflow job for this annotation

GitHub Actions / macOS ARM64 Metal

Missing XML comment for publicly visible type or member 'MtmdWeights.DecodeImageChunk(IntPtr, SafeLLamaContextHandle, IntPtr, ref int, int, int)'

Check warning on line 116 in LLama/MtmdWeights.cs

View workflow job for this annotation

GitHub Actions / Linux x64 CPU

Missing XML comment for publicly visible type or member 'MtmdWeights.DecodeImageChunk(IntPtr, SafeLLamaContextHandle, IntPtr, ref int, int, int)'
=> NativeHandle.DecodeImageChunk(chunkPtr, llamaContext, encodedEmbeddings, ref nPast, seqId, nBatch);

/// <summary>
Expand All @@ -137,9 +137,9 @@
public bool UsesMRope => NativeHandle.DecodeUseMRope();

/// <summary>
/// Gets the audio bitrate advertised by the model.
/// Gets the audio sample rate advertised by the model.
/// </summary>
public int AudioBitrate => NativeHandle.GetAudioBitrate();
public int SampleRate => NativeHandle.GetAudioSampleRate();

/// <inheritdoc />
public void Dispose() => NativeHandle.Dispose();
Expand Down
6 changes: 6 additions & 0 deletions LLama/Native/GPUSplitMode.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,10 @@ public enum GPUSplitMode
/// split layers and KV across GPUs, use tensor parallelism if supported
/// </summary>
Row = 2,

// Undocumented in llama.h
/// <summary>
///
/// </summary>
Tensor = 3,
}
12 changes: 11 additions & 1 deletion LLama/Native/LLamaFtype.cs
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,17 @@ public enum LLamaFtype
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38,


/// <summary>
/// Except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_NVFP4 = 39,

/// <summary>
/// Except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_Q1_0 = 40,

/// <summary>
/// File type was not specified
/// </summary>
Expand Down
12 changes: 12 additions & 0 deletions LLama/Native/LLamaModelImatrixData.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
namespace LLama.Native;

// NOTE(review): kept commented out — this mirrors the native `llama_model_imatrix_data`
// struct that `LLamaModelQuantizeParams.imatrix` points to (see the `// LLamaModelImatrixData *`
// note on that field), but it is not currently wrapped/exposed by LLamaSharp.
// Confirm the native layout (char* / float* / nuint) and marshalling before enabling.
/* /// <summary>
///
/// </summary>
/// <remarks>llama_model_imatrix_data</remarks>
public unsafe struct LLamaModelImatrixData
{
    char* name;
    float* data;
    nuint size;
} */
18 changes: 14 additions & 4 deletions LLama/Native/LLamaModelQuantizeParams.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,25 +79,35 @@ public bool keep_split
}
private sbyte _keep_split;

/// <summary>
/// calculate and show the final quantization size without performing quantization
/// </summary>
public bool dry_run
{
get => Convert.ToBoolean(_dry_run);
set => _dry_run = Convert.ToSByte(value);
}
private sbyte _dry_run;

/// <summary>
/// pointer to importance matrix data
/// </summary>
public IntPtr imatrix;
public IntPtr imatrix; // LLamaModelImatrixData *

/// <summary>
/// pointer to vector containing overrides
/// </summary>
public IntPtr kv_overrides;
public IntPtr kv_overrides; // llama_model_kv_override *

/// <summary>
/// pointer to vector containing tensor types
/// </summary>
public IntPtr tensor_types;
public IntPtr tensor_types; // llama_model_tensor_override *

/// <summary>
/// Pointer to vector containing layer indices to prune
/// </summary>
public IntPtr prune_layers;
public IntPtr prune_layers; // int32 *

/// <summary>
/// Create a LLamaModelQuantizeParams with default values
Expand Down
9 changes: 9 additions & 0 deletions LLama/Native/LLamaModelTensorOverride.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
namespace LLama.Native;

// Unsupported - we can't handle ggml_type since LlamaSharp doesn't wrap/expose ggml
/*
* struct llama_model_tensor_override {
const char * pattern;
enum ggml_type type; // GGMLType might work?
};
*/
16 changes: 14 additions & 2 deletions LLama/Native/LoraAdapter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,24 @@ internal LoraAdapter(SafeLlamaModelHandle model, string path, IntPtr nativePtr)
/// </summary>
public void Unload()
{
// Early exit if already unloaded
if (!Loaded)
return;

// If the model has been unloaded this handle will have been auto unloaded
if (Model.IsClosed)
{
Loaded = false;
return;
}

// Unload
Loaded = false;
llama_adapter_lora_free(Pointer);

// Manually free a LoRA adapter. loaded adapters will be free when the associated model is deleted
// Manually free a LoRA adapter. loaded adapters which have not been
// freed will be automatically freed when the associated model is deleted
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
[Obsolete("adapters are now freed together with the associated model")]
static extern void llama_adapter_lora_free(IntPtr adapter);
}
}
61 changes: 58 additions & 3 deletions LLama/Native/NativeApi.Mtmd.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,24 +41,50 @@ internal struct mtmd_context_params
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_context_params_default", CallingConvention = CallingConvention.Cdecl)]
internal static extern mtmd_context_params mtmd_context_params_default();

/// <summary>
/// Whether we need to set a non-causal attention mask before llama_decode.
/// NOTE(review): the copied native comment reads "if chunk is nullptr, we assume the
/// default case where chunk is an image chunk", but this binding takes only a context
/// handle and no chunk argument — confirm this matches the native signature in the
/// pinned binary release.
/// </summary>
/// <param name="ctx">Handle to the mtmd model/context.</param>
/// <returns>True if a non-causal mask must be set before decoding.</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_decode_use_non_causal", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
internal static extern bool mtmd_decode_use_non_causal(SafeMtmdModelHandle ctx);

/// <summary>
/// whether the current model use M-RoPE for llama_decode
/// </summary>
/// <param name="ctx"></param>
/// <returns></returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_decode_use_mrope", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
internal static extern bool mtmd_decode_use_mrope(SafeMtmdModelHandle ctx);

/// <summary>
/// whether the current model supports vision input
/// </summary>
/// <param name="ctx"></param>
/// <returns></returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_support_vision", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
internal static extern bool mtmd_support_vision(SafeMtmdModelHandle ctx);

/// <summary>
/// whether the current model supports audio input
/// </summary>
/// <param name="ctx"></param>
/// <returns></returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_support_audio", CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
internal static extern bool mtmd_support_audio(SafeMtmdModelHandle ctx);

[DllImport(mtmdLibraryName, EntryPoint = "mtmd_get_audio_bitrate", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_get_audio_bitrate(SafeMtmdModelHandle ctx);
/// <summary>
/// get audio sample rate in Hz, for example 16000 for Whisper
/// </summary>
/// <param name="ctx"></param>
/// <returns></returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_get_audio_sample_rate", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_get_audio_sample_rate(SafeMtmdModelHandle ctx);

// bitmap ------------------------------------------------------------

Expand Down Expand Up @@ -153,9 +179,11 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id)
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_n_tokens", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_image_tokens_get_n_tokens(IntPtr image_tokens);

[Obsolete("use mtmd_image_tokens_get_decoder_pos() instead")]
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_nx", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_image_tokens_get_nx(IntPtr image_tokens);

[Obsolete("use mtmd_image_tokens_get_decoder_pos() instead")]
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_ny", CallingConvention = CallingConvention.Cdecl)]
internal static extern UIntPtr mtmd_image_tokens_get_ny(IntPtr image_tokens);

Expand All @@ -165,10 +193,32 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id)
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_n_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_image_tokens_get_n_pos(IntPtr image_tokens);

/// <summary>
/// Relative decoder position for a single embedding token, as returned by
/// <c>mtmd_image_tokens_get_decoder_pos</c>. Used by M-RoPE models; embedding 0
/// has position (0, 0, 0) and callers must adjust to the current absolute position.
/// Explicit layout mirrors the native struct: three consecutive 32-bit values.
/// </summary>
[StructLayout(LayoutKind.Explicit)]
internal struct mtmd_decoder_pos
{
    /// <summary>Temporal position component.</summary>
    // Fields are `internal` (not the C# default of private) so callers inside the
    // assembly can actually read the marshalled result.
    [FieldOffset(0)]
    internal uint t;

    /// <summary>Horizontal (x) position component.</summary>
    [FieldOffset(4)]
    internal uint x;

    /// <summary>Vertical (y) position component.</summary>
    [FieldOffset(8)]
    internal uint y;
}

/// <summary>
/// get position for decoder attention, to be used by M-RoPE models
/// </summary>
/// <param name="image_tokens"></param>
/// <param name="i">i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1</param>
/// <returns>return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)</returns>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_image_tokens_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(IntPtr image_tokens, nuint i);

// tokenize ----------------------------------------------------------

/// <summary>
/// Native text structure consumed by <see cref="mtmd_tokenize"/>.
/// Native text structure consumed by <see cref="NativeApi.mtmd_tokenize(LLama.Native.SafeMtmdModelHandle,System.IntPtr,in LLama.Native.NativeApi.mtmd_input_text_native,System.IntPtr[],System.UIntPtr)"/>.
/// </summary>
internal unsafe struct mtmd_input_text_native
{
Expand Down Expand Up @@ -259,6 +309,11 @@ internal static unsafe IntPtr mtmd_helper_bitmap_init_from_file(SafeMtmdModelHan
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_get_n_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_helper_get_n_pos(SafeMtmdInputChunks chunks);

/// <summary>
/// Helper to get the list of relative decoder positions corresponding to the
/// embedding tokens, to be used by M-RoPE models.
/// </summary>
/// <param name="image">Pointer to the native <c>mtmd_image_tokens</c>.</param>
/// <param name="out_pos">Pointer to a caller-allocated array of native <c>mtmd_decoder_pos</c>;
/// must have length == <c>mtmd_helper_get_n_tokens(image)</c>.</param>
[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_image_get_decoder_pos", CallingConvention = CallingConvention.Cdecl)]
internal static extern void mtmd_helper_image_get_decoder_pos(IntPtr /* mtmd_image_tokens* */ image, IntPtr /* mtmd_decoder_pos* */ out_pos);

[DllImport(mtmdLibraryName, EntryPoint = "mtmd_helper_eval_chunks", CallingConvention = CallingConvention.Cdecl)]
internal static extern int mtmd_helper_eval_chunks(
SafeMtmdModelHandle ctx,
Expand Down
Loading
Loading