timkhronos · timkhronos · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/conversion/minimax.py b/conversion/minimax.py
@@ -117,20 +117,42 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch, name, bid):
         assert self.hparams_vision is not None
 
-        # Conv3d patch embed -> split into temporal_patch_size Conv2d slices, summed in C++.
-        # MiniMax-M3 has no patch-embed bias.
+        # Conv3d patch embed -> one Conv2d weight per temporal index; slice 0 is ".weight", slice t > 0 is ".weight.{t}"
         if name == "vision_tower.vision_model.embeddings.patch_embedding.weight":
             if data_torch.ndim != 5:
                 raise ValueError(f"unexpected patch_embedding rank {data_torch.ndim} for {name}")
-            kt = data_torch.shape[2]  # temporal_patch_size
+            kt = data_torch.shape[2]  # temporal patch size: dim 2 of the 5-D Conv3d weight
             base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
             for t in range(kt):
                 suffix = ".weight" if t == 0 else f".weight.{t}"
                 yield (base + suffix, data_torch[:, :, t, ...])
             return
 
-        # everything else resolves through the precomputed MMPROJ name map:
-        #   vision_tower.vision_model.*        -> v.* (auto, shared CLIP mapping)
-        #   multi_modal_projector.linear_{bid} -> mm.{bid}
-        #   patch_merge_mlp.linear_{1,2}       -> mm.merge.fc{1,2}
-        yield from super().modify_tensors(data_torch, name, bid)
+        # Permute ViT q/k rows per head: HF [Ta Ha Wa | Tb Hb Wb | pad] -> [Ta Tb | Ha Hb | Wa Wb | pad].
+        for new_name, tensor in super().modify_tensors(data_torch, name, bid):
+            if ".attn_q." in new_name or ".attn_k." in new_name:  # matched on the name super() yields
+                tensor = self._permute_vit_qk(tensor, new_name)
+            yield new_name, tensor
+
+    def _permute_vit_qk(self, t: "Tensor", new_name: str) -> "Tensor":
+        """Reorder each head's rows of a ViT q/k tensor for per-axis NEOX RoPE.
+
+        HF keeps [Ta Ha Wa | Tb Hb Wb | pad] per head; the result is [Ta Tb | Ha Hb | Wa Wb | pad].
+        Raises ValueError unless dim 0 splits into whole heads of the accepted size (80).
+        """
+        assert self.hparams_vision is not None
+        n_head = self.hparams_vision["num_attention_heads"]
+        d_head, rem = divmod(t.shape[0], n_head)
+        # Explicit raise, checked before any indexing: asserts are stripped under `python -O`.
+        if rem != 0 or d_head != 80 or t.ndim not in (1, 2):
+            raise ValueError(f"{new_name}: shape {tuple(t.shape)} is not {n_head} heads of dim 80")
+        axis_dim = 2 * ((2 * (d_head // 2) // 3) // 2)  # rotated width per axis
+        ah = axis_dim // 2  # rotary pairs per axis
+        half = 3 * ah  # width of one HF half block
+        # Per axis: its slice of the first half, then of the second half; pad rows keep their place.
+        perm = [i for a in range(3) for h0 in (0, half) for i in range(h0 + a * ah, h0 + (a + 1) * ah)]
+        perm += range(2 * half, d_head)
+        idx = torch.tensor(perm, dtype=torch.long)
+        if t.ndim == 2:
+            return t.reshape(n_head, d_head, t.shape[1])[:, idx, :].reshape(t.shape)
+        return t.reshape(n_head, d_head)[:, idx].reshape(t.shape)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -3875,39 +3875,23 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32
             } break;
         case PROJECTOR_TYPE_MINIMAX_M3:
             {
-                // 3D RoPE cos/sin, host port of MiniMaxM3VL3DRotaryEmbedding.
-                // Block (2x2-merge) order, per-axis frequency denominator = axis_dim.
                 const int gh = image_size_height / patch_size;
                 const int gw = image_size_width  / patch_size;
-                const int n_pos    = gh * gw;
-                const int axis_dim = 26, nf = axis_dim / 2;   // 13 freqs/axis
-                const int rope_dim = 3 * axis_dim;            // 78
-                const float theta  = hparams.rope_theta;       // vision_config.rope_theta (10000)
-                std::vector<float> inv(nf);
-                for (int k = 0; k < nf; k++) {
-                    inv[k] = 1.0f / std::pow(theta, (2.0f * k) / axis_dim);
-                }
-                std::vector<float> cosb(rope_dim * n_pos), sinb(rope_dim * n_pos);
-                int p = 0;
+                std::vector<int32_t> pos_t, pos_h, pos_w; // per-patch (t, h, w) coordinates, appended in 2x2-merge block order
+                pos_t.reserve(gh * gw);
+                pos_h.reserve(gh * gw);
+                pos_w.reserve(gh * gw);
                 for (int bh = 0; bh < gh / 2; bh++)
                 for (int bw = 0; bw < gw / 2; bw++)
                 for (int mh = 0; mh < 2; mh++)
                 for (int mw = 0; mw < 2; mw++) {
-                    const int coord[3] = { 0, bh * 2 + mh, bw * 2 + mw }; // t,h,w
-                    for (int axis = 0; axis < 3; axis++) {
-                        for (int k = 0; k < nf; k++) {
-                            const float ang = coord[axis] * inv[k];
-                            const int d0 = axis * nf + k;          // 0..38
-                            cosb[p*rope_dim + d0]      = std::cos(ang);
-                            sinb[p*rope_dim + d0]      = std::sin(ang);
-                            cosb[p*rope_dim + d0 + 39] = std::cos(ang); // emb = cat([f,f])
-                            sinb[p*rope_dim + d0 + 39] = std::sin(ang);
-                        }
-                    }
-                    p++;
+                    pos_t.push_back(0);           // t coordinate is always 0 here
+                    pos_h.push_back(bh * 2 + mh); // row in the gh x gw patch grid
+                    pos_w.push_back(bw * 2 + mw); // column in the gh x gw patch grid
                 }
-                set_input_f32("minimax_cos", cosb);
-                set_input_f32("minimax_sin", sinb);
+                set_input_i32("minimax_pos_t", pos_t); // NOTE(review): the loops emit (gh/2)*(gw/2)*4 entries, equal to gh*gw only for even gh and gw - confirm preprocessing guarantees that
+                set_input_i32("minimax_pos_h", pos_h); // names must match the graph inputs declared in clip_graph_minimax_m3::build()
+                set_input_i32("minimax_pos_w", pos_w);
             } break;
         case PROJECTOR_TYPE_DOTS_OCR:
             {

diff --git a/tools/mtmd/models/minimax-m3.cpp b/tools/mtmd/models/minimax-m3.cpp
@@ -3,24 +3,29 @@
 // MiniMax-M3 vision graph
 
 ggml_tensor * clip_graph_minimax_m3::apply_rope(
-        ggml_tensor * x, ggml_tensor * rope_cos, ggml_tensor * rope_sin) {
-    const int64_t d    = x->ne[0];
-    const int64_t rd   = rope_cos->ne[0];
-    const int64_t half = rd / 2;
-    const size_t  es   = ggml_element_size(x);
-
-    ggml_tensor * x_rot  = ggml_cont(ctx0, ggml_view_3d(ctx0, x, rd,    x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0));
-    ggml_tensor * x_pass = ggml_cont(ctx0, ggml_view_3d(ctx0, x, d - rd, x->ne[1], x->ne[2], x->nb[1], x->nb[2], rd * es));
-
-    const size_t es_r = ggml_element_size(x_rot);
-    ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_3d(ctx0, x_rot, half, x_rot->ne[1], x_rot->ne[2], x_rot->nb[1], x_rot->nb[2], 0));
-    ggml_tensor * x2 = ggml_cont(ctx0, ggml_view_3d(ctx0, x_rot, half, x_rot->ne[1], x_rot->ne[2], x_rot->nb[1], x_rot->nb[2], half * es_r));
-    ggml_tensor * rot = ggml_concat(ctx0, ggml_neg(ctx0, x2), x1, 0); 
-
-    ggml_tensor * out = ggml_add(ctx0,
-        ggml_mul(ctx0, x_rot, rope_cos),
-        ggml_mul(ctx0, rot,   rope_sin));
-    return ggml_concat(ctx0, out, x_pass, 0);
+        ggml_tensor * x, ggml_tensor * pos_t, ggml_tensor * pos_h, ggml_tensor * pos_w) { // one position tensor per axis (t, h, w)
+    const int64_t Hn  = x->ne[1]; // extent of dim 1 (presumably heads - confirm against build_vit)
+    const int64_t P   = x->ne[2]; // extent of dim 2 (presumably positions - confirm against build_vit)
+    const size_t  es  = ggml_element_size(x);
+    const int     dh  = (int) x->ne[0]; // extent of dim 0, split below into t | h | w | pad
+    const int     axd = 2 * ((2 * (dh / 2) / 3) / 2); // even width of each rotated axis slice
+
+    GGML_ASSERT(x->nb[0] == es); // sl() offsets by whole elements, so dim 0 must be densely packed
+    GGML_ASSERT(3 * axd <= dh); // the three axis slices must fit inside dim 0
+
+    const float   th  = hparams.rope_theta; // passed as the frequency base to every ggml_rope_ext call below
+    auto sl = [&](int off, int n) { // contiguous copy of n elements of dim 0, starting at element off
+        return ggml_cont(ctx0, ggml_view_3d(ctx0, x, n, Hn, P, x->nb[1], x->nb[2], (size_t) off * es));
+    };
+    ggml_tensor * t   = sl(0,        axd); // elements [0, axd)
+    ggml_tensor * h   = sl(axd,      axd); // elements [axd, 2*axd)
+    ggml_tensor * w   = sl(2 * axd,  axd); // elements [2*axd, 3*axd)
+    ggml_tensor * pad = sl(3 * axd,  dh - 3 * axd); // remaining elements, returned without rotation
+
+    t = ggml_rope_ext(ctx0, t, pos_t, nullptr, axd, GGML_ROPE_TYPE_NEOX, 0, th, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // NEOX-mode RoPE over all axd dims of the slice, using that axis' positions
+    h = ggml_rope_ext(ctx0, h, pos_h, nullptr, axd, GGML_ROPE_TYPE_NEOX, 0, th, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // NOTE(review): trailing scalars presumably leave frequency scaling/extrapolation off - confirm against ggml.h
+    w = ggml_rope_ext(ctx0, w, pos_w, nullptr, axd, GGML_ROPE_TYPE_NEOX, 0, th, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+    return ggml_concat(ctx0, ggml_concat(ctx0, ggml_concat(ctx0, t, h, 0), w, 0), pad, 0); // reassemble along dim 0 in t | h | w | pad order
 }
 
 ggml_cgraph * clip_graph_minimax_m3::build() {
@@ -49,21 +54,17 @@ ggml_cgraph * clip_graph_minimax_m3::build() {
         inp = ggml_cont_3d(ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size);
     }
 
-    // 3D RoPE inputs
-    const int axis_dim = 2 * ((2 * (d_head / 2) / 3) / 2);
-    const int rope_dim = 3 * axis_dim;
-    ggml_tensor * rope_cos = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, rope_dim, 1, n_pos);
-    ggml_set_name(rope_cos, "minimax_cos"); ggml_set_input(rope_cos);
-    ggml_tensor * rope_sin = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, rope_dim, 1, n_pos);
-    ggml_set_name(rope_sin, "minimax_sin"); ggml_set_input(rope_sin);
+    ggml_tensor * pos_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+    ggml_set_name(pos_t, "minimax_pos_t"); ggml_set_input(pos_t);
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+    ggml_set_name(pos_h, "minimax_pos_h"); ggml_set_input(pos_h);
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+    ggml_set_name(pos_w, "minimax_pos_w"); ggml_set_input(pos_w);
 
     ggml_tensor * inpL = build_vit(
-        inp, n_pos,
-        NORM_TYPE_NORMAL,
-        FFN_GELU_ERF,
-        nullptr,
+        inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_ERF, nullptr,
         [&](ggml_tensor * c, const clip_layer &) {
-            return apply_rope(c, rope_cos, rope_sin); // rotate first rd dims, pass remaining dims through
+            return apply_rope(c, pos_t, pos_h, pos_w);
         });
 
     // projector

diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
@@ -43,7 +43,7 @@ struct clip_graph_qwen3vl : clip_graph_qwen2vl {
 struct clip_graph_minimax_m3 : clip_graph {
     clip_graph_minimax_m3(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
-    ggml_tensor * apply_rope(ggml_tensor * x, ggml_tensor * rope_cos, ggml_tensor * rope_sin);
+    ggml_tensor * apply_rope(ggml_tensor * x, ggml_tensor * pos_t, ggml_tensor * pos_h, ggml_tensor * pos_w); // rotates x per axis from three position tensors (t, h, w)
 };
 
 struct clip_graph_mimovl : clip_graph {