def modify_tensors(self, data_torch, name, bid):
    """Yield ``(gguf_name, tensor)`` pairs for one vision-checkpoint tensor.

    Two cases are handled here; everything else is delegated unchanged:

    * The patch-embedding weight (must be 5-D) is split along dim 2 into one
      slice per index.  Slice 0 is named ``<base>.weight`` and slice ``t > 0``
      is named ``<base>.weight.<t>``.
    * Tensors whose *mapped* name contains ``.attn_q.`` or ``.attn_k.`` are
      row-permuted by :meth:`_permute_vit_qk` after the parent class has
      mapped them.

    Args:
        data_torch: The tensor read from the checkpoint.
        name: The checkpoint tensor name.
        bid: Passed through to ``super().modify_tensors`` untouched.

    Raises:
        ValueError: If the patch-embedding weight is not 5-dimensional.
    """
    assert self.hparams_vision is not None

    # Conv3d patch embed -> Conv2d slices
    if name == "vision_tower.vision_model.embeddings.patch_embedding.weight":
        if data_torch.ndim != 5:
            raise ValueError(f"unexpected patch_embedding rank {data_torch.ndim} for {name}")
        kt = data_torch.shape[2]  # number of slices along dim 2 of the Conv3d kernel
        base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
        for t in range(kt):
            suffix = ".weight" if t == 0 else f".weight.{t}"
            yield (base + suffix, data_torch[:, :, t, ...])
        return

    # Permute ViT q/k. HF [Ta Ha Wa | Tb Hb Wb | pad] reorder to [Ta Tb | Ha Hb | Wa Wb | pad].
    for new_name, tensor in super().modify_tensors(data_torch, name, bid):
        if ".attn_q." in new_name or ".attn_k." in new_name:
            tensor = self._permute_vit_qk(tensor, new_name)
        yield new_name, tensor


def _permute_vit_qk(self, t: "Tensor", new_name: str) -> "Tensor":
    """Reorder the per-head output rows of a ViT q/k weight or bias.

    Within every head the rows are laid out in the checkpoint as
    ``[Ta Ha Wa | Tb Hb Wb | pad]`` (two halves, each holding one chunk per
    axis, followed by padding rows).  They are rewritten to
    ``[Ta Tb | Ha Hb | Wa Wb | pad]`` so that each axis occupies one
    contiguous block of ``axis_dim`` rows.  Padding rows keep their position.

    Args:
        t: 1-D (bias) or 2-D (weight) tensor whose dim 0 is
            ``num_attention_heads * d_head``.
        new_name: Mapped tensor name, used only in error messages.

    Returns:
        A tensor of the same shape as ``t`` with dim 0 permuted per head.

    Raises:
        ValueError: If ``t`` is not 1-D/2-D, if dim 0 is not a multiple of the
            head count, or if the resulting head size is not 80.
    """
    n_head = self.hparams_vision["num_attention_heads"]
    rows = t.shape[0]

    # Validate up front with real exceptions: ``assert`` disappears under
    # ``python -O`` and would let a mis-shaped tensor be silently scrambled.
    if t.ndim not in (1, 2):
        raise ValueError(f"{new_name}: expected a 1-D or 2-D tensor, got rank {t.ndim}")
    if rows % n_head != 0:
        raise ValueError(f"{new_name}: {rows} is not a multiple of {n_head} heads")
    d_head = rows // n_head
    if d_head != 80:
        # Guard kept from the original implementation: only this head size is accepted.
        raise ValueError(f"{new_name}: unexpected head dim {d_head}, expected 80")

    # Largest even per-axis width such that three axes fit in d_head.
    axis_dim = 2 * ((2 * (d_head // 2) // 3) // 2)
    ah = axis_dim // 2  # rows per axis inside one half
    half = 3 * ah       # rows in one [T H W] half
    perm = [
        *range(0, ah), *range(half, half + ah),                       # Ta Tb
        *range(ah, 2 * ah), *range(half + ah, half + 2 * ah),         # Ha Hb
        *range(2 * ah, 3 * ah), *range(half + 2 * ah, half + 3 * ah),  # Wa Wb
        *range(2 * half, d_head),                                     # pad
    ]
    # Internal invariant of the construction above, not input validation.
    assert sorted(perm) == list(range(d_head)), "perm is not a bijection of d_head"

    idx = torch.tensor(perm, dtype=torch.long)
    # Split dim 0 into (head, row-in-head), permute rows, restore the shape.
    # Trailing dims (none for a bias, one for a weight) are carried along.
    return t.reshape(n_head, d_head, *t.shape[1:])[:, idx].reshape(t.shape)
const int gh = image_size_height / patch_size; const int gw = image_size_width / patch_size; - const int n_pos = gh * gw; - const int axis_dim = 26, nf = axis_dim / 2; // 13 freqs/axis - const int rope_dim = 3 * axis_dim; // 78 - const float theta = hparams.rope_theta; // vision_config.rope_theta (10000) - std::vector inv(nf); - for (int k = 0; k < nf; k++) { - inv[k] = 1.0f / std::pow(theta, (2.0f * k) / axis_dim); - } - std::vector cosb(rope_dim * n_pos), sinb(rope_dim * n_pos); - int p = 0; + std::vector pos_t, pos_h, pos_w; + pos_t.reserve(gh * gw); + pos_h.reserve(gh * gw); + pos_w.reserve(gh * gw); for (int bh = 0; bh < gh / 2; bh++) for (int bw = 0; bw < gw / 2; bw++) for (int mh = 0; mh < 2; mh++) for (int mw = 0; mw < 2; mw++) { - const int coord[3] = { 0, bh * 2 + mh, bw * 2 + mw }; // t,h,w - for (int axis = 0; axis < 3; axis++) { - for (int k = 0; k < nf; k++) { - const float ang = coord[axis] * inv[k]; - const int d0 = axis * nf + k; // 0..38 - cosb[p*rope_dim + d0] = std::cos(ang); - sinb[p*rope_dim + d0] = std::sin(ang); - cosb[p*rope_dim + d0 + 39] = std::cos(ang); // emb = cat([f,f]) - sinb[p*rope_dim + d0 + 39] = std::sin(ang); - } - } - p++; + pos_t.push_back(0); + pos_h.push_back(bh * 2 + mh); + pos_w.push_back(bw * 2 + mw); } - set_input_f32("minimax_cos", cosb); - set_input_f32("minimax_sin", sinb); + set_input_i32("minimax_pos_t", pos_t); + set_input_i32("minimax_pos_h", pos_h); + set_input_i32("minimax_pos_w", pos_w); } break; case PROJECTOR_TYPE_DOTS_OCR: { diff --git a/tools/mtmd/models/minimax-m3.cpp b/tools/mtmd/models/minimax-m3.cpp index 427048b33ec9..8b602e0fd437 100644 --- a/tools/mtmd/models/minimax-m3.cpp +++ b/tools/mtmd/models/minimax-m3.cpp @@ -3,24 +3,29 @@ // MiniMax-M3 vision graph ggml_tensor * clip_graph_minimax_m3::apply_rope( - ggml_tensor * x, ggml_tensor * rope_cos, ggml_tensor * rope_sin) { - const int64_t d = x->ne[0]; - const int64_t rd = rope_cos->ne[0]; - const int64_t half = rd / 2; - const size_t es = 
ggml_element_size(x); - - ggml_tensor * x_rot = ggml_cont(ctx0, ggml_view_3d(ctx0, x, rd, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0)); - ggml_tensor * x_pass = ggml_cont(ctx0, ggml_view_3d(ctx0, x, d - rd, x->ne[1], x->ne[2], x->nb[1], x->nb[2], rd * es)); - - const size_t es_r = ggml_element_size(x_rot); - ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_3d(ctx0, x_rot, half, x_rot->ne[1], x_rot->ne[2], x_rot->nb[1], x_rot->nb[2], 0)); - ggml_tensor * x2 = ggml_cont(ctx0, ggml_view_3d(ctx0, x_rot, half, x_rot->ne[1], x_rot->ne[2], x_rot->nb[1], x_rot->nb[2], half * es_r)); - ggml_tensor * rot = ggml_concat(ctx0, ggml_neg(ctx0, x2), x1, 0); - - ggml_tensor * out = ggml_add(ctx0, - ggml_mul(ctx0, x_rot, rope_cos), - ggml_mul(ctx0, rot, rope_sin)); - return ggml_concat(ctx0, out, x_pass, 0); + ggml_tensor * x, ggml_tensor * pos_t, ggml_tensor * pos_h, ggml_tensor * pos_w) { + const int64_t Hn = x->ne[1]; + const int64_t P = x->ne[2]; + const size_t es = ggml_element_size(x); + const int dh = (int) x->ne[0]; + const int axd = 2 * ((2 * (dh / 2) / 3) / 2); + + GGML_ASSERT(x->nb[0] == es); + GGML_ASSERT(3 * axd <= dh); + + const float th = hparams.rope_theta; + auto sl = [&](int off, int n) { + return ggml_cont(ctx0, ggml_view_3d(ctx0, x, n, Hn, P, x->nb[1], x->nb[2], (size_t) off * es)); + }; + ggml_tensor * t = sl(0, axd); + ggml_tensor * h = sl(axd, axd); + ggml_tensor * w = sl(2 * axd, axd); + ggml_tensor * pad = sl(3 * axd, dh - 3 * axd); + + t = ggml_rope_ext(ctx0, t, pos_t, nullptr, axd, GGML_ROPE_TYPE_NEOX, 0, th, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + h = ggml_rope_ext(ctx0, h, pos_h, nullptr, axd, GGML_ROPE_TYPE_NEOX, 0, th, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + w = ggml_rope_ext(ctx0, w, pos_w, nullptr, axd, GGML_ROPE_TYPE_NEOX, 0, th, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + return ggml_concat(ctx0, ggml_concat(ctx0, ggml_concat(ctx0, t, h, 0), w, 0), pad, 0); } ggml_cgraph * clip_graph_minimax_m3::build() { @@ -49,21 +54,17 @@ ggml_cgraph * clip_graph_minimax_m3::build() { inp = 
// Graph builder for the MiniMax-M3 vision model.
struct clip_graph_minimax_m3 : clip_graph {
    clip_graph_minimax_m3(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    // builds and returns the compute graph for this model
    ggml_cgraph * build() override;
    // rotary embedding applied to x using three separate position tensors
    // (pos_t, pos_h, pos_w), one per axis
    ggml_tensor * apply_rope(ggml_tensor * x, ggml_tensor * pos_t, ggml_tensor * pos_h, ggml_tensor * pos_w);
};