Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 30 additions & 8 deletions conversion/minimax.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,20 +117,42 @@ def set_gguf_parameters(self):
def modify_tensors(self, data_torch, name, bid):
assert self.hparams_vision is not None

# Conv3d patch embed -> split into temporal_patch_size Conv2d slices, summed in C++.
# MiniMax-M3 has no patch-embed bias.
# Conv3d patch embed -> Conv2d slices
if name == "vision_tower.vision_model.embeddings.patch_embedding.weight":
if data_torch.ndim != 5:
raise ValueError(f"unexpected patch_embedding rank {data_torch.ndim} for {name}")
kt = data_torch.shape[2] # temporal_patch_size
kt = data_torch.shape[2]
base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
for t in range(kt):
suffix = ".weight" if t == 0 else f".weight.{t}"
yield (base + suffix, data_torch[:, :, t, ...])
return

# everything else resolves through the precomputed MMPROJ name map:
# vision_tower.vision_model.* -> v.* (auto, shared CLIP mapping)
# multi_modal_projector.linear_{bid} -> mm.{bid}
# patch_merge_mlp.linear_{1,2} -> mm.merge.fc{1,2}
yield from super().modify_tensors(data_torch, name, bid)
# Permute ViT q/k. HF [Ta Ha Wa | Tb Hb Wb | pad] reorder to [Ta Tb | Ha Hb | Wa Wb | pad].
for new_name, tensor in super().modify_tensors(data_torch, name, bid):
if ".attn_q." in new_name or ".attn_k." in new_name:
tensor = self._permute_vit_qk(tensor, new_name)
yield new_name, tensor

def _permute_vit_qk(self, t: "Tensor", new_name: str) -> "Tensor":
n_head = self.hparams_vision["num_attention_heads"]
d_head = t.shape[0] // n_head
axis_dim = 2 * ((2 * (d_head // 2) // 3) // 2)
ah = axis_dim // 2
half = 3 * ah
perm = (list(range(0, ah)) + list(range(half, half + ah)) +
list(range(ah, 2 * ah)) + list(range(half + ah, half + 2*ah)) +
list(range(2 * ah, 3 * ah)) + list(range(half + 2*ah, half + 3*ah)) +
list(range(2 * half, d_head)))

assert axis_dim % 2 == 0
assert 3 * axis_dim <= d_head
assert len(perm) == d_head
assert sorted(perm) == list(range(d_head)), "perm is not a bijection of d_head"
assert t.shape[0] == n_head * d_head, f"{new_name}: {t.shape[0]} != {n_head}*{d_head}"
assert d_head == 80

idx = torch.tensor(perm, dtype=torch.long)
if t.ndim == 2:
return t.reshape(n_head, d_head, t.shape[1])[:, idx, :].reshape(t.shape)
return t.reshape(n_head, d_head)[:, idx].reshape(t.shape)
36 changes: 10 additions & 26 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3875,39 +3875,23 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32
} break;
case PROJECTOR_TYPE_MINIMAX_M3:
{
// 3D RoPE cos/sin, host port of MiniMaxM3VL3DRotaryEmbedding.
// Block (2x2-merge) order, per-axis frequency denominator = axis_dim.
const int gh = image_size_height / patch_size;
const int gw = image_size_width / patch_size;
const int n_pos = gh * gw;
const int axis_dim = 26, nf = axis_dim / 2; // 13 freqs/axis
const int rope_dim = 3 * axis_dim; // 78
const float theta = hparams.rope_theta; // vision_config.rope_theta (10000)
std::vector<float> inv(nf);
for (int k = 0; k < nf; k++) {
inv[k] = 1.0f / std::pow(theta, (2.0f * k) / axis_dim);
}
std::vector<float> cosb(rope_dim * n_pos), sinb(rope_dim * n_pos);
int p = 0;
std::vector<int32_t> pos_t, pos_h, pos_w;
pos_t.reserve(gh * gw);
pos_h.reserve(gh * gw);
pos_w.reserve(gh * gw);
for (int bh = 0; bh < gh / 2; bh++)
for (int bw = 0; bw < gw / 2; bw++)
for (int mh = 0; mh < 2; mh++)
for (int mw = 0; mw < 2; mw++) {
const int coord[3] = { 0, bh * 2 + mh, bw * 2 + mw }; // t,h,w
for (int axis = 0; axis < 3; axis++) {
for (int k = 0; k < nf; k++) {
const float ang = coord[axis] * inv[k];
const int d0 = axis * nf + k; // 0..38
cosb[p*rope_dim + d0] = std::cos(ang);
sinb[p*rope_dim + d0] = std::sin(ang);
cosb[p*rope_dim + d0 + 39] = std::cos(ang); // emb = cat([f,f])
sinb[p*rope_dim + d0 + 39] = std::sin(ang);
}
}
p++;
pos_t.push_back(0);
pos_h.push_back(bh * 2 + mh);
pos_w.push_back(bw * 2 + mw);
}
set_input_f32("minimax_cos", cosb);
set_input_f32("minimax_sin", sinb);
set_input_i32("minimax_pos_t", pos_t);
set_input_i32("minimax_pos_h", pos_h);
set_input_i32("minimax_pos_w", pos_w);
} break;
case PROJECTOR_TYPE_DOTS_OCR:
{
Expand Down
61 changes: 31 additions & 30 deletions tools/mtmd/models/minimax-m3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,29 @@
// MiniMax-M3 vision graph

ggml_tensor * clip_graph_minimax_m3::apply_rope(
ggml_tensor * x, ggml_tensor * rope_cos, ggml_tensor * rope_sin) {
const int64_t d = x->ne[0];
const int64_t rd = rope_cos->ne[0];
const int64_t half = rd / 2;
const size_t es = ggml_element_size(x);

ggml_tensor * x_rot = ggml_cont(ctx0, ggml_view_3d(ctx0, x, rd, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0));
ggml_tensor * x_pass = ggml_cont(ctx0, ggml_view_3d(ctx0, x, d - rd, x->ne[1], x->ne[2], x->nb[1], x->nb[2], rd * es));

const size_t es_r = ggml_element_size(x_rot);
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_3d(ctx0, x_rot, half, x_rot->ne[1], x_rot->ne[2], x_rot->nb[1], x_rot->nb[2], 0));
ggml_tensor * x2 = ggml_cont(ctx0, ggml_view_3d(ctx0, x_rot, half, x_rot->ne[1], x_rot->ne[2], x_rot->nb[1], x_rot->nb[2], half * es_r));
ggml_tensor * rot = ggml_concat(ctx0, ggml_neg(ctx0, x2), x1, 0);

ggml_tensor * out = ggml_add(ctx0,
ggml_mul(ctx0, x_rot, rope_cos),
ggml_mul(ctx0, rot, rope_sin));
return ggml_concat(ctx0, out, x_pass, 0);
ggml_tensor * x, ggml_tensor * pos_t, ggml_tensor * pos_h, ggml_tensor * pos_w) {
const int64_t Hn = x->ne[1];
const int64_t P = x->ne[2];
const size_t es = ggml_element_size(x);
const int dh = (int) x->ne[0];
const int axd = 2 * ((2 * (dh / 2) / 3) / 2);

GGML_ASSERT(x->nb[0] == es);
GGML_ASSERT(3 * axd <= dh);

const float th = hparams.rope_theta;
auto sl = [&](int off, int n) {
return ggml_cont(ctx0, ggml_view_3d(ctx0, x, n, Hn, P, x->nb[1], x->nb[2], (size_t) off * es));
};
ggml_tensor * t = sl(0, axd);
ggml_tensor * h = sl(axd, axd);
ggml_tensor * w = sl(2 * axd, axd);
ggml_tensor * pad = sl(3 * axd, dh - 3 * axd);

t = ggml_rope_ext(ctx0, t, pos_t, nullptr, axd, GGML_ROPE_TYPE_NEOX, 0, th, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
h = ggml_rope_ext(ctx0, h, pos_h, nullptr, axd, GGML_ROPE_TYPE_NEOX, 0, th, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
w = ggml_rope_ext(ctx0, w, pos_w, nullptr, axd, GGML_ROPE_TYPE_NEOX, 0, th, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
return ggml_concat(ctx0, ggml_concat(ctx0, ggml_concat(ctx0, t, h, 0), w, 0), pad, 0);
}

ggml_cgraph * clip_graph_minimax_m3::build() {
Expand Down Expand Up @@ -49,21 +54,17 @@ ggml_cgraph * clip_graph_minimax_m3::build() {
inp = ggml_cont_3d(ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size);
}

// 3D RoPE inputs
const int axis_dim = 2 * ((2 * (d_head / 2) / 3) / 2);
const int rope_dim = 3 * axis_dim;
ggml_tensor * rope_cos = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, rope_dim, 1, n_pos);
ggml_set_name(rope_cos, "minimax_cos"); ggml_set_input(rope_cos);
ggml_tensor * rope_sin = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, rope_dim, 1, n_pos);
ggml_set_name(rope_sin, "minimax_sin"); ggml_set_input(rope_sin);
ggml_tensor * pos_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_t, "minimax_pos_t"); ggml_set_input(pos_t);
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_h, "minimax_pos_h"); ggml_set_input(pos_h);
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
ggml_set_name(pos_w, "minimax_pos_w"); ggml_set_input(pos_w);

ggml_tensor * inpL = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
FFN_GELU_ERF,
nullptr,
inp, n_pos, NORM_TYPE_NORMAL, FFN_GELU_ERF, nullptr,
[&](ggml_tensor * c, const clip_layer &) {
return apply_rope(c, rope_cos, rope_sin); // rotate first rd dims, pass remaining dims through
return apply_rope(c, pos_t, pos_h, pos_w);
});

// projector
Expand Down
2 changes: 1 addition & 1 deletion tools/mtmd/models/models.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ struct clip_graph_qwen3vl : clip_graph_qwen2vl {
struct clip_graph_minimax_m3 : clip_graph {
clip_graph_minimax_m3(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * apply_rope(ggml_tensor * x, ggml_tensor * rope_cos, ggml_tensor * rope_sin);
ggml_tensor * apply_rope(ggml_tensor * x, ggml_tensor * pos_t, ggml_tensor * pos_h, ggml_tensor * pos_w);
};

struct clip_graph_mimovl : clip_graph {
Expand Down
Loading