From e644ecf5fee34b937a2042aef21ec6e27f44d025 Mon Sep 17 00:00:00 2001 From: Mapika Date: Fri, 12 Jun 2026 22:35:32 +0200 Subject: [PATCH] Detect fused experts using _apply_gate as activation marker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _is_fused_experts_module() required an act_fn attribute, so fused expert modules that implement their gated activation as a method instead — e.g. MiniMaxM3VLExperts, whose clamped swiglu lives in _apply_gate() — were silently left unquantized (PTQ completes, experts stay high-precision). Accept _apply_gate as an alternative activation marker. The eager forward of such modules still performs the same two F.linear calls per expert that _QuantFusedExperts intercepts, so no other change is needed. Signed-off-by: Mapika --- .../torch/quantization/plugins/huggingface.py | 7 +++++-- .../quantization/plugins/test_fused_experts.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 97e13f419f9..a888f50319a 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -1471,7 +1471,8 @@ def _is_fused_experts_module(module): Detects the standardized HuggingFace transformers 5.0+ fused expert pattern: ``gate_up_proj`` (3-D parameter), ``down_proj`` (3-D parameter), ``num_experts``, - and ``act_fn``. Matches ``MixtralExperts``, ``Qwen2MoeExperts``, + and ``act_fn`` (or ``_apply_gate`` for clamped-swiglu experts such as + ``MiniMaxM3VLExperts``). Matches ``MixtralExperts``, ``Qwen2MoeExperts``, ``Qwen3MoeExperts``, ``Qwen3_5MoeExperts``, ``DeepseekV3NaiveMoe``, ``JambaExperts``, ``OlmoeExperts``, etc. @@ -1480,7 +1481,9 @@ def _is_fused_experts_module(module): """ if not hasattr(module, "gate_up_proj") or not hasattr(module, "down_proj"): return False - if not hasattr(module, "num_experts") or not hasattr(module, "act_fn"): + if not hasattr(module, "num_experts") or not ( + hasattr(module, "act_fn") or hasattr(module, "_apply_gate") + ): return False gate_up = getattr(module, "gate_up_proj") down = getattr(module, "down_proj") diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py index ce23f7a51d5..ff9fd48fa26 100644 --- a/tests/unit/torch/quantization/plugins/test_fused_experts.py +++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py @@ -152,6 +152,21 @@ def test_module_missing_act_fn_not_detected(self): module.num_experts = 4 assert _is_fused_experts_module(module) is False + def test_module_with_apply_gate_detected(self): + """Clamped-swiglu experts (e.g. MiniMaxM3VLExperts) use _apply_gate instead of act_fn.""" + + class _ApplyGateExperts(nn.Module): + def __init__(self): + super().__init__() + self.gate_up_proj = nn.Parameter(torch.randn(4, 16, 8)) + self.down_proj = nn.Parameter(torch.randn(4, 8, 16)) + self.num_experts = 4 + + def _apply_gate(self, gate, up): + return up * torch.sigmoid(gate) + + assert _is_fused_experts_module(_ApplyGateExperts()) is True + def test_sparse_moe_block_not_detected_as_fused(self): block = _SyntheticSparseMoeBlock() assert _is_fused_experts_module(block) is False