From 3ad41c95b1191a5889b8614139c8bf2203a8ffcc Mon Sep 17 00:00:00 2001
From: fanxiangyu
Date: Mon, 16 Mar 2026 15:07:20 +0800
Subject: [PATCH] glm4_moe: fuse routed/shared expert TP all-reduces into one

In pure tensor-parallel mode (tp > 1, ep = 1) the routed-expert and
shared-expert branches each ran their own all-reduce. Let both branches
return partial sums instead (reduce_results=False), add them, and run a
single all-reduce on the combined tensor -- one collective per MoE layer
instead of two. All other modes (EP, EP + attn-TP, no parallelism) keep
the per-branch reduction (reduce_results default=True) and get no extra
collective.
---
 fastdeploy/model_executor/models/glm4_moe.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py
index ecef165d86c..78ee4c23086 100644
--- a/fastdeploy/model_executor/models/glm4_moe.py
+++ b/fastdeploy/model_executor/models/glm4_moe.py
@@ -25,6 +25,7 @@
 from paddleformers.utils.log import logger
 
 from fastdeploy.config import FDConfig
+from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import (
     support_graph_optimization,
@@ -159,8 +160,16 @@ def __init__(
             default_initializer=paddle.nn.initializer.Constant(0),
         )
 
+        # In pure-TP mode (tp>1, ep=1) both branches return partial sums, so we
+        # defer the all-reduce to after combining them — saving one collective.
+        # In all other modes (EP, EP+attn-TP, no parallelism) each branch handles
+        # its own reduction internally (reduce_results default=True), so we must
+        # NOT add an extra all-reduce here.
+        self._pure_tp = self.use_tp and not self.use_ep
+
         self.experts = FusedMoE(
             fd_config,
+            reduce_results=not self._pure_tp,
             renormalize=self.norm_topk_prob,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
             num_experts=fd_config.model_config.n_routed_experts,
@@ -181,12 +190,14 @@ def __init__(
             intermediate_size=shared_experts_intermediate_size,
             layer_id=layer_id,
             prefix=f"{prefix}.shared_experts",
+            reduce_results=not self._pure_tp,
         )
 
     def forward(self, x, forward_meta: ForwardMeta = None):
         out = self.experts(x, self.gate, forward_meta)
         if self.n_shared_experts > 0:
-            shared_experts_out = self.shared_experts(x)
-            out = out + shared_experts_out
-
+            out = out + self.shared_experts(x)
+        if self._pure_tp:
+            # Both branches produced partial sums; combine first, then single all-reduce.
+            out = tensor_model_parallel_all_reduce(out, self.tp_group)
         return out