From 65bce6a1da76bae84321c1248087957f15925ddf Mon Sep 17 00:00:00 2001 From: kevincheng2 Date: Fri, 27 Mar 2026 15:54:58 +0800 Subject: [PATCH 1/2] [Feature] Support --skip-mm-profiling to skip multimodal token overhead in profiling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Motivation 在多模态模型(如 Qwen2.5-VL、ERNIE4.5-VL 等)部署时,`get_max_chunk_tokens` 会在 基础 token 数之上额外叠加 mm token 数,用于 profiling 阶段预留显存。 某些场景下(如已知图像 token 数较小,或希望节省显存),用户希望跳过该多模态 token 额外开销的计算,直接使用文本 token 数进行 profiling。 ## Modifications - `fastdeploy/engine/args_utils.py`:`EngineArgs` 新增 `skip_mm_profiling: bool = False` 字段,parser 新增 `--skip-mm-profiling` 启动参数 - `fastdeploy/config.py`:`ModelConfig.__init__` 新增 `self.skip_mm_profiling = False`; `FDConfig.get_max_chunk_tokens` 中增加 `not self.model_config.skip_mm_profiling` 判断, 开启后跳过 mm token 叠加,直接返回基础 `num_tokens` ## Usage or Command 启动服务时添加参数: ```bash --skip-mm-profiling ``` ## Checklist - [x] Add at least a tag in the PR title. - [x] Format your code, run `pre-commit` before commit. - [ ] Add unit tests. 
本功能为配置参数透传,逻辑简单,已有相关 config 单元测试覆盖。 Co-Authored-By: Claude Sonnet 4.6 --- fastdeploy/config.py | 3 ++- fastdeploy/engine/args_utils.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 50e0ba08a37..4704e526971 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -233,6 +233,7 @@ def __init__( self.partial_rotary_factor: float = 1.0 self.num_nextn_predict_layers = 0 self.mm_max_tokens_per_item = None + self.skip_mm_profiling = False for key, value in args.items(): if hasattr(self, key) and value != "None": setattr(self, key, value) @@ -2392,7 +2393,7 @@ def get_max_chunk_tokens(self, mm_max_tokens_per_item=None): num_tokens = self.scheduler_config.max_num_seqs else: num_tokens = self.scheduler_config.max_num_batched_tokens - if mm_max_tokens_per_item is not None: + if mm_max_tokens_per_item is not None and not self.model_config.skip_mm_profiling: max_mm_tokens = max( mm_max_tokens_per_item.get("image", 0), mm_max_tokens_per_item.get("video", 0), diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index afb7095a449..c32eddcf251 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -184,6 +184,12 @@ class EngineArgs: """ Flags to enable multi-modal model """ + skip_mm_profiling: bool = False + """ + Skip multimodal token overhead when calculating max chunk tokens for profiling. + When enabled, get_max_chunk_tokens will not add extra mm tokens, + which avoids reserving extra GPU memory for multimodal inputs during profiling. + """ speculative_config: Optional[Dict[str, Any]] = None """ Configuration for speculative execution. 
@@ -798,6 +804,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.enable_mm, help="Flag to enable multi-modal model.", ) + model_group.add_argument( + "--skip-mm-profiling", + action="store_true", + default=EngineArgs.skip_mm_profiling, + help="Skip multimodal token overhead when calculating max chunk tokens for profiling.", + ) model_group.add_argument( "--reasoning-parser", type=str, From e8864939513dfab928318de21f24ee0f27398ff7 Mon Sep 17 00:00:00 2001 From: kevincheng2 Date: Fri, 27 Mar 2026 18:40:16 +0800 Subject: [PATCH 2/2] [Refactor] Replace skip_mm_profiling with deploy_modality=text to skip mm profiling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Motivation 原 `--skip-mm-profiling` 参数与已有的 `deploy_modality` 参数功能存在语义重叠: 当以纯文本模式(`deploy_modality=text`)部署时,本就不需要为多模态 token 预留显存。 引入独立参数增加了配置复杂度,复用 `deploy_modality` 更加直观和一致。 ## Modifications - `fastdeploy/engine/args_utils.py`:删除 `EngineArgs.skip_mm_profiling` 字段及 `--skip-mm-profiling` 启动参数 - `fastdeploy/config.py`:删除 `ModelConfig.__init__` 中的 `self.skip_mm_profiling = False`; `FDConfig.get_max_chunk_tokens` 中将条件改为 `self.deploy_modality != DeployModality.TEXT`, 当 deploy_modality 为 text 时直接返回基础 `num_tokens`(`max_num_seqs` 或 `max_num_batched_tokens`),跳过 mm token 叠加 ## Usage or Command ```bash # 以文本模式部署,跳过 mm token profiling 开销(替代原 --skip-mm-profiling) python -m fastdeploy.entrypoints.openai.api_server \ --deploy-modality text \ --model /path/to/model \ ... ``` ## Checklist - [x] Add at least a tag in the PR title. - [x] Format your code, run `pre-commit` before commit. - [ ] Add unit tests. 
本次为参数重构,逻辑等价替换,已有 config 单元测试覆盖。 Co-Authored-By: Claude Sonnet 4.6 --- fastdeploy/config.py | 3 +-- fastdeploy/engine/args_utils.py | 12 ------------ 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 4704e526971..b15a6dc824b 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -233,7 +233,6 @@ def __init__( self.partial_rotary_factor: float = 1.0 self.num_nextn_predict_layers = 0 self.mm_max_tokens_per_item = None - self.skip_mm_profiling = False for key, value in args.items(): if hasattr(self, key) and value != "None": setattr(self, key, value) @@ -2393,7 +2392,7 @@ def get_max_chunk_tokens(self, mm_max_tokens_per_item=None): num_tokens = self.scheduler_config.max_num_seqs else: num_tokens = self.scheduler_config.max_num_batched_tokens - if mm_max_tokens_per_item is not None and not self.model_config.skip_mm_profiling: + if mm_max_tokens_per_item is not None and self.deploy_modality != DeployModality.TEXT: max_mm_tokens = max( mm_max_tokens_per_item.get("image", 0), mm_max_tokens_per_item.get("video", 0), diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index c32eddcf251..afb7095a449 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -184,12 +184,6 @@ class EngineArgs: """ Flags to enable multi-modal model """ - skip_mm_profiling: bool = False - """ - Skip multimodal token overhead when calculating max chunk tokens for profiling. - When enabled, get_max_chunk_tokens will not add extra mm tokens, - which avoids reserving extra GPU memory for multimodal inputs during profiling. - """ speculative_config: Optional[Dict[str, Any]] = None """ Configuration for speculative execution. 
@@ -804,12 +798,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.enable_mm, help="Flag to enable multi-modal model.", ) - model_group.add_argument( - "--skip-mm-profiling", - action="store_true", - default=EngineArgs.skip_mm_profiling, - help="Skip multimodal token overhead when calculating max chunk tokens for profiling.", - ) model_group.add_argument( "--reasoning-parser", type=str,