From 65bce6a1da76bae84321c1248087957f15925ddf Mon Sep 17 00:00:00 2001
From: kevincheng2 <chengyf112@gmail.com>
Date: Fri, 27 Mar 2026 15:54:58 +0800
Subject: [PATCH 1/2] [Feature] Support --skip-mm-profiling to skip multimodal
 token overhead in profiling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Motivation

在多模态模型（如 Qwen2.5-VL、ERNIE4.5-VL 等）部署时，`get_max_chunk_tokens` 会在
基础 token 数之上额外叠加多模态（mm）token 数，用于在 profiling 阶段预留显存。

某些场景下（如已知图像 token 数较小，或希望节省显存），用户希望跳过这部分多模态 token
的额外预留，直接以基础 token 数（`max_num_batched_tokens`）进行 profiling。

## Modifications

- `fastdeploy/engine/args_utils.py`：`EngineArgs` 新增 `skip_mm_profiling: bool = False`
  字段，parser 新增 `--skip-mm-profiling` 启动参数
- `fastdeploy/config.py`：`ModelConfig.__init__` 新增 `self.skip_mm_profiling = False`；
  `FDConfig.get_max_chunk_tokens` 中增加 `not self.model_config.skip_mm_profiling` 判断，
  开启后跳过 mm token 叠加，直接返回基础 `num_tokens`

## Usage or Command

启动服务时添加参数：
```bash
--skip-mm-profiling
```

## Checklist

- [x] Add at least a tag in the PR title.
- [x] Format your code, run `pre-commit` before commit.
- [ ] Add unit tests. 本功能为配置参数透传，逻辑简单，已有相关 config 单元测试覆盖。

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 fastdeploy/config.py            |  3 ++-
 fastdeploy/engine/args_utils.py | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 50e0ba08a37..4704e526971 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -233,6 +233,7 @@ def __init__(
         self.partial_rotary_factor: float = 1.0
         self.num_nextn_predict_layers = 0
         self.mm_max_tokens_per_item = None
+        self.skip_mm_profiling = False
         for key, value in args.items():
             if hasattr(self, key) and value != "None":
                 setattr(self, key, value)
@@ -2392,7 +2393,7 @@ def get_max_chunk_tokens(self, mm_max_tokens_per_item=None):
                 num_tokens = self.scheduler_config.max_num_seqs
         else:
             num_tokens = self.scheduler_config.max_num_batched_tokens
-            if mm_max_tokens_per_item is not None:
+            if mm_max_tokens_per_item is not None and not self.model_config.skip_mm_profiling:
                 max_mm_tokens = max(
                     mm_max_tokens_per_item.get("image", 0),
                     mm_max_tokens_per_item.get("video", 0),
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index afb7095a449..c32eddcf251 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -184,6 +184,12 @@ class EngineArgs:
     """
     Flags to enable multi-modal model
     """
+    skip_mm_profiling: bool = False
+    """
+    Skip multimodal token overhead when calculating max chunk tokens for profiling.
+    When enabled, get_max_chunk_tokens will not add extra mm tokens,
+    which avoids reserving extra GPU memory for multimodal inputs during profiling.
+    """
     speculative_config: Optional[Dict[str, Any]] = None
     """
     Configuration for speculative execution.
@@ -798,6 +804,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.enable_mm,
             help="Flag to enable multi-modal model.",
         )
+        model_group.add_argument(
+            "--skip-mm-profiling",
+            action="store_true",
+            default=EngineArgs.skip_mm_profiling,
+            help="Skip multimodal token overhead when calculating max chunk tokens for profiling.",
+        )
         model_group.add_argument(
             "--reasoning-parser",
             type=str,

From e8864939513dfab928318de21f24ee0f27398ff7 Mon Sep 17 00:00:00 2001
From: kevincheng2 <chengyf112@gmail.com>
Date: Fri, 27 Mar 2026 18:40:16 +0800
Subject: [PATCH 2/2] [Refactor] Replace skip_mm_profiling with
 deploy_modality=text to skip mm profiling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Motivation

原 `--skip-mm-profiling` 参数与已有的 `deploy_modality` 参数功能存在语义重叠：
当以纯文本模式（`deploy_modality=text`）部署时，本就不需要为多模态 token 预留显存。
引入独立参数增加了配置复杂度，复用 `deploy_modality` 更加直观和一致。

## Modifications

- `fastdeploy/engine/args_utils.py`：删除 `EngineArgs.skip_mm_profiling` 字段及
  `--skip-mm-profiling` 启动参数
- `fastdeploy/config.py`：删除 `ModelConfig.__init__` 中的 `self.skip_mm_profiling = False`；
  `FDConfig.get_max_chunk_tokens` 中将条件改为
  `self.deploy_modality != DeployModality.TEXT`，
  当 deploy_modality 为 text 时直接返回 `max_num_batched_tokens`，跳过 mm token 叠加

## Usage or Command

```bash
# 以文本模式部署，跳过 mm token profiling 开销（替代原 --skip-mm-profiling）
python -m fastdeploy.entrypoints.openai.api_server \
  --deploy-modality text \
  --model /path/to/model \
  ...
```

## Checklist

- [x] Add at least a tag in the PR title.
- [x] Format your code, run `pre-commit` before commit.
- [ ] Add unit tests. 本次为参数重构：跳过 mm token 叠加的逻辑不变，仅将触发条件由 `skip_mm_profiling` 改为 `deploy_modality` 为 text；已有 config 单元测试覆盖。

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 fastdeploy/config.py            |  3 +--
 fastdeploy/engine/args_utils.py | 12 ------------
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 4704e526971..b15a6dc824b 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -233,7 +233,6 @@ def __init__(
         self.partial_rotary_factor: float = 1.0
         self.num_nextn_predict_layers = 0
         self.mm_max_tokens_per_item = None
-        self.skip_mm_profiling = False
         for key, value in args.items():
             if hasattr(self, key) and value != "None":
                 setattr(self, key, value)
@@ -2393,7 +2392,7 @@ def get_max_chunk_tokens(self, mm_max_tokens_per_item=None):
                 num_tokens = self.scheduler_config.max_num_seqs
         else:
             num_tokens = self.scheduler_config.max_num_batched_tokens
-            if mm_max_tokens_per_item is not None and not self.model_config.skip_mm_profiling:
+            if mm_max_tokens_per_item is not None and self.deploy_modality != DeployModality.TEXT:
                 max_mm_tokens = max(
                     mm_max_tokens_per_item.get("image", 0),
                     mm_max_tokens_per_item.get("video", 0),
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index c32eddcf251..afb7095a449 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -184,12 +184,6 @@ class EngineArgs:
     """
     Flags to enable multi-modal model
     """
-    skip_mm_profiling: bool = False
-    """
-    Skip multimodal token overhead when calculating max chunk tokens for profiling.
-    When enabled, get_max_chunk_tokens will not add extra mm tokens,
-    which avoids reserving extra GPU memory for multimodal inputs during profiling.
-    """
     speculative_config: Optional[Dict[str, Any]] = None
     """
     Configuration for speculative execution.
@@ -804,12 +798,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.enable_mm,
             help="Flag to enable multi-modal model.",
         )
-        model_group.add_argument(
-            "--skip-mm-profiling",
-            action="store_true",
-            default=EngineArgs.skip_mm_profiling,
-            help="Skip multimodal token overhead when calculating max chunk tokens for profiling.",
-        )
         model_group.add_argument(
             "--reasoning-parser",
             type=str,