diff --git a/fastdeploy/engine/async_llm.py b/fastdeploy/engine/async_llm.py
index 3f99388d685..4afb3dc5c49 100644
--- a/fastdeploy/engine/async_llm.py
+++ b/fastdeploy/engine/async_llm.py
@@ -446,7 +446,7 @@ async def add_request(
                 )
             if envs.ZMQ_SEND_BATCH_DATA and self.connection_manager is not None:
                 request["zmq_worker_pid"] = self.connection_manager.worker_pid
-            if not envs.ENABLE_V1_DATA_PROCESSOR and self.cfg.model_config.enable_mm:
+            if self.cfg.model_config.enable_mm:
                 self.request_client.send_pyobj(request)
             else:
                 self.request_client.send_json(request)
@@ -543,8 +543,7 @@ async def generate(
                             )
                         else:
                             processed_output = response_item
-                        if not envs.ENABLE_V1_DATA_PROCESSOR:
-                            processed_output = RequestOutput.from_dict(processed_output)
+                        processed_output = RequestOutput.from_dict(processed_output)
                         # Enrich outputs with prompt metadata on the first packet
                         if req_id:
                             prompt_meta = self._prompt_metadata.get(req_id)
diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index 28776b53ede..a2e72a1c844 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -1165,7 +1165,7 @@ def _insert_zmq_task_to_scheduler(self):
         while self.running:
             try:
                 block = True if len(added_requests) == 0 else False
-                if not self.cfg.model_config.enable_mm and not envs.ENABLE_V1_DATA_PROCESSOR:
+                if not self.cfg.model_config.enable_mm:
                     err, data = self.recv_request_server.receive_json_once(block)
                 else:
                     err, data = self.recv_request_server.receive_pyobj_once(block)
@@ -1222,8 +1222,7 @@ def _insert_zmq_task_to_scheduler(self):
                         continue
                     err_msg = None
                     try:
-                        if not envs.ENABLE_V1_DATA_PROCESSOR:
-                            request = Request.from_dict(data)
+                        request = Request.from_dict(data)
                         request.metrics.scheduler_recv_req_time = time.time()
                         main_process_metrics.requests_number.inc()
                         trace_carrier = data.get("trace_carrier")
diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index 3f311e743fd..f03a18594de 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -437,7 +437,7 @@ async def add_requests(self, task):
     def _send_task(self, task):
         if envs.ZMQ_SEND_BATCH_DATA:
             task["zmq_worker_pid"] = self.worker_pid
-        if not self.enable_mm and not envs.ENABLE_V1_DATA_PROCESSOR:
+        if not self.enable_mm:
             self.zmq_client.send_json(task)
         else:
             if envs.FD_ENABLE_E2W_TENSOR_CONVERT:
@@ -599,7 +599,7 @@ async def run_control_method(self, request: ControlRequest):
         req_dict = request.to_dict()
         if envs.ZMQ_SEND_BATCH_DATA:
             req_dict["zmq_worker_pid"] = self.worker_pid
-        if not self.enable_mm and not envs.ENABLE_V1_DATA_PROCESSOR:
+        if not self.enable_mm:
             self.zmq_client.send_json(req_dict)
         else:
             self.zmq_client.send_pyobj(req_dict)
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 9d380b0db0c..09e06ffa0f6 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -26,7 +26,7 @@
 
 import fastdeploy.envs as envs
 import fastdeploy.metrics.trace as tracing
-from fastdeploy.engine.request import Request, RequestOutput
+from fastdeploy.engine.request import RequestOutput
 from fastdeploy.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -145,10 +145,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
             prompt_tokens = None
             max_tokens = None
             try:
-                if not envs.ENABLE_V1_DATA_PROCESSOR:
-                    current_req_dict = request.to_dict_for_infer(f"{request_id}_0")
-                else:
-                    current_req_dict = Request.from_generic_request(request, request_id=f"{request_id}_0")
+                current_req_dict = request.to_dict_for_infer(f"{request_id}_0")
                 if "chat_template" not in current_req_dict:
                     current_req_dict["chat_template"] = self.chat_template
                 current_req_dict["metrics"]["arrival_time"] = time.time()
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 4caf9fe210a..9c2b386fce0 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -27,7 +27,7 @@
 
 import fastdeploy.envs as envs
 import fastdeploy.metrics.trace as tracing
-from fastdeploy.engine.request import Request, RequestOutput
+from fastdeploy.engine.request import RequestOutput
 from fastdeploy.entrypoints.openai.protocol import (
     CompletionLogprobs,
     CompletionRequest,
@@ -178,10 +178,7 @@ async def create_completion(self, request: CompletionRequest):
             try:
                 for idx, prompt in enumerate(request_prompts):
                     request_id_idx = f"{request_id}_{idx}"
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        current_req_dict = request.to_dict_for_infer(request_id_idx, prompt)
-                    else:
-                        current_req_dict = Request.from_generic_request(request, request_id=f"{request_id}_0")
+                    current_req_dict = request.to_dict_for_infer(request_id_idx, prompt)
                     current_req_dict["metrics"]["arrival_time"] = time.time()
                     prompt_token_ids = await self.engine_client.format_and_add_data(current_req_dict)  # tokenize
                     if isinstance(prompt_token_ids, np.ndarray):
diff --git a/fastdeploy/entrypoints/openai/serving_embedding.py b/fastdeploy/entrypoints/openai/serving_embedding.py
index ec3223b3576..25f3f630510 100644
--- a/fastdeploy/entrypoints/openai/serving_embedding.py
+++ b/fastdeploy/entrypoints/openai/serving_embedding.py
@@ -15,20 +15,17 @@
 """
 
 import base64
-import time
 from collections.abc import AsyncGenerator
 from typing import Literal, Union
 
 import numpy as np
 from typing_extensions import assert_never, override
 
-import fastdeploy.envs as envs
 from fastdeploy.engine.pooling_params import PoolingParams
 from fastdeploy.engine.request import (
     EmbeddingOutput,
     EmbeddingRequestOutput,
     PoolingRequestOutput,
-    Request,
 )
 from fastdeploy.entrypoints.openai.protocol import (
     EmbeddingCompletionRequest,
@@ -69,25 +66,13 @@ def __init__(self, engine_client, models, cfg, pid, ips, max_waiting_time, chat_
     @override
     def _request_to_dict(self, ctx: ServeContext):
         request: EmbeddingRequest = ctx.request
-        if not envs.ENABLE_V1_DATA_PROCESSOR:
-            request_dict = super()._request_to_dict(ctx)
-            if hasattr(request, "to_pooling_params"):
-                pooling_params: PoolingParams = request.to_pooling_params()
-                pooling_params.verify("embed", self.cfg.model_config)
-                request_dict["pooling_params"] = pooling_params.to_dict()
-                request_dict["metrics"] = {}
-            return request_dict
-        else:
-            request_obj = None
-            if hasattr(request, "to_pooling_params"):
-                pooling_params: PoolingParams = request.to_pooling_params()
-                pooling_params.verify("embed", self.cfg.model_config)
-                request_obj = Request.from_generic_request(
-                    req=request, request_id=ctx.request_id, pooling_params=pooling_params
-                )
-                request_obj.metrics.arrival_time = time.time()
-                super()._process_chat_template_kwargs(request_obj)
-            return request_obj
+        request_dict = super()._request_to_dict(ctx)
+        if hasattr(request, "to_pooling_params"):
+            pooling_params: PoolingParams = request.to_pooling_params()
+            pooling_params.verify("embed", self.cfg.model_config)
+            request_dict["pooling_params"] = pooling_params.to_dict()
+            request_dict["metrics"] = {}
+        return request_dict
 
     @override
     def _request_to_batch_dicts(self, ctx: ServeContext):
diff --git a/fastdeploy/entrypoints/openai/serving_reward.py b/fastdeploy/entrypoints/openai/serving_reward.py
index cbde62deea5..cc3ed8a4729 100644
--- a/fastdeploy/entrypoints/openai/serving_reward.py
+++ b/fastdeploy/entrypoints/openai/serving_reward.py
@@ -14,14 +14,12 @@
 # limitations under the License.
 """
 
-import time
 from collections.abc import AsyncGenerator
 
 from typing_extensions import override
 
-import fastdeploy.envs as envs
 from fastdeploy.engine.pooling_params import PoolingParams
-from fastdeploy.engine.request import PoolingRequestOutput, Request, RewardRequestOutput
+from fastdeploy.engine.request import PoolingRequestOutput, RewardRequestOutput
 from fastdeploy.entrypoints.openai.protocol import (
     ChatRewardData,
     ChatRewardRequest,
@@ -46,25 +44,13 @@ def __init__(self, engine_client, models, cfg, pid, ips, max_waiting_time, chat_
     @override
     def _request_to_dict(self, ctx: ServeContext):
         request: ChatRewardRequest = ctx.request
-        if not envs.ENABLE_V1_DATA_PROCESSOR:
-            request_dict = super()._request_to_dict(ctx)
-            if hasattr(request, "to_pooling_params"):
-                pooling_params: PoolingParams = request.to_pooling_params()
-                pooling_params.verify("reward", self.cfg.model_config)
-                request_dict["pooling_params"] = pooling_params.to_dict()
-                request_dict["metrics"] = {}
-            return request_dict
-        else:
-            request_obj: Request = None
-            if hasattr(request, "to_pooling_params"):
-                pooling_params: PoolingParams = request.to_pooling_params()
-                pooling_params.verify("reward", self.cfg.model_config)
-                request_obj = Request.from_generic_request(
-                    req=request, request_id=ctx.request_id, pooling_params=pooling_params
-                )
-                request_obj.metrics.arrival_time = time.time()
-                super()._process_chat_template_kwargs(request_obj)
-            return request_obj
+        request_dict = super()._request_to_dict(ctx)
+        if hasattr(request, "to_pooling_params"):
+            pooling_params: PoolingParams = request.to_pooling_params()
+            pooling_params.verify("reward", self.cfg.model_config)
+            request_dict["pooling_params"] = pooling_params.to_dict()
+            request_dict["metrics"] = {}
+        return request_dict
 
     @override
     def _request_to_batch_dicts(self, ctx: ServeContext):
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index 72cd6dc7c48..fef58eaf6cc 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -94,8 +94,6 @@ def _validate_split_kv_size(value: int) -> int:
     "EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"),
     # enable kv cache block scheduler v1 (no need for kv_cache_ratio)
     "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
-    # enable data processor v2
-    "ENABLE_V1_DATA_PROCESSOR": lambda: int(os.getenv("ENABLE_V1_DATA_PROCESSOR", "0")),
     # set prealloc block num for decoder
     "FD_ENC_DEC_BLOCK_NUM": lambda: int(os.getenv("FD_ENC_DEC_BLOCK_NUM", "2")),
     # enbale max prefill of one execute step
diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py
index 04c028d9060..56bbe1296f7 100644
--- a/fastdeploy/input/preprocess.py
+++ b/fastdeploy/input/preprocess.py
@@ -19,7 +19,6 @@
 from fastdeploy.config import ErnieArchitectures, ModelConfig
 from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
 from fastdeploy.reasoning import ReasoningParserManager
-from fastdeploy.utils import envs
 from fastdeploy.utils import llm_logger as logger
 
 
@@ -83,10 +82,7 @@ def create_processor(self):
             logger.info(f"Plugin input processor not available ({e}), using built-in processor")
             if not self.model_config.enable_mm:
                 if not ErnieArchitectures.contains_ernie_arch(architecture):
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.text_processor import DataProcessor
-                    else:
-                        from fastdeploy.input.v1.text_processor import DataProcessor
+                    from fastdeploy.input.text_processor import DataProcessor
 
                     self.processor = DataProcessor(
                         model_name_or_path=self.model_name_or_path,
@@ -94,14 +90,7 @@ def create_processor(self):
                         tool_parser_obj=tool_parser_obj,
                     )
                 else:
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.ernie4_5_processor import (
-                            Ernie4_5Processor,
-                        )
-                    else:
-                        from fastdeploy.input.v1.ernie4_5_processor import (
-                            Ernie4_5Processor,
-                        )
+                    from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
 
                     self.processor = Ernie4_5Processor(
                         model_name_or_path=self.model_name_or_path,
@@ -110,14 +99,9 @@ def create_processor(self):
                     )
             else:
                 if ErnieArchitectures.contains_ernie_arch(architecture):
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.ernie4_5_vl_processor import (
-                            Ernie4_5_VLProcessor,
-                        )
-                    else:
-                        from fastdeploy.input.v1.ernie4_5_vl_processor import (
-                            Ernie4_5_VLProcessor,
-                        )
+                    from fastdeploy.input.ernie4_5_vl_processor import (
+                        Ernie4_5_VLProcessor,
+                    )
 
                     self.processor = Ernie4_5_VLProcessor(
                         model_name_or_path=self.model_name_or_path,
@@ -128,14 +112,9 @@ def create_processor(self):
                         enable_processor_cache=self.enable_processor_cache,
                     )
                 elif "PaddleOCRVL" in architecture:
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.paddleocr_vl_processor import (
-                            PaddleOCRVLProcessor,
-                        )
-                    else:
-                        from fastdeploy.input.v1.paddleocr_vl_processor import (
-                            PaddleOCRVLProcessor,
-                        )
+                    from fastdeploy.input.paddleocr_vl_processor import (
+                        PaddleOCRVLProcessor,
+                    )
 
                     self.processor = PaddleOCRVLProcessor(
                         config=self.model_config,
@@ -145,12 +124,7 @@ def create_processor(self):
                         reasoning_parser_obj=reasoning_parser_obj,
                     )
                 elif "Qwen2_5_VL" in architecture:
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
-                    else:
-                        from fastdeploy.input.v1.qwen_vl_processor import (
-                            QwenVLProcessor,
-                        )
+                    from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
 
                     self.processor = QwenVLProcessor(
                         config=self.model_config,
@@ -161,12 +135,7 @@ def create_processor(self):
                         enable_processor_cache=self.enable_processor_cache,
                     )
                 elif "Qwen3VL" in architecture:
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor
-                    else:
-                        from fastdeploy.input.v1.qwen3_vl_processor import (
-                            Qwen3VLProcessor,
-                        )
+                    from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor
 
                     self.processor = Qwen3VLProcessor(
                         config=self.model_config,
diff --git a/fastdeploy/input/v1/__init__.py b/fastdeploy/input/v1/__init__.py
deleted file mode 100644
index f4ede90624a..00000000000
--- a/fastdeploy/input/v1/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-"""
-# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
diff --git a/fastdeploy/input/v1/ernie4_5_processor.py b/fastdeploy/input/v1/ernie4_5_processor.py
deleted file mode 100644
index f6545dc068a..00000000000
--- a/fastdeploy/input/v1/ernie4_5_processor.py
+++ /dev/null
@@ -1,641 +0,0 @@
-"""
-# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import os
-
-import numpy as np
-from paddleformers.generation import GenerationConfig
-
-from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-from fastdeploy.input.v1.text_processor import BaseDataProcessor
-from fastdeploy.utils import data_processor_logger
-
-_SAMPLING_EPS = 1e-5
-from fastdeploy.input.utils import process_stop_token_ids
-
-
-class Ernie4_5Processor(BaseDataProcessor):
-    """
-    初始化模型实例。
-
-    Args:
-        model_name_or_path (str): 模型名称或路径。
-
-    Attributes:
-        model_name_or_path (str): 存储模型名称或路径。
-        decode_status (dict): 存储解码状态信息。
-        tokenizer (object): 存储分词器实例。
-        eos_token_ids (list): 存储结束符号的token ID列表。
-        eos_token_id_len (int): 存储结束符号的token ID列表的长度。
-        pad_token_id (int): 存储填充符号的token ID。
-    """
-
-    def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None):
-
-        self.model_name_or_path = model_name_or_path
-        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
-
-        # Generation config
-        try:
-            self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path)
-        except Exception as e:
-            data_processor_logger.warning(
-                f"Can't find generation config, so it will not use "
-                f"generation_config field in the model config, details={e}"
-            )
-            self.generation_config = None
-
-        self.decode_status = dict()
-        self.tool_parser_dict = dict()
-        self.thinking_parser_dict = dict()
-        self.model_status_dict = dict()
-        self._load_tokenizer()
-        data_processor_logger.info(
-            f"tokenizer information: bos_token is {self.tokenizer.bos_token} \
-                                   {self.tokenizer.bos_token_id}, \
-                                   eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
-        )
-        try:
-            from paddleformers.trl.llm_utils import get_eos_token_id
-        except Exception:
-            from paddleformers.cli.utils.llm_utils import get_eos_token_id
-
-        self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config)
-        self.eos_token_id_len = len(self.eos_token_ids)
-        self.pad_token_id = self.get_pad_id()
-        self.reasoning_parser = None
-        self.tool_parser_obj = tool_parser_obj
-        if reasoning_parser_obj:
-            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
-
-    def process_request(self, request, max_model_len=None, **kwargs):
-        """
-        Preprocess the request
-
-        Args:
-            request (Dict): may contain text and messages fields
-
-        Returns:
-            bool: Whether preprocessing is successful
-            str: error message
-        """
-        data_processor_logger.info(f"Start processing request: {request}")
-        request = self._apply_default_parameters(request)
-        if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
-            request.eos_token_ids = self.eos_token_ids
-
-        # processing stop_sequences and stop_token_ids
-        process_stop_token_ids(request, self.update_stop_seq)
-
-        # processing bad_words
-        bad_words = request.get("bad_words")
-        bad_words_token_ids = request.get("bad_words_token_ids")
-        if bad_words:
-            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
-            request["bad_words_token_ids"] = bad_words_token_ids
-
-        logits_processors_args = self._prepare_think_stop_sentence(
-            request.get("logits_processors_args") or {}, max_model_len
-        )
-        request["logits_processors_args"] = logits_processors_args
-
-        # processing prompt_token_ids
-        if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
-            if request.prompt is not None:
-                # prompt = request.prompt if request.prompt is not None else request.messages[0]
-                prompt = request.prompt
-                assert isinstance(prompt, str) or (
-                    isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
-                ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
-
-                if isinstance(prompt, list):  # if prompt is a token id list
-                    request.prompt_token_ids = prompt
-                else:
-                    tokens = self.tokenizer.tokenize(prompt)
-                    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-                    request.prompt_token_ids = token_ids
-                    data_processor_logger.debug(
-                        f"request_ids: {request.request_id}, prompt: {prompt}, "
-                        f"tokens: {tokens}, token_ids: {token_ids}"
-                    )
-            elif request.messages is not None:
-                task = request.to_dict()
-                chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
-                if chat_template_kwargs:
-                    if isinstance(chat_template_kwargs, dict):
-                        for k, v in chat_template_kwargs.items():
-                            if k not in task or task[k] is None:
-                                task[k] = v
-                    else:
-                        raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-                request.prompt_token_ids = self.messages2ids(task, **chat_template_kwargs)
-            else:
-                raise ValueError(f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
-
-        if len(request.prompt_token_ids) == 0:
-            raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
-
-        # truncate prompts that exceed the length limit
-        if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
-            request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
-        logits_processors_args = self._update_thinking_prompt_state(
-            request.prompt_token_ids, request.get("logits_processors_args") or {}
-        )
-        request["logits_processors_args"] = logits_processors_args
-        max_tokens = max_model_len - len(request.prompt_token_ids)
-        if request.get("max_tokens") is None:
-            request.set("max_tokens", max(1, max_tokens))
-        else:
-            request.set("max_tokens", min(max_tokens, request.get("max_tokens")))
-        if request.get("temperature") < _SAMPLING_EPS:
-            # zero temperature is equivalent to greedy sampling
-            request.set("temperature", 1)
-            request.set("top_k", 1)
-        if request.get("top_p") < _SAMPLING_EPS:
-            request.set("top_p", _SAMPLING_EPS)
-            request.set("top_k", 1)
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
-            parts = request.request_id.split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.get("n", 1)
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request.request_id] = model_status
-            request.enable_thinking = model_status == "think_start"
-        if request.get("response_max_tokens") is not None and request.enable_thinking is False:
-            request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
-
-        data_processor_logger.info(f"Processed request: {request}")
-        return request
-
-    def process_request_dict(self, request, max_model_len=None, **kwargs):
-        """
-        Preprocess the request
-
-        Args:
-            request Request: may contain text and messages fields
-
-        Returns:
-            bool: Whether preprocessing is successful
-            str: error message
-        """
-        data_processor_logger.info(f"Start processing request: {request}")
-        request = self._apply_default_parameters(request)
-        if not request.eos_token_ids:
-            request.eos_token_ids = self.eos_token_ids
-
-        # processing stop_sequences and stop_token_ids
-        process_stop_token_ids(request, self.update_stop_seq)
-
-        # processing bad_words
-        bad_words = request.sampling_params.bad_words
-        bad_words_token_ids = request.sampling_params.bad_words_token_ids
-        if bad_words:
-            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
-            request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
-        logits_processors_args = self._prepare_think_stop_sentence(
-            getattr(request.sampling_params, "logits_processors_args", None) or {}, max_model_len
-        )
-        request.sampling_params.logits_processors_args = logits_processors_args
-
-        # processing prompt_token_ids
-        if not request.prompt_token_ids:
-            if request.prompt:
-                prompt = request.prompt
-                assert isinstance(prompt, str) or (
-                    isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
-                ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
-                if isinstance(prompt, list):  # if prompt is a token id list
-                    request.prompt_token_ids = prompt
-                else:
-                    request.prompt_tokens = prompt
-                    tokens = self.tokenizer.tokenize(prompt)
-                    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-                    request.prompt_token_ids = token_ids
-                    req_id = request.request_id
-                    data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
-            elif request.messages:
-                chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
-                if not chat_template_kwargs:
-                    chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
-                if chat_template_kwargs:
-                    if isinstance(chat_template_kwargs, dict):
-                        for k, v in chat_template_kwargs.items():
-                            if not getattr(request, k, None):
-                                setattr(request, k, v)
-                    else:
-                        raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-                    if getattr(request, "enable_thinking") is None:
-                        setattr(request, "enable_thinking", True)
-                request.prompt_token_ids = self.messages2ids(request, **chat_template_kwargs)
-                delattr(request, "chat_template_kwargs")
-            else:
-                raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
-
-        if len(request.prompt_token_ids) == 0:
-            raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
-
-        # truncate prompts that exceed the length limit
-        if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
-            request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
-        logits_processors_args = self._update_thinking_prompt_state(
-            request.prompt_token_ids, getattr(request.sampling_params, "logits_processors_args", None) or {}
-        )
-        request.sampling_params.logits_processors_args = logits_processors_args
-        max_tokens = max_model_len - len(request.prompt_token_ids)
-        if getattr(request.sampling_params, "max_tokens", None) is None:
-            request.sampling_params.max_tokens = max(1, max_tokens)
-        else:
-            request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
-        if request.sampling_params.temperature < _SAMPLING_EPS:
-            # zero temperature is equivalent to greedy sampling
-            request.sampling_params.temperature = 1
-            request.sampling_params.top_k = 1
-        if request.sampling_params.top_p < _SAMPLING_EPS:
-            request.sampling_params.top_p = _SAMPLING_EPS
-            request.sampling_params.top_k = 1
-
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
-            parts = request.request_id.split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.sampling_params.n or 1
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request.request_id] = model_status
-            request.enable_thinking = model_status == "think_start"
-        if request.sampling_params.response_max_tokens is not None and request.enable_thinking is False:
-            request.sampling_params.max_tokens = min(
-                request.sampling_params.response_max_tokens, request.sampling_params.max_tokens
-            )
-
-        data_processor_logger.info(f"Processed request: {request}")
-        return request
-
-    def process_response(self, response_dict, **kwargs):
-        """
-        Preprocess the response
-
-        Args:
-            response_dict (Dict): response for engine, contain ids fields
-
-        Returns:
-            Dict: response contain text fields
-        """
-        req_id = response_dict.request_id
-        token_ids = response_dict.outputs.token_ids
-
-        response_dict.usage = {"completion_tokens": response_dict.outputs.index + 1}
-        if token_ids[-1] == self.tokenizer.eos_token_id:
-            token_ids = token_ids[:-1]
-        full_text = self.tokenizer.decode(token_ids)
-        if self.reasoning_parser:
-            reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                full_text,
-                response_dict,
-                self.model_status_dict[req_id],
-            )
-            response_dict.outputs.text = text
-            response_dict.outputs.reasoning_content = reasoning_content
-        else:
-            response_dict.outputs.text = full_text
-        if self.tool_parser_obj:
-            tool_parser = self.tool_parser_obj(self.tokenizer)
-            tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
-            if tool_call_info.tools_called:
-                response_dict.outputs.tool_calls = tool_call_info.tool_calls
-                response_dict.outputs.text = tool_call_info.content
-        if req_id in self.model_status_dict:
-            del self.model_status_dict[req_id]
-        data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
-        if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
-            return None
-        return response_dict
-
-    def process_response_dict(self, response_dict, stream, **kwargs):
-        """
-        Preprocess the response
-
-        Args:
-            response_dict (Dict): response for engine, contain ids fields
-
-        Returns:
-            Dict: response contain text fields
-        """
-        if stream:
-            return self.process_response_obj_streaming(response_dict, **kwargs)
-        else:
-            return self.process_response_obj_normal(response_dict, **kwargs)
-
-    def process_response_obj_normal(self, response_obj, **kwargs):
-        """
-        Preprocess the response
-
-        Args:
-            response_obj : response for engine, contain ids fields
-
-        Returns:
-            Dict: response contain text fields
-        """
-        token_ids = response_obj.outputs.token_ids
-        is_end = response_obj.finished
-        req_id = response_obj.request_id
-        request = kwargs.get("request", None)
-        if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"):
-            if token_ids[-1] == self.tokenizer.eos_token_id:
-                token_ids = token_ids[:-1]
-        delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
-        if is_end:
-            full_text = previous_texts + delta_text
-            response_obj.outputs.text = full_text
-            if self.reasoning_parser:
-                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                    full_text,
-                    request,
-                    self.model_status_dict[req_id],
-                )
-                response_obj.outputs.text = text
-                response_obj.outputs.reasoning_content = reasoning_content
-                reasoning_tokens = self.tokenizer.tokenize(reasoning_content)
-                response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
-            if self.tool_parser_obj:
-                tool_parser = self.tool_parser_obj(self.tokenizer)
-                tool_call_info = tool_parser.extract_tool_calls(full_text, request)
-                if tool_call_info.tools_called:
-                    response_obj.outputs.tool_calls = tool_call_info.tool_calls
-                    response_obj.outputs.text = tool_call_info.content
-            response_obj.outputs.completion_tokens = full_text
-            data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
-            del self.decode_status[req_id]
-            if req_id in self.model_status_dict:
-                del self.model_status_dict[req_id]
-        return response_obj
-
-    def process_response_obj_streaming(self, response_obj, **kwargs):
-        """
-        Preprocess the response streaming
-
-        Args:
-            response_obj : response for engine, contain ids fields
-
-        Returns:
-            Dict: response contain text fields
-        """
-        token_ids = response_obj.outputs.token_ids
-        is_end = response_obj.finished
-        req_id = response_obj.request_id
-        request = kwargs.get("request", None)
-
-        if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"):
-            if token_ids[-1] == self.tokenizer.eos_token_id:
-                token_ids = token_ids[:-1]
-        delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
-        response_obj.outputs.completion_tokens = delta_text
-        if self.reasoning_parser:
-            reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
-                previous_texts,
-                previous_texts + delta_text,
-                delta_text,
-                previous_token_ids,
-                previous_token_ids + token_ids,
-                token_ids,
-                self.model_status_dict[req_id],
-            )
-            response_obj.outputs.delta_message = reasoning_delta_message
-            reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
-            reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
-            response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
-            response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
-            response_obj.outputs.reasoning_content = reasoning_content
-            response_obj.outputs.text = (
-                reasoning_delta_message.content or ""
-                if reasoning_delta_message and hasattr(reasoning_delta_message, "content")
-                else ""
-            )
-        else:
-            response_obj.outputs.text = delta_text
-        if self.tool_parser_obj:
-            if req_id not in self.tool_parser_dict:
-                self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer)
-            tool_parser = self.tool_parser_dict[req_id]
-            tool_call_delta_message = tool_parser.extract_tool_calls_streaming(
-                previous_texts,
-                previous_texts + delta_text,
-                delta_text,
-                previous_token_ids,
-                previous_token_ids + token_ids,
-                token_ids,
-                request,
-            )
-            if tool_call_delta_message is None or tool_call_delta_message.tool_calls:
-                response_obj.outputs.delta_message = tool_call_delta_message
-
-        if is_end:
-            data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
-            del self.decode_status[req_id]
-            if req_id in self.tool_parser_dict:
-                del self.tool_parser_dict[req_id]
-            if req_id in self.model_status_dict:
-                del self.model_status_dict[req_id]
-        return response_obj
-
-    def messages2ids(self, request_or_messages, **kwargs):
-        """
-        Convert multi-turn messages into ID sequences.
-
-        Args:
-            request_or_messages: Either a request dict containing 'messages' field,
-                                or a list of message dicts directly
-
-        Returns:
-            List of token IDs as strings (converted from token objects)
-        """
-        if self.tokenizer.chat_template is None:
-            raise ValueError("This model does not support chat_template.")
-        message_dict = {
-            key: getattr(request_or_messages, key, None)
-            for key in ["messages", "tools", "documents", "enable_thinking", "system"]
-            if getattr(request_or_messages, key, None) is not None
-        }
-        spliced_message = self.tokenizer.apply_chat_template(
-            message_dict,
-            tokenize=False,
-            split_special_tokens=False,
-            add_special_tokens=False,
-            **kwargs,
-        )
-        request_or_messages.prompt_tokens = spliced_message
-        req_id = getattr(request_or_messages, "request_id", None)
-        tokens = self.tokenizer.tokenize(spliced_message)
-        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-        data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
-        return token_ids
-
-    def ids2tokens(self, token_id, task_id):
-        """
-        token ids to strings
-
-        Args:
-            token_ids (List[int]): token ids
-                        task_id (str): task id
-
-        Returns:
-            List[str]: strings
-        """
-
-        if task_id not in self.decode_status:
-            # prefix offset & read offset & history token ids & history token strings
-            self.decode_status[task_id] = [0, 0, [], ""]
-
-        prefix_offset = self.decode_status[task_id][0]
-        read_offset = self.decode_status[task_id][1]
-        previous_token_ids = self.decode_status[task_id][2]
-        previous_texts = self.decode_status[task_id][3]
-        decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
-            previous_token_ids + token_id, prefix_offset, read_offset
-        )
-        self.decode_status[task_id][0] = prefix_offset
-        self.decode_status[task_id][1] = read_offset
-        self.decode_status[task_id][2] += token_id
-        self.decode_status[task_id][3] += decode_str
-
-        return decode_str, previous_token_ids, previous_texts
-
-    def _load_tokenizer(self):
-        """
-        load tokenizer
-
-        Returns:
-            tokenizer (AutoTokenizer)
-        """
-        vocab_file_names = [
-            "tokenizer.model",
-            "spm.model",
-            "ernie_token_100k.model",
-        ]
-        for i in range(len(vocab_file_names)):
-            if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
-                Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
-                break
-        self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
-
-    def get_pad_id(self):
-        """
-        get pad_token_id, if not pad_token_id, use eos_token
-
-        Returns:
-            int: pad_token_id
-        """
-        # if isinstance(self.tokenizer, (LlamaTokenizer, Llama3Tokenizer)) and not self.tokenizer.pad_token_id:
-        #     return self.tokenizer.eos_token
-        return self.tokenizer.pad_token_id
-
-    def pad_batch_data(
-        self,
-        insts,
-        pad_id=0,
-        return_seq_len=False,
-        return_array=True,
-        pad_style="right",
-    ):
-        """Pad the instances to the max sequence length in batch."""
-        if len(insts) == 0:
-            padded_insts = np.array([[]], dtype=np.int64) if return_array else [[]]
-            if return_seq_len:
-                seq_len = np.array([], dtype=np.int64) if return_array else []
-                return padded_insts, seq_len
-            return padded_insts
-
-        max_len = max(map(len, insts))
-        if pad_style == "left":
-            padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts]
-        else:
-            padded_insts = [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts]
-        if return_array:
-            padded_insts = np.array(padded_insts, dtype=np.int64).reshape([-1, max_len])
-
-        if return_seq_len:
-            seq_len = [len(inst) for inst in insts]
-            if return_array:
-                seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1)
-            return padded_insts, seq_len
-        return padded_insts
-
-    def update_stop_seq(self, stop_sequences):
-        """
-        Update stop sequences from request.
-        """
-        stop_seqs = []
-        if isinstance(stop_sequences, str):
-            stop_sequences = [stop_sequences]
-        for seq in stop_sequences:
-            if seq != self.tokenizer.eos_token_id:
-                stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq)))
-        stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False)
-        data_processor_logger.debug(f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
-        return stop_seqs, stop_seqs_len
-
-    def process_logprob_response(self, token_ids, **kwargs):
-        full_text = self.tokenizer.decode(token_ids, **kwargs)
-        return full_text
-
-    def update_bad_words(self, bad_words, bad_words_token_ids):
-        """Support bad words"""
-
-        token_ids = bad_words_token_ids
-
-        if token_ids is None:
-            token_ids = []
-        for bad_word in bad_words:
-            # To prohibit words both at the beginning
-            # and in the middle of text
-            # (related to add_prefix_space tokenizer parameter)
-            for add_prefix_space in [False, True]:
-                prefix = " " if add_prefix_space else ""
-                prompt = prefix + bad_word.lstrip()
-                prompt_token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(prompt))
-                data_processor_logger.debug(f"processed bad_words: {prompt}, {prompt_token_ids}")
-
-                if len(prompt_token_ids) != 1:
-                    if not add_prefix_space:
-                        data_processor_logger.warning(
-                            f"Skip bad_words: <{prompt}>."
-                            f"Bad words should be a single token."
-                            f"Got tokens: {prompt_token_ids}."
-                        )
-                    continue
-
-                if prompt_token_ids[0] > self.tokenizer.vocab_size:
-                    if not add_prefix_space:
-                        data_processor_logger.warning(
-                            f"Skip bad_words: <{prompt}>."
-                            f"All token id values should be satisfying:"
-                            f" 0 <= token_id < {self.tokenizer.vocab_size}."
-                            f"Got token: {prompt_token_ids}."
-                        )
-                    continue
-
-                if prompt_token_ids not in token_ids:
-                    token_ids.extend(prompt_token_ids)
-        return token_ids
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/__init__.py b/fastdeploy/input/v1/ernie4_5_vl_processor/__init__.py
deleted file mode 100644
index f7d30a78d58..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .ernie4_5_vl_processor import Ernie4_5_VLProcessor
-from .process import DataProcessor, fancy_print
-from .process_video import read_video_decord
-from .utils.video_utils import VideoReaderWrapper
-
-__all__ = [
-    "DataProcessor",
-    "fancy_print",
-    "VideoReaderWrapper",
-    "read_video_decord",
-    "Ernie4_5_VLProcessor",
-]
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/v1/ernie4_5_vl_processor/ernie4_5_vl_processor.py
deleted file mode 100644
index bae80b60a96..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ /dev/null
@@ -1,340 +0,0 @@
-"""
-# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import traceback
-
-import numpy as np
-from paddleformers.generation import GenerationConfig
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
-from fastdeploy.input.v1.ernie4_5_processor import Ernie4_5Processor
-from fastdeploy.utils import data_processor_logger
-
-from .process import DataProcessor
-
-_SAMPLING_EPS = 1e-5
-
-
-class Ernie4_5_VLProcessor(Ernie4_5Processor):
-    """The processor class for ERNIE MoE VL models."""
-
-    def __init__(
-        self,
-        model_name_or_path,
-        limit_mm_per_prompt=None,
-        mm_processor_kwargs=None,
-        reasoning_parser_obj=None,
-        tool_parser_obj=None,
-        enable_processor_cache=False,
-    ):
-        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
-        tokenizer_path = model_name_or_path
-        preprocessor_path = model_name_or_path
-        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
-
-        self.ernie4_5_processor = DataProcessor(
-            tokenizer_name=tokenizer_path,
-            image_preprocessor_name=preprocessor_path,
-            enable_processor_cache=enable_processor_cache,
-            **processor_kwargs,
-        )
-        self.ernie4_5_processor.eval()
-        self.image_patch_id = self.ernie4_5_processor.image_patch_id
-        self.spatial_conv_size = self.ernie4_5_processor.spatial_conv_size
-
-        self.tool_parser_dict = dict()
-        self.decode_status = dict()
-        self.model_status_dict = dict()
-        self._load_tokenizer()
-
-        # Generation config
-        try:
-            self.generation_config = GenerationConfig.from_pretrained(model_name_or_path)
-        except Exception as e:
-            data_processor_logger.warning(
-                f"Can't find generation config: {e}, so it will not use generation_config field in the model config"
-            )
-            self.generation_config = None
-
-        # self.eos_token_ids = [self.tokenizer.eos_token_id]
-        try:
-            from paddleformers.trl.llm_utils import get_eos_token_id
-        except Exception:
-            from paddleformers.cli.utils.llm_utils import get_eos_token_id
-
-        self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config)
-        self.eos_token_id_len = len(self.eos_token_ids)
-        self.pad_token_id = self.get_pad_id()
-        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
-        self.reasoning_parser = None
-        if reasoning_parser_obj:
-            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
-        self.tool_parser_obj = tool_parser_obj
-
-    def get_pad_id(self):
-        """get pad id"""
-        return self.tokenizer.pad_token_id
-
-    def _load_tokenizer(self):
-        """
-        load tokenizer
-
-        Returns:
-            tokenizer (AutoTokenizer)
-        """
-        self.tokenizer = self.ernie4_5_processor.tokenizer
-
-    def _apply_default_parameters(self, request):
-        """
-        Apply default value for parameters in request
-        """
-
-        def set_value(req, key, value):
-            value = getattr(self.generation_config, key, value)
-            if getattr(req.sampling_params, key) is None:
-                setattr(req.sampling_params, key, value)
-
-        set_value(request, "top_p", 0.7)
-        set_value(request, "temperature", 1.0)
-        set_value(request, "repetition_penalty", 1.0)
-        set_value(request, "frequency_penalty", 0.0)
-        set_value(request, "presence_penalty", 0.0)
-        return request
-
-    def _parse_processor_kwargs(self, kwargs):
-        """解析多模态处理器参数配置"""
-        if not kwargs:
-            return {}
-
-        try:
-            if not isinstance(kwargs, dict):
-                raise ValueError("mm-processor-kwargs must be a dictionary")
-
-            # 验证参数类型
-            data_processor_logger.info(f"kwargs:{kwargs}")
-            expected_types = {
-                "spatial_conv_size": int,
-                "temporal_conv_size": int,
-                "image_min_pixels": int,
-                "image_max_pixels": int,
-                "video_min_pixels": int,
-                "video_max_pixels": int,
-                "video_target_frames": int,
-                "video_frames_sample": str,
-                "video_max_frames": int,
-                "video_min_frames": int,
-                "video_fps": int,
-            }
-
-            for key, value in kwargs.items():
-                if key in expected_types and not isinstance(value, expected_types[key]):
-                    raise ValueError(
-                        f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
-                    )
-
-            return kwargs
-
-        except Exception as e:
-            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}, {str(traceback.format_exc())}")
-            return {}
-
-    def _parse_limits(self, limits):
-        """解析多模态限制配置"""
-        DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}
-
-        if not limits:
-            return DEFAULT_LIMITS
-
-        try:
-            if not isinstance(limits, dict):
-                raise ValueError("limit-mm-per-prompt must be a dictionary")
-            data_processor_logger.info(f"_parse_limits:{limits}")
-            return {**DEFAULT_LIMITS, **limits}
-        except Exception as e:
-            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
-            return DEFAULT_LIMITS
-
-    def _check_mm_limits(self, item):
-        if isinstance(item, dict):
-            # 请求包含prompt和multi_modal_data
-            mm_data = item
-        else:
-            # 请求包含messages
-            mm_data = {"image": [], "video": []}
-
-            for message in item:
-                if isinstance(message.get("content"), list):
-                    for part in message["content"]:
-                        if part.get("type") == "image":
-                            mm_data["image"].append(part)
-                        elif part.get("type") == "video":
-                            mm_data["video"].append(part)
-
-        for modality, data in mm_data.items():
-            if modality in self.limit_mm_per_prompt:
-                limit = self.limit_mm_per_prompt[modality]
-                if len(data) > limit:
-                    raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
-
-    def process_request(self, request, max_model_len=None, **kwargs):
-        """process the input data"""
-        task = request.to_dict()
-        task["chat_template_kwargs"] = kwargs.get("chat_template_kwargs")
-        self.process_request_dict(task, max_model_len)
-        request = Request.from_dict(task)
-        request = self._apply_default_parameters(request)
-
-        return request
-
-    def process_request_dict(self, request, max_model_len=None, **kwargs):
-        """process the input data"""
-
-        request = self._apply_default_parameters(request)
-        if not request.eos_token_ids:
-            request.eos_token_ids = self.eos_token_ids
-
-        # processing stop_sequences and stop_token_ids
-        process_stop_token_ids(request, self.update_stop_seq)
-
-        bad_words = request.sampling_params.bad_words
-        bad_words_token_ids = request.sampling_params.bad_words_token_ids
-        if bad_words:
-            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
-            request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
-        logits_processors_args = self._prepare_think_stop_sentence(
-            getattr(request.sampling_params, "logits_processors_args", None) or {}, max_model_len
-        )
-        request.sampling_params.logits_processors_args = logits_processors_args
-
-        if request.prompt_token_ids:
-            messages = request.messages
-            if messages:
-                self._check_mm_limits(messages)
-            if getattr(request, "enable_thinking") is None:
-                setattr(request, "enable_thinking", True)
-            outputs = self.ernie4_5_processor.prompt_token_ids2outputs(request)
-        elif request.prompt:
-            multimodal_data = request.multimodal_data
-            if multimodal_data is None:
-                multimodal_data = {}
-            self._check_mm_limits(multimodal_data)
-            images = multimodal_data.get("image", None)
-            videos = multimodal_data.get("video", None)
-            request.prompt_tokens = request.prompt
-            outputs = self.ernie4_5_processor.text2ids(request.prompt, images, videos)
-        elif request.messages:
-            messages = request.messages
-            self._check_mm_limits(messages)
-            chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
-            if not chat_template_kwargs:
-                chat_template_kwargs = request.chat_template_kwargs
-            if chat_template_kwargs:
-                if isinstance(chat_template_kwargs, dict):
-                    for k, v in chat_template_kwargs.items():
-                        if getattr(request, k, None) is None:
-                            setattr(request, k, v)
-                else:
-                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-                if getattr(request, "enable_thinking") is None:
-                    setattr(request, "enable_thinking", True)
-            outputs = self.ernie4_5_processor.request2ids(request)
-            delattr(request, "chat_template_kwargs")
-        else:
-            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
-        if request.completion_token_ids:
-            self.append_completion_tokens(outputs, request.completion_token_ids)
-
-        outputs = self.pack_outputs(outputs)
-        request.prompt_token_ids = (
-            outputs["input_ids"].tolist()
-            if not getattr(request, "prompt_token_ids", None)
-            else request.prompt_token_ids
-        )
-        request.prompt_token_ids_len = len(request.prompt_token_ids)
-        request.multimodal_inputs = outputs
-
-        # 截断超过长度限制的prompt
-        if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
-            request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
-        logits_processors_args = self._update_thinking_prompt_state(
-            request.prompt_token_ids, getattr(request.sampling_params, "logits_processors_args", None) or {}
-        )
-        request.sampling_params.logits_processors_args = logits_processors_args
-
-        max_tokens = max_model_len - len(request.prompt_token_ids)
-        if getattr(request.sampling_params, "max_tokens", None) is None:
-            request.sampling_params.max_tokens = max(1, max_tokens)
-        else:
-            request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
-        if request.sampling_params.reasoning_max_tokens is None:
-            request.sampling_params.reasoning_max_tokens = max(int(request.sampling_params.max_tokens * 0.8), 1)
-            request.reasoning_max_tokens = request.sampling_params.reasoning_max_tokens
-        data_processor_logger.info(f"Processed request {request}")
-
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
-            parts = request.request_id.split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.sampling_params.n or 1
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request.request_id] = model_status
-            request.enable_thinking = model_status == "think_start"
-        if request.sampling_params.top_p is not None and request.sampling_params.top_p < _SAMPLING_EPS:
-            request.sampling_params.top_p = _SAMPLING_EPS
-            request.sampling_params.top_k = 1
-        if request.sampling_params.response_max_tokens is not None and request.enable_thinking is False:
-            request.sampling_params.max_tokens = min(
-                request.sampling_params.response_max_tokens, request.sampling_params.max_tokens
-            )
-        return request
-
-    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
-        "append already completion tokens"
-
-        num_tokens = len(completion_token_ids)
-        multimodal_inputs["input_ids"].extend(completion_token_ids)
-        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
-
-        start = multimodal_inputs["cur_position"]
-        for i in range(num_tokens):
-            multimodal_inputs["position_ids"].append([start + i] * 3)
-        multimodal_inputs["cur_position"] += num_tokens
-
-    def pack_outputs(self, outs):
-        # Stack or nullify image-related fields
-        if not outs["images"]:
-            outs["images"] = None
-            outs["grid_thw"] = None
-            outs["image_type_ids"] = None
-        else:
-            outs["images"] = np.vstack(outs["images"])
-            outs["grid_thw"] = np.vstack(outs["grid_thw"])
-            outs["image_type_ids"] = np.array(outs["image_type_ids"])
-
-        outs["image_patch_id"] = self.image_patch_id
-        # Convert lists to arrays
-        outs["input_ids"] = np.array(outs["input_ids"], dtype=np.int64)
-        outs["token_type_ids"] = np.array(outs["token_type_ids"], dtype=np.int64)
-        outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64)
-        outs["mm_num_token_func"] = self.ernie4_5_processor.mm_num_tokens
-        return outs
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/__init__.py b/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/__init__.py
deleted file mode 100644
index c11444e6758..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .get_image_preprocessor import get_image_preprocessor
-from .image_preprocessor_adaptive import AdaptiveImageProcessor
-
-__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"]
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py b/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py
deleted file mode 100644
index 0ff6f7d1ed5..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-"""get image preprocessor"""
-
-from fastdeploy.utils import data_processor_logger
-
-from .image_preprocessor_adaptive import AdaptiveImageProcessor
-
-
-def get_image_preprocessor(args):
-    """
-    get_image_preprocessor from args
-    """
-
-    if args.vision_model_name_or_path is None:
-        return None
-
-    data_processor_logger.info("use AdaptiveImageProcessor")
-    image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
-    return image_preprocess
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py b/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py
deleted file mode 100644
index 6dcdf3a4e96..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py
+++ /dev/null
@@ -1,587 +0,0 @@
-"""
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-"""image preprocessor adaptive"""
-
-import math
-from typing import List, Optional, Union
-
-import numpy as np
-import paddle
-import PIL
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_transforms import (
-    convert_to_rgb,
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
-)
-from paddleformers.transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    get_image_size,
-    infer_channel_dimension_format,
-    is_valid_image,
-    make_list_of_images,
-    to_numpy_array,
-    valid_images,
-)
-from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
-from PIL import Image
-
-from fastdeploy.utils import data_processor_logger
-
-OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
-OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
-
-IMAGE_FACTOR = 28
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-MAX_RATIO = 200
-
-
-VideoInput = Union[
-    List["PIL.Image.Image"],
-    "np.ndarray",
-    "paddle.Tensor",
-    List["np.ndarray"],
-    List["paddle.Tensor"],
-    List[List["PIL.Image.Image"]],
-    List[List["np.ndarrray"]],
-    List[List["paddle.Tensor"]],
-]
-
-
-__all__ = [
-    "AdaptiveImageProcessor",
-]
-
-
-def is_scaled_image(image: np.ndarray) -> bool:
-    """
-    Checks to see whether the pixel values have already been rescaled to [0, 1].
-    """
-    if image.dtype == np.uint8:
-        return False
-
-    # It's possible the image has pixel values in [0, 255] but is of floating type
-    return np.min(image) >= 0 and np.max(image) <= 1
-
-
-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
-    Args:
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-
-    Returns:
-        list: A list of images.
-    """
-    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
-        return [img for img_list in images for img in img_list]
-
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-
-    elif is_valid_image(images):
-        return [images]
-
-    raise ValueError(f"Could not make batched images from {images}")
-
-
-# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
-def make_batched_videos(videos) -> List[VideoInput]:
-    """dummy"""
-    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
-        return videos
-
-    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
-        if isinstance(videos[0], Image.Image):
-            return [videos]
-        elif len(videos[0].shape) == 4:
-            return [list(video) for video in videos]
-
-    elif is_valid_image(videos) and len(videos.shape) == 4:
-        return [list(videos)]
-
-    raise ValueError(f"Could not make batched video from {videos}")
-
-
-class AdaptiveImageProcessor(BaseImageProcessor):
-    r"""
-    Constructs a adaptive image processor that dynamically resizes images based on the original images.
-
-    Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the image's (height, width) dimensions.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
-            Resampling filter to use when resizing the image.
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`.
-        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image.
-        do_normalize (`bool`, *optional*, defaults to `True`):
-            Whether to normalize the image.
-        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
-            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
-        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
-            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
-            in the image.
-        do_convert_rgb (`bool`, *optional*, defaults to `True`):
-            Whether to convert the image to RGB.
-        min_pixels (`int`, *optional*, defaults to `56 * 56`):
-            The min pixels of the image to resize the image.
-        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
-            The max pixels of the image to resize the image.
-        patch_size (`int`, *optional*, defaults to 14):
-            The spacial patch size of the vision encoder.
-        temporal_conv_size (`int`, *optional*, defaults to 2):
-            The temporal conv size in resampler.
-        merge_size (`int`, *optional*, defaults to 2):
-            The merge size of the vision encoder to llm encoder.
-    """
-
-    model_input_names = [
-        "pixel_values",
-        "image_grid_thw",
-        "pixel_values_videos",
-        "video_grid_thw",
-    ]
-
-    def __init__(
-        self,
-        do_resize: bool = True,
-        resample: PILImageResampling = PILImageResampling.BICUBIC,
-        do_rescale: bool = True,
-        rescale_factor: float = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = True,
-        min_pixels: int = 56 * 56,
-        max_pixels: int = 28 * 28 * 1280,
-        patch_size: int = 14,
-        temporal_conv_size: int = 2,
-        merge_size: int = 2,
-        **kwargs,
-    ) -> None:
-        """init"""
-        super().__init__(**kwargs)
-        self.do_resize = do_resize
-        self.resample = resample
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
-        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
-        self.min_pixels = min_pixels
-        self.max_pixels = max_pixels
-        self.patch_size = patch_size
-        self.temporal_conv_size = temporal_conv_size
-        self.merge_size = merge_size
-        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
-        self.do_convert_rgb = do_convert_rgb
-
-    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
-        """设定pixels"""
-        if min_pixels is not None:
-            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
-            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
-            self.min_pixels = min_pixels
-            self.size["min_pixels"] = int(min_pixels)
-        if max_pixels is not None:
-            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
-            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
-            self.max_pixels = max_pixels
-            self.size["max_pixels"] = int(max_pixels)
-
-    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
-        """dummy"""
-        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
-        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor=self.patch_size * self.merge_size,
-            min_pixels=actual_min_pixels,
-            max_pixels=actual_max_pixels,
-        )
-        return (resized_height, resized_width), (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-
-    def _preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        do_resize: bool = True,
-        resample: PILImageResampling = None,
-        do_rescale: bool = True,
-        rescale_factor: float = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = False,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        predetermined_grid_thw=None,
-    ):
-        """
-        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
-
-        Args:
-            images (`ImageInput`):
-                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
-                If pixel values range from 0 to 1, set `do_rescale=False`.
-            vision_info (`List[Dict]`, *optional*):
-                Optional list of dictionaries containing additional information about vision inputs.
-            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
-                Whether to resize the image.
-            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
-                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
-            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
-                Whether to rescale the image.
-            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
-                Scale factor to use if rescaling the image.
-            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
-                Whether to normalize the image.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
-                Mean to use if normalizing the image.
-                Can be a float or a list of floats corresponding to the number of channels in the image.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
-                Standard deviation to use if normalizing the image.
-                Can be a float or a list of floats corresponding to the number of channels in the image.
-            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to convert the image to RGB.
-            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
-                The channel dimension format for the output image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - Unset: Use the channel dimension format of the input image.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-        """
-        images = make_list_of_images(images)
-
-        if do_convert_rgb:
-            images = [convert_to_rgb(image) for image in images]
-
-        # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
-
-        if is_scaled_image(images[0]) and do_rescale:
-            data_processor_logger.warning(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
-            )
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images[0])
-
-        height, width = get_image_size(images[0], channel_dim=input_data_format)
-        resized_height, resized_width = height, width
-        processed_images = []
-
-        if predetermined_grid_thw is not None:
-            assert len(predetermined_grid_thw) == len(
-                images
-            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
-
-        for img_idx, image in enumerate(images):
-            if do_resize:
-                if predetermined_grid_thw is not None:
-                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
-                    resized_height *= self.patch_size
-                    resized_width *= self.patch_size
-                else:
-                    resized_height, resized_width = smart_resize(
-                        height,
-                        width,
-                        factor=self.patch_size * self.merge_size,
-                        min_pixels=self.min_pixels,
-                        max_pixels=self.max_pixels,
-                    )
-                image = image.astype("uint8")  # TODO : 需要手动加上，否则多除255 导致结果会出错
-                # 直接fromarray，不要靠paddleformers里面的
-                image = Image.fromarray(image)
-                image = resize(
-                    image,
-                    size=(resized_height, resized_width),
-                    resample=resample,
-                    data_format=input_data_format,
-                )
-            if do_rescale:
-                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
-
-            if do_normalize:
-                image = normalize(
-                    image=image,
-                    mean=image_mean,
-                    std=image_std,
-                    data_format=input_data_format,
-                )
-
-            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
-
-            processed_images.append(image)
-        patches = np.array(processed_images)
-        if data_format == ChannelDimension.LAST:
-            patches = patches.transpose([0, 3, 1, 2])
-
-        channel = patches.shape[1]  # [time, C, H, W]
-        grid_t = patches.shape[0]
-        grid_h, grid_w = (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-        patches = patches.reshape(
-            [
-                grid_t,
-                channel,
-                grid_h // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-                grid_w // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-            ]
-        )
-        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
-        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
-
-        flatten_patches = patches.reshape(
-            [
-                grid_t * grid_h * grid_w,
-                channel * self.patch_size * self.patch_size,
-            ]
-        )  # [grid_t * grid_h * grid_w, C * psz * psz]
-
-        return flatten_patches, (grid_t, grid_h, grid_w)
-
-    def preprocess(
-        self,
-        images: ImageInput,
-        videos: VideoInput = None,
-        do_resize: bool = True,
-        size: Optional[Union[int, List[int]]] = None,
-        resample: PILImageResampling = None,
-        do_rescale: bool = True,
-        rescale_factor: float = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = False,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = None,
-        predetermined_grid_thw=None,
-    ):
-        """
-        Args:
-            images (`ImageInput`):
-                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
-                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
-            videos (`VideoInput`):
-                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
-                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
-            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
-                Whether to resize the image.
-            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
-                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
-                the longest edge resized to keep the input aspect ratio.
-            resample (`int`, *optional*, defaults to `self.resample`):
-                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
-                has an effect if `do_resize` is set to `True`.
-            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
-                Whether to rescale the image.
-            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
-                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
-            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
-                Whether to normalize the image.
-            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
-                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
-            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
-                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
-                `True`.
-            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
-                Whether to convert the image to RGB.
-            return_tensors (`str` or `TensorType`, *optional*):
-                The type of tensors to return. Can be one of:
-                - Unset: Return a list of `np.ndarray`.
-                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
-                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
-            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
-                The channel dimension format for the output image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - Unset: Use the channel dimension format of the input image.
-            input_data_format (`ChannelDimension` or `str`, *optional*):
-                The channel dimension format for the input image. If unset, the channel dimension format is inferred
-                from the input image. Can be one of:
-                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
-                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
-                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-
-        """
-        do_resize = do_resize if do_resize is not None else self.do_resize
-        size = size if size is not None else self.size
-        resample = resample if resample is not None else self.resample
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-
-        if images is not None:
-            images = make_batched_images(images)
-        if videos is not None:
-            videos = make_batched_videos(videos)
-
-        if images is not None and not valid_images(images):
-            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
-
-        if images is not None:
-            pixel_values, vision_grid_thws = [], []
-            for img_idx, image in enumerate(images):
-                if predetermined_grid_thw is not None:
-                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
-                else:
-                    predetermined_grid_thw_one = None
-                patches, image_grid_thw = self._preprocess(
-                    image,
-                    do_resize=do_resize,
-                    resample=resample,
-                    do_rescale=do_rescale,
-                    rescale_factor=rescale_factor,
-                    do_normalize=do_normalize,
-                    image_mean=image_mean,
-                    image_std=image_std,
-                    data_format=data_format,
-                    do_convert_rgb=do_convert_rgb,
-                    input_data_format=input_data_format,
-                    predetermined_grid_thw=predetermined_grid_thw_one,
-                )
-                pixel_values.extend(patches)
-                vision_grid_thws.append(image_grid_thw)
-            pixel_values = np.array(pixel_values)
-            vision_grid_thws = np.array(vision_grid_thws)
-            data = {
-                "pixel_values": pixel_values,
-                "image_grid_thw": vision_grid_thws,
-            }
-
-        if videos is not None:
-            pixel_values, vision_grid_thws = [], []
-            for images in videos:
-                patches, video_grid_thw = self._preprocess(
-                    images,
-                    do_resize=do_resize,
-                    resample=resample,
-                    do_rescale=do_rescale,
-                    rescale_factor=rescale_factor,
-                    do_normalize=do_normalize,
-                    image_mean=image_mean,
-                    image_std=image_std,
-                    data_format=data_format,
-                    do_convert_rgb=do_convert_rgb,
-                    input_data_format=input_data_format,
-                    predetermined_grid_thw=predetermined_grid_thw,
-                )
-                pixel_values.extend(patches)
-                vision_grid_thws.append(video_grid_thw)
-            pixel_values = np.array(pixel_values)
-            vision_grid_thws = np.array(vision_grid_thws)
-
-            data = {
-                "pixel_values_videos": pixel_values,
-                "video_grid_thw": vision_grid_thws,
-            }
-
-        return BatchFeature(data=data, tensor_type=return_tensors)
-
-
-def round_by_factor(number: int, factor: int) -> int:
-    """Returns the closest integer to 'number' that is divisible by 'factor'."""
-    return round(number / factor) * factor
-
-
-def ceil_by_factor(number: int, factor: int) -> int:
-    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-    return math.ceil(number / factor) * factor
-
-
-def floor_by_factor(number: int, factor: int) -> int:
-    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-    return math.floor(number / factor) * factor
-
-
-def smart_resize(
-    height: int,
-    width: int,
-    factor: int = IMAGE_FACTOR,
-    min_pixels: int = MIN_PIXELS,
-    max_pixels: int = MAX_PIXELS,
-):
-    """
-    Rescales the image so that the following conditions are met:
-
-    1. Both dimensions (height and width) are divisible by 'factor'.
-
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-    3. The aspect ratio of the image is maintained as closely as possible.
-    """
-    if max(height, width) / min(height, width) > MAX_RATIO:
-        if height > width:
-            new_width = max(factor, round_by_factor(width, factor))
-            new_height = floor_by_factor(new_width * MAX_RATIO, factor)
-        else:
-            new_height = max(factor, round_by_factor(height, factor))
-            new_width = floor_by_factor(new_height * MAX_RATIO, factor)
-
-        data_processor_logger.info(
-            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
-              resize to {max(new_height, new_width) / min(new_height, new_width)}"
-        )
-
-        height = new_height
-        width = new_width
-
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-
-    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
-        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
-
-    return h_bar, w_bar
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/process.py b/fastdeploy/input/v1/ernie4_5_vl_processor/process.py
deleted file mode 100644
index d8b90e54d57..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/process.py
+++ /dev/null
@@ -1,751 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-""" process.py """
-import copy
-import os
-import pickle
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import paddle
-import zmq
-from paddleformers.transformers.image_utils import ChannelDimension
-from PIL import Image
-
-from fastdeploy.engine.request import ImagePosition, Request
-from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.multimodal.hasher import MultimodalHasher
-from fastdeploy.utils import data_processor_logger
-
-from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
-from .process_video import read_frames_decord, read_video_decord
-from .utils.render_timestamp import render_frame_timestamp
-
-
-def fancy_print(input_ids, tokenizer, image_patch_id=None):
-    """
-    input_ids: input_ids
-    tokenizer: the tokenizer of models
-    """
-    i = 0
-    res = ""
-    text_ids = []
-    real_image_token_len = 0
-    while i < len(input_ids):
-        if input_ids[i] == image_patch_id:
-            if len(text_ids) > 0:
-                res += tokenizer.decode(text_ids)
-                text_ids = []
-
-            real_image_token_len += 1
-        else:
-            if real_image_token_len != 0:
-                res += f"<|IMAGE@{real_image_token_len}|>"
-                real_image_token_len = 0
-
-            text_ids.append(input_ids[i])
-
-        i += 1
-    if len(text_ids) > 0:
-
-        res += tokenizer.decode(text_ids)
-        text_ids = []
-    return res
-
-
-class DataProcessor(MMBaseDataProcessor):
-    """
-    Processes multimodal chat messages into model-ready inputs,
-    handling text, images, and videos with 3D positional embeddings.
-    """
-
-    CLS_TOKEN = "<|begin_of_sentence|>"
-    SEP_TOKEN = "<|end_of_sentence|>"
-    EOS_TOKEN = "</s>"
-    IMG_START = "<|IMAGE_START|>"
-    IMG_END = "<|IMAGE_END|>"
-    VID_START = "<|VIDEO_START|>"
-    VID_END = "<|VIDEO_END|>"
-
-    def __init__(
-        self,
-        tokenizer_name: str,
-        image_preprocessor_name: str,
-        enable_processor_cache: bool = False,
-        spatial_conv_size: int = 2,
-        temporal_conv_size: int = 2,
-        image_min_pixels: int = 4 * 28 * 28,
-        image_max_pixels: int = 6177 * 28 * 28,
-        video_min_pixels: int = 299 * 28 * 28,
-        video_max_pixels: int = 1196 * 28 * 28,
-        video_target_frames: int = -1,
-        video_frames_sample: str = "leading",
-        video_max_frames: int = 180,
-        video_min_frames: int = 16,
-        video_fps: int = 2,
-        **kwargs,
-    ) -> None:
-        super().__init__()
-        # Tokenizer and image preprocessor
-        self.model_name_or_path = tokenizer_name
-        self._load_tokenizer()
-        self.tokenizer.ignored_index = -100
-        self.image_preprocessor = AdaptiveImageProcessor.from_pretrained(image_preprocessor_name)
-        self.enable_processor_cache = enable_processor_cache
-
-        # Convolution sizes for patch aggregation
-        self.spatial_conv_size = spatial_conv_size
-        self.temporal_conv_size = temporal_conv_size
-
-        # Pixel constraints
-        self.image_min_pixels = image_min_pixels
-        self.image_max_pixels = image_max_pixels
-        self.video_min_pixels = video_min_pixels
-        self.video_max_pixels = video_max_pixels
-
-        # Video sampling parameters
-        self.target_frames = video_target_frames
-        self.frames_sample = video_frames_sample
-        self.max_frames = video_max_frames
-        self.min_frames = video_min_frames
-        self.fps = video_fps
-
-        # Special tokens and IDs
-        self.cls_token = self.CLS_TOKEN
-        self.sep_token = self.SEP_TOKEN
-        self.eos_token = self.EOS_TOKEN
-        self.image_start = self.IMG_START
-        self.image_end = self.IMG_END
-        self.video_start = self.VID_START
-        self.video_end = self.VID_END
-        self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")
-        self.image_start_id = self.tokenizer.convert_tokens_to_ids(self.image_start)
-        self.image_end_id = self.tokenizer.convert_tokens_to_ids(self.image_end)
-        self.video_start_id = self.tokenizer.convert_tokens_to_ids(self.video_start)
-        self.video_end_id = self.tokenizer.convert_tokens_to_ids(self.video_end)
-        self.sep_token_id = self.tokenizer.convert_tokens_to_ids(self.sep_token)
-        self.eos_token_id = self.tokenizer.convert_tokens_to_ids(self.eos_token)
-
-        self.token_type_mapping = self._build_token_type_mapping()
-        self.is_training = True
-        self.role_prefixes = {
-            "system": "",
-            "user": "User: ",
-            "bot": "Assistant: ",
-            "assistant": "Assistant: ",
-            "tool": "Tool: ",
-        }
-
-    @staticmethod
-    def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
-        """
-        Calculate the number of tokens in the multimodal input.
-        """
-        if isinstance(grid_thw, paddle.Tensor):
-            grid_thw = grid_thw.numpy()
-
-        if len(grid_thw) == 0:
-            return 0
-
-        def calc_one(thw):
-            t, h, w = map(int, thw)
-            if t == 1:
-                return t * h * w // 4
-            else:
-                return t * h * w // 4 // 2
-
-        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
-            return [calc_one(x) for x in grid_thw]
-
-        return calc_one(grid_thw)
-
-    def _build_token_type_mapping(self) -> Dict[Any, int]:
-        mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
-        for token in (
-            self.IMG_START,
-            self.IMG_END,
-            self.VID_START,
-            self.VID_END,
-        ):
-            mapping[token] = IDS_TYPE_FLAG["image"]
-        mapping[self.image_patch_id] = IDS_TYPE_FLAG["image"]
-        return mapping
-
-    def train(self) -> None:
-        """Enable training mode (produces labels)."""
-        self.is_training = True
-
-    def eval(self) -> None:
-        """Enable evaluation mode (doesn't produce labels)."""
-        self.is_training = False
-
-    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
-        """
-        Convert chat text into model inputs.
-
-        Args:
-            text (str): The chat text containing placeholders for images and videos.
-            images (list, optional): List of images to be processed and inserted at image placeholders.
-            videos (list, optional): List of videos to be processed and inserted at video placeholders.
-            image_uuid (list, optional): List of unique identifiers for each image, used for caching or hashing.
-            video_uuid (list, optional): List of unique identifiers for each video, used for caching or hashing.
-        Returns:
-            dict: A dictionary with keys input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels, etc.
-        """
-
-        outputs = {
-            "input_ids": [],
-            "token_type_ids": [],
-            "position_ids": [],
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "labels": [],
-            "cur_position": 0,
-            "video_cnt": 0,
-            "num_input_image_tokens": 0,
-            "num_input_video_tokens": 0,
-            "mm_positions": [],
-            "mm_hashes": [],
-        }
-
-        IMAGE_PLACEHOLDER = "<|image@placeholder|>"
-        VIDEO_PLACEHOLDER = "<|video@placeholder|>"
-        IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
-        VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
-        st, image_idx, video_idx = 0, 0, 0
-        while st < len(text):
-            image_pos = text.find(IMAGE_PLACEHOLDER, st)
-            image_pos = len(text) if image_pos == -1 else image_pos
-            video_pos = text.find(VIDEO_PLACEHOLDER, st)
-            video_pos = len(text) if video_pos == -1 else video_pos
-            ed = min(image_pos, video_pos)
-
-            self._add_text(text[st:ed], outputs)
-            if ed == len(text):
-                break
-
-            if ed == image_pos:
-                image = images[image_idx]
-                uuid = image_uuid[image_idx] if image_uuid else None
-                if not isinstance(image, tuple):
-                    self._add_image(image, outputs, uuid)
-                else:
-                    # cached images are already processed
-                    self._add_processed_image(image, outputs, uuid)
-                image_idx += 1
-                st = ed + IMAGE_PLACEHOLDER_LEN
-            else:
-                item = videos[video_idx]
-                uuid = video_uuid[video_idx] if video_uuid else None
-                if not isinstance(item, tuple):
-                    if isinstance(item, dict):
-                        frames = self._load_and_process_video(item["video"], item)
-                    else:
-                        frames = self._load_and_process_video(item, {})
-                    self._add_video(frames, outputs, uuid)
-                else:
-                    # cached frames are already processed
-                    self._add_processed_video(item, outputs, uuid)
-                video_idx += 1
-                st = ed + VIDEO_PLACEHOLDER_LEN
-
-        return outputs
-
-    def extract_mm_items(self, request: Request):
-        messages = parse_chat_messages(request.messages)
-        mm_items = []
-        for msg in messages:
-            role = msg.get("role")
-            assert role in self.role_prefixes, f"Unsupported role: {role}"
-            content = msg.get("content")
-            if not isinstance(content, list):
-                content = [content]
-            for item in content:
-                if item.get("type") in ["image", "video"]:
-                    mm_items.append(item)
-
-        missing_hashes, missing_idx = [], []
-        for idx, item in enumerate(mm_items):
-            if not item.get("data"):
-                # raw data not provided, should be retrieved from processor cache
-                missing_hashes.append(item.get("uuid"))
-                missing_idx.append(idx)
-
-        if len(missing_hashes) > 0 and not self.enable_processor_cache:
-            raise ValueError("Missing items cannot be retrieved without processor cache.")
-
-        dealer = None
-        if self.enable_processor_cache:
-            context = zmq.Context()
-            dealer = context.socket(zmq.DEALER)
-            dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
-            missing_items = self.get_processor_cache(dealer, missing_hashes)
-            for idx in range(len(missing_items)):
-                if not missing_items[idx]:
-                    raise ValueError(f"Missing item {idx} not found in processor cache")
-                mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
-        images, videos = [], []
-        image_uuid, video_uuid = [], []
-        for item in mm_items:
-            if item.get("type") == "image":
-                images.append(item["data"])
-                image_uuid.append(item["uuid"])
-            elif item.get("type") == "video":
-                videos.append(item["data"])
-                video_uuid.append(item["uuid"])
-            else:
-                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
-        return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items
-
-    def request2ids(
-        self, request: Request, tgts: List[str] = None
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
-        """
-        Convert chat messages into model inputs.
-        Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
-        """
-        images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
-
-        if self.tokenizer.chat_template is None:
-            raise ValueError("This model does not support chat template.")
-
-        chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
-        message_dict = {
-            key: getattr(request, key, None)
-            for key in ["messages", "tools", "documents", "enable_thinking", "system"]
-            if getattr(request, key, None) is not None
-        }
-        prompt = self.tokenizer.apply_chat_template(
-            message_dict,
-            tokenize=False,
-            add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True,
-            **chat_template_kwargs,
-        )
-        request.prompt_tokens = prompt
-
-        outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
-
-        if self.enable_processor_cache:
-            missing_idx = set(missing_idx)
-            hashes_to_cache, items_to_cache = [], []
-            for idx in range(len(mm_items)):
-                if idx in missing_idx:
-                    continue
-                meta = {}
-                t, h, w = outputs["grid_thw"][idx][0]
-                meta["thw"] = (t, h, w)
-                hashes_to_cache.append(outputs["mm_hashes"][idx])
-                items_to_cache.append((outputs["images"][idx], meta))
-            self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
-        if self.is_training:
-            assert tgts, "Training must give tgt"
-            self._extract_labels(outputs, tgts)
-
-        return outputs
-
-    def prompt_token_ids2outputs(
-        self, request: Request, tgts: List[str] = None
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
-        outputs = {
-            "input_ids": [],
-            "token_type_ids": [],
-            "position_ids": [],
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "labels": [],
-            "cur_position": 0,
-            "video_cnt": 0,
-            "num_input_image_tokens": 0,
-            "num_input_video_tokens": 0,
-            "mm_positions": [],
-            "mm_hashes": [],
-        }
-        prompt_token_ids = request.prompt_token_ids if request.prompt_token_ids else []
-        prompt_token_ids_len = len(prompt_token_ids)
-        if not request.messages:
-            outputs["input_ids"].extend(prompt_token_ids)
-            outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * prompt_token_ids_len)
-            for i in range(prompt_token_ids_len):
-                outputs["position_ids"].append([i] * 3)
-            outputs["cur_position"] += prompt_token_ids_len
-            return outputs
-        images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
-        st, image_idx, video_idx = 0, 0, 0
-        while st < prompt_token_ids_len:
-            cur_token_id = prompt_token_ids[st]
-            if cur_token_id == self.image_start_id:
-                if image_idx >= len(images):
-                    raise ValueError("prompt token ids has more image placeholder than in messages")
-                # append image_start_id
-                outputs["input_ids"].extend([cur_token_id])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st += 1
-                # process placeholder token ids
-                cur_idx = st
-                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_end_id:
-                    cur_idx += 1
-                if cur_idx >= prompt_token_ids_len:
-                    raise ValueError("image token ids not complete")
-                image = images[image_idx]
-                uuid = image_uuid[image_idx] if image_uuid else None
-                token_len = cur_idx - st
-                if not isinstance(image, tuple):
-                    self._add_image(image, outputs, uuid, token_len)
-                else:
-                    self._add_processed_image(image, outputs, uuid, token_len)
-                image_idx += 1
-                st = cur_idx
-            elif cur_token_id == self.video_start_id:
-                if video_idx >= len(videos):
-                    raise ValueError("prompt token ids has more video placeholder than in messages")
-                # append video_start_id
-                outputs["input_ids"].extend([cur_token_id])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st += 1
-                # process placeholder token ids
-                cur_idx = st
-                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.video_end_id:
-                    cur_idx += 1
-                if cur_idx >= prompt_token_ids_len:
-                    raise ValueError("video token ids not complete")
-                video = videos[video_idx]
-                uuid = video_uuid[video_idx] if video_uuid else None
-                token_len = cur_idx - st
-                if not isinstance(video, tuple):
-                    if isinstance(video, dict):
-                        frames = self._load_and_process_video(video["video"], video)
-                    else:
-                        frames = self._load_and_process_video(video, {})
-                    self._add_video(frames, outputs, uuid, token_len)
-                else:
-                    self._add_processed_video(video, outputs, uuid, token_len)
-                video_idx += 1
-                st = cur_idx
-            else:
-                outputs["input_ids"].extend([cur_token_id])
-                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
-                outputs["position_ids"].append([outputs["cur_position"]] * 3)
-                outputs["cur_position"] += 1
-                st += 1
-        if image_idx != len(images):
-            raise ValueError("number of images does not match")
-        if video_idx != len(videos):
-            raise ValueError("number of videos does not match")
-
-        if self.enable_processor_cache:
-            missing_idx = set(missing_idx)
-            hashes_to_cache, items_to_cache = [], []
-            for idx in range(len(mm_items)):
-                if idx in missing_idx:
-                    continue
-                meta = {}
-                t, h, w = outputs["grid_thw"][idx][0]
-                meta["thw"] = (t, h, w)
-                hashes_to_cache.append(outputs["mm_hashes"][idx])
-                items_to_cache.append((outputs["images"][idx], meta))
-            self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
-        return outputs
-
-    def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:
-        token_id = token if isinstance(token, int) else self.tokenizer.convert_tokens_to_ids(token)
-        outputs["input_ids"].append(token_id)
-        outputs["token_type_ids"].append(self.token_type_mapping[token])
-        pos = outputs["cur_position"]
-        outputs["position_ids"].append([pos] * 3)
-        outputs["cur_position"] += 1
-
-    def _add_text(self, tokens, outputs: Dict) -> None:
-        if isinstance(tokens, str):
-            tokens = self.tokenizer.encode(tokens, add_special_tokens=False)["input_ids"]
-        outputs["input_ids"].extend(tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))
-
-        start = outputs["cur_position"]
-        for i in range(len(tokens)):
-            outputs["position_ids"].append([start + i] * 3)
-        outputs["cur_position"] += len(tokens)
-
-    def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
-        patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
-            img.height,
-            img.width,
-            min_pixels=self.image_min_pixels,
-            max_pixels=self.image_max_pixels,
-        )[1]
-        num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
-        if token_len and token_len != num_tokens:
-            raise ValueError("image tokens num not match the size")
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-        outputs["num_input_image_tokens"] += num_tokens
-
-        pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
-        outputs["position_ids"].extend(pos_ids)
-        outputs["cur_position"] = np.max(pos_ids) + 1
-
-        # Preprocess pixels
-        ret = self.image_preprocessor.preprocess(
-            images=[img.convert("RGB")],
-            do_normalize=False,
-            do_rescale=False,
-            predetermined_grid_thw=np.array([[patches_h, patches_w]]),
-            do_convert_rgb=True,
-            input_data_format=ChannelDimension.LAST,
-        )
-        outputs["images"].append(ret["pixel_values"])
-        if not uuid:
-            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-        else:
-            outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(ret["image_grid_thw"])
-        outputs["image_type_ids"].append(0)
-
-    def _add_processed_image(
-        self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None
-    ) -> None:
-        img, meta = img_cache
-        num_tokens = img.shape[0] // (self.spatial_conv_size**2)
-        if token_len and num_tokens != token_len:
-            raise ValueError("image tokens num not match the size")
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
-        _, h, w = meta["thw"]
-        pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
-        outputs["position_ids"].extend(pos_ids)
-        outputs["cur_position"] = np.max(pos_ids) + 1
-
-        outputs["images"].append(img)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[1, h, w]]))
-        outputs["image_type_ids"].append(0)
-
-    def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
-        patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
-            frames[0].height,
-            frames[0].width,
-            min_pixels=self.video_min_pixels,
-            max_pixels=self.video_max_pixels,
-        )[1]
-        num_frames = len(frames)
-        num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
-        if token_len and num_tokens != token_len:
-            raise ValueError("video tokens num not match the size")
-
-        pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-        ret = self.image_preprocessor.preprocess(
-            images=None,
-            videos=pixel_stack,
-            do_normalize=False,
-            do_rescale=False,
-            predetermined_grid_thw=np.array([[patches_h, patches_w]] * num_frames),
-            do_convert_rgb=True,
-            input_data_format=ChannelDimension.LAST,
-        )
-        outputs["images"].append(ret["pixel_values_videos"])
-        if not uuid:
-            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
-        else:
-            outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(ret["video_grid_thw"])
-        outputs["image_type_ids"].extend([1] * num_frames)
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["num_input_video_tokens"] += num_tokens
-
-        pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
-        outputs["position_ids"].extend(pos_ids)
-        outputs["cur_position"] = np.max(pos_ids) + 1
-
-    def _add_processed_video(
-        self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None
-    ) -> None:
-        frames, meta = frames_cache
-        num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
-        if token_len and num_tokens != token_len:
-            raise ValueError("video tokens num not match the size")
-
-        t, h, w = meta["thw"]
-        outputs["images"].append(frames)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[t, h, w]]))
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["image_type_ids"].extend([1] * t)
-
-        pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
-        outputs["position_ids"].extend(pos_ids)
-        outputs["cur_position"] = np.max(pos_ids) + 1
-
-    def _extract_labels(self, outputs: Dict, tgts: List[str]) -> None:
-        input_ids = copy.deepcopy(outputs["input_ids"])
-        labels = [self.tokenizer.ignored_index] * len(input_ids)
-
-        tgt_count = input_ids.count(self.sep_token_id)
-        assert tgt_count == len(tgts), f"len(tgts) != len(src) {len(tgts)} vs {tgt_count}"
-
-        tgt_index = 0
-        for i, token_id in enumerate(input_ids):
-            if token_id == self.sep_token_id:
-                labels_token = self.tokenizer.tokenize(tgts[tgt_index])
-                labels_token_id = self.tokenizer.convert_tokens_to_ids(labels_token)
-                labels[i - len(labels_token_id) : i] = labels_token_id
-                labels[i] = self.eos_token_id  # </s>
-                tgt_index += 1
-
-        outputs["labels"] = labels
-
-    def _load_and_process_video(self, url: str, item: Dict) -> List[Image.Image]:
-        reader, meta, path = read_video_decord(url, save_to_disk=False)
-
-        video_frame_args = dict()
-        video_frame_args["fps"] = item.get("fps", self.fps)
-        video_frame_args["min_frames"] = item.get("min_frames", self.min_frames)
-        video_frame_args["max_frames"] = item.get("max_frames", self.max_frames)
-        video_frame_args["target_frames"] = item.get("target_frames", self.target_frames)
-        video_frame_args["frames_sample"] = item.get("frames_sample", self.frames_sample)
-
-        video_frame_args = self._set_video_frame_args(video_frame_args, meta)
-
-        frames_data, _, timestamps = read_frames_decord(
-            path,
-            reader,
-            meta,
-            target_frames=video_frame_args["target_frames"],
-            target_fps=video_frame_args["fps"],
-            frames_sample=video_frame_args["frames_sample"],
-            save_to_disk=False,
-        )
-
-        frames: List[Image.Image] = []
-        for img_array, ts in zip(frames_data, timestamps):
-            frames.append(render_frame_timestamp(img_array, ts))
-        # Ensure even number of frames for temporal conv
-        if len(frames) % 2 != 0:
-            frames.append(copy.deepcopy(frames[-1]))
-        return frames
-
-    def _set_video_frame_args(self, video_frame_args, video_meta):
-        """
-        根据已知参数和优先级，设定最终的抽帧参数
-        """
-        # 优先级：video_target_frames > (video_min_frames, video_max_frames) > video_fps
-        if video_frame_args["target_frames"] > 0:
-            if video_frame_args["fps"] >= 0:
-                raise ValueError("fps must be negative if target_frames is given")
-            if (
-                video_frame_args["min_frames"] > 0
-                and video_frame_args["target_frames"] < video_frame_args["min_frames"]
-            ):
-                raise ValueError("target_frames must be larger than min_frames")
-            if (
-                video_frame_args["max_frames"] > 0
-                and video_frame_args["target_frames"] > video_frame_args["max_frames"]
-            ):
-                raise ValueError("target_frames must be smaller than max_frames")
-        else:
-            if video_frame_args["fps"] < 0:
-                raise ValueError("Must provide either positive target_fps or positive target_frames.")
-            # 先计算在video_fps下抽到的帧数
-            frames_to_extract = int(video_meta["duration"] * video_frame_args["fps"])
-            # 判断是否在目标区间内，如果不是，则取target_frames为上界或下界
-            if (
-                video_frame_args["min_frames"] > 0
-                and video_frame_args["max_frames"] > 0
-                and video_frame_args["min_frames"] > video_frame_args["max_frames"]
-            ):
-                raise ValueError("min_frames must be smaller than max_frames")
-            if video_frame_args["min_frames"] > 0 and frames_to_extract < video_frame_args["min_frames"]:
-                video_frame_args["target_frames"] = video_frame_args["min_frames"]
-                video_frame_args["fps"] = -1
-            if video_frame_args["max_frames"] > 0 and frames_to_extract > video_frame_args["max_frames"]:
-                video_frame_args["target_frames"] = video_frame_args["max_frames"]
-                video_frame_args["fps"] = -1
-
-        return video_frame_args
-
-    def _compute_3d_positions(self, t: int, h: int, w: int, start_idx: int) -> List[List[int]]:
-        # Downsample time if needed
-        t_eff = t // self.temporal_conv_size if t != 1 else 1
-        gh, gw = h // self.spatial_conv_size, w // self.spatial_conv_size
-        time_idx = np.repeat(np.arange(t_eff), gh * gw)
-        h_idx = np.tile(np.repeat(np.arange(gh), gw), t_eff)
-        w_idx = np.tile(np.arange(gw), t_eff * gh)
-
-        coords = list(zip(time_idx, h_idx, w_idx))
-        return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords]
-
-    def _load_tokenizer(self):
-        """
-        load tokenizer
-
-        Returns:
-            tokenizer (AutoTokenizer)
-        """
-        vocab_file_names = [
-            "tokenizer.model",
-            "spm.model",
-            "ernie_token_100k.model",
-        ]
-        for i in range(len(vocab_file_names)):
-            if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
-                Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
-                break
-        self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
-
-    def get_processor_cache(self, socket, mm_hashes: list[str]) -> list:
-        """
-        get cache correspond to given hash values
-        """
-        req = pickle.dumps(mm_hashes)
-        socket.send_multipart([b"", req])
-        _, resp = socket.recv_multipart()
-        mm_items = pickle.loads(resp)
-        data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
-
-        return mm_items
-
-    def update_processor_cache(self, socket, mm_hashes: list[str], mm_items):
-        """
-        update cache data
-        """
-        req = pickle.dumps((mm_hashes, mm_items))
-        socket.send_multipart([b"", req])
-        data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/process_video.py b/fastdeploy/input/v1/ernie4_5_vl_processor/process_video.py
deleted file mode 100644
index 91120096c70..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/process_video.py
+++ /dev/null
@@ -1,205 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import io
-import os
-import random
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.utils import data_processor_logger
-
-from .utils.io_utils import EXTRACTED_FRAME_DIR, get_filename
-from .utils.video_utils import VideoReaderWrapper
-
-
-def read_video_decord(video_path, save_to_disk):
-    """get reader and meta by decord"""
-    # video_path = get_downloadable(video_path, save_to_disk=save_to_disk)
-    if isinstance(video_path, VideoReaderWrapper):
-        video_reader = video_path
-    else:
-        if isinstance(video_path, bytes):
-            video_path = io.BytesIO(video_path)
-        video_reader = VideoReaderWrapper(video_path, num_threads=1)
-    vlen = len(video_reader)
-    fps = video_reader.get_avg_fps()
-    duration = vlen / float(fps)
-
-    video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen}
-
-    return video_reader, video_meta, video_path
-
-
-def get_frame_indices(
-    vlen,
-    target_frames=-1,
-    target_fps=-1,
-    frames_sample="middle",
-    fix_start=None,
-    input_fps=-1,
-):
-    """
-    取出对应的frame index
-    """
-    assert frames_sample in ["rand", "middle", "leading"]
-    if target_frames > 0:
-        assert target_fps <= 0, "target_fps must be negative if target_frames is given."
-        if target_frames > vlen:
-            acc_samples = vlen
-            data_processor_logger.info(
-                f"target_frames={target_frames} is larger than video length {vlen}, "
-                f"will sample {acc_samples} frames."
-            )
-        else:
-            acc_samples = target_frames
-            data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}")
-
-        # split the video into `acc_samples` intervals, and sample from each interval.
-        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
-        ranges = []
-        for idx, interv in enumerate(intervals[:-1]):
-            ranges.append((interv, intervals[idx + 1] - 1))
-        if frames_sample == "rand":
-            try:
-                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
-            except Exception:
-                frame_indices = np.random.permutation(vlen)[:acc_samples]
-                frame_indices.sort()
-                frame_indices = list(frame_indices)
-        elif fix_start is not None:
-            frame_indices = [x[0] + fix_start for x in ranges]
-        elif frames_sample == "leading":
-            frame_indices = [x[0] for x in ranges]
-        elif frames_sample == "middle":
-            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
-        else:
-            raise NotImplementedError
-
-    elif target_fps > 0:
-        assert target_frames <= 0, "target_frames must be negative if target_fps is given."
-        assert input_fps > 0, "input_fps must be provided if target_fps is given."
-        data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}")
-        duration = float(vlen) / input_fps
-        delta = 1 / target_fps  # gap between frames, this is also the clip length each frame represents
-        if frames_sample == "middle":
-            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
-        elif frames_sample == "leading":
-            frame_seconds = np.arange(0, duration, delta)
-        if frames_sample == "rand":
-            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
-            rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5
-            frame_seconds += rand_offset * delta
-        frame_indices = np.around(frame_seconds * input_fps).astype(int)
-        frame_indices = [e for e in frame_indices if e < vlen]
-
-    else:
-        raise ValueError("Must provide either positive target_fps or positive target_frames.")
-
-    return frame_indices
-
-
-def read_frames_decord(
-    video_path,
-    video_reader,
-    video_meta,
-    target_frames=-1,
-    target_fps=-1,
-    frames_sample="middle",
-    fix_start=None,
-    save_to_disk=False,
-    cache_dir=EXTRACTED_FRAME_DIR,
-    frame_indices=None,
-    tol=10,
-):
-    """get frames by decord"""
-
-    if frame_indices is None:
-        frame_indices = get_frame_indices(
-            video_meta["num_of_frame"],
-            target_frames=target_frames,
-            target_fps=target_fps,
-            frames_sample=frames_sample,
-            fix_start=fix_start,
-            input_fps=video_meta["fps"],
-        )
-
-    frames = []
-    for frame_indice_index in range(0, len(frame_indices)):
-        frame_indice = frame_indices[frame_indice_index]
-        try:
-            frames.append(video_reader[frame_indice].asnumpy())  # (T, H, W, C)
-        except Exception as e:
-            data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}")
-            previous_counter = 1
-            later_counter = 1
-            previous_after_flag = True
-            if frame_indice == 0 or frame_indice == len(video_reader) - 1:
-                cur_tol = tol * 2
-            else:
-                cur_tol = tol
-            while previous_counter < cur_tol or later_counter < cur_tol:
-                if previous_after_flag:
-                    if frame_indice - previous_counter < 0:
-                        previous_counter += 1
-                        previous_after_flag = not previous_after_flag
-                        continue
-                    try:
-                        frames.append(video_reader[frame_indice - previous_counter].asnumpy())
-                        data_processor_logger.info(
-                            f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame"
-                        )
-                        frame_indices[frame_indice_index] = frame_indice - previous_counter
-                        break
-                    except Exception as e:
-                        previous_counter += 1
-                        data_processor_logger.info(f"error: {e}")
-                else:
-                    if frame_indice + later_counter >= len(video_reader):
-                        later_counter += 1
-                        previous_after_flag = not previous_after_flag
-                        continue
-                    try:
-                        frames.append(video_reader[frame_indice + later_counter].asnumpy())
-                        data_processor_logger.info(
-                            f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame"
-                        )
-                        frame_indices[frame_indice_index] = frame_indice + later_counter
-                        break
-                    except Exception:
-                        later_counter += 1
-                previous_after_flag = not previous_after_flag
-
-    frames = np.stack(frames, axis=0)
-    assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}"
-
-    ret = []
-
-    url_sha1 = get_filename()
-    for idx, frame in enumerate(frames):
-        tmp = Image.fromarray(frame, "RGB")
-        if save_to_disk:
-            save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png")
-            if not os.path.exists(os.path.dirname(save_path)):
-                os.makedirs(os.path.dirname(save_path))
-            tmp.save(save_path)
-            tmp = save_path
-        ret.append(tmp)
-
-    time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices]
-
-    return ret, frame_indices, time_stamps
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/Roboto-Regular.ttf b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/Roboto-Regular.ttf
deleted file mode 100644
index 7e3bb2f8ce7..00000000000
Binary files a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/Roboto-Regular.ttf and /dev/null differ
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/__init__.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/__init__.py
deleted file mode 100644
index 31be300c18e..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/io_utils.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/io_utils.py
deleted file mode 100644
index 1535b64d4f0..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/io_utils.py
+++ /dev/null
@@ -1,109 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import base64
-import datetime
-import hashlib
-import os
-import threading
-import uuid
-from pathlib import Path
-
-import requests
-from PIL import Image
-
-RAW_VIDEO_DIR = "./download_tmp/raw_video/"
-RAW_IMAGE_DIR = "./download_tmp/raw_images/"
-EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
-TMP_DIR = "./download_tmp/upload_tmp/"
-
-
-def file_download(url, download_dir, save_to_disk=False, retry=0, retry_interval=3):
-    """
-    Description: 下载url，如果url是PIL直接返回
-    Args:
-        url(str, PIL): http/本地路径/io.Bytes，注意io.Bytes是图片字节流
-        download_path: 在save_to_disk=True的情况下生效，返回保存地址
-        save_to_disk: 是否保存在本地路径
-
-    """
-    from .video_utils import VideoReaderWrapper
-
-    if isinstance(url, Image.Image):
-        return url
-    elif isinstance(url, VideoReaderWrapper):
-        return url
-    elif url.startswith("http"):
-        response = requests.get(url)
-        bytes_data = response.content
-    elif os.path.isfile(url):
-        if save_to_disk:
-            return url
-        bytes_data = open(url, "rb").read()
-    else:
-        bytes_data = base64.b64decode(url)
-    if not save_to_disk:
-        return bytes_data
-
-    download_path = os.path.join(download_dir, get_filename(url))
-    Path(download_path).parent.mkdir(parents=True, exist_ok=True)
-    with open(download_path, "wb") as f:
-        f.write(bytes_data)
-    return download_path
-
-
-def get_filename(url=None):
-    """
-    Get Filename
-    """
-    if url is None:
-        return str(uuid.uuid4()).replace("-", "")
-    t = datetime.datetime.now()
-    if not isinstance(url, bytes):
-        url = url.encode("utf-8")
-
-    md5_hash = hashlib.md5(url).hexdigest()
-    pid = os.getpid()
-    tid = threading.get_ident()
-
-    # 去掉后缀，防止save-jpg报错
-    image_filname = f"{t.year}-{t.month:02d}-{t.day:02d}-{pid}-{tid}-{md5_hash}"
-    return image_filname
-
-
-def get_downloadable(
-    url,
-    download_dir=RAW_VIDEO_DIR,
-    save_to_disk=False,
-    retry=0,
-    retry_interval=3,
-):
-    """download video and store it in the disk
-
-    return downloaded **path** if save_to_disk is set to true
-    return downloaded **bytes** if save_to_disk is set to false
-    """
-
-    if not os.path.exists(download_dir):
-        os.makedirs(download_dir)
-    downloaded_path = file_download(
-        url,
-        download_dir,
-        save_to_disk=save_to_disk,
-        retry=retry,
-        retry_interval=retry_interval,
-    )
-    return downloaded_path
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/render_timestamp.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/render_timestamp.py
deleted file mode 100644
index 9b24226ed88..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/render_timestamp.py
+++ /dev/null
@@ -1,103 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import os
-from pathlib import Path
-
-from PIL import Image, ImageDraw, ImageFont
-
-cur_directory = Path(__file__).parent.absolute()
-FONT_PATH = os.path.join(cur_directory, "Roboto-Regular.ttf")
-
-
-def render_single_image_with_timestamp(image: Image, number: str, rate: float, font_path: str = FONT_PATH):
-    """
-    函数功能: 给pil.image的图片渲染时间戳
-    时间戳的大小为 min(width, height)的rate
-    字体的颜色为黑色, 轮廓是白色, 轮廓的大小是字体的10%
-    返回一个 Image 对象
-    """
-    draw = ImageDraw.Draw(image)  # 创建一个可绘制对象
-    width, height = image.size  # 获取图片大小
-    font_size = int(min(width, height) * rate)  # 设置字体大小
-    outline_size = int(font_size * 0.1)  # 设置轮廓大小
-    font = ImageFont.truetype(font_path, font_size)  # 加载字体文件, 设置字体大小
-    x = 0
-    y = 0  # 文本的x坐标, y坐标
-
-    # 绘制黑色的时间戳，白色的边框
-    draw.text(
-        (x, y),
-        number,
-        font=font,
-        fill=(0, 0, 0),
-        stroke_width=outline_size,
-        stroke_fill=(255, 255, 255),
-    )
-
-    return image
-
-
-def timestamp_converting(time_stamp_in_seconds):
-    """
-    convert timestamp format from seconds to hr:min:sec
-    """
-    # get hours
-    hours = 0
-    while time_stamp_in_seconds >= 3600:
-        hours += 1
-        time_stamp_in_seconds -= 3600
-    # get minutes
-    mins = 0
-    while time_stamp_in_seconds >= 60:
-        mins += 1
-        time_stamp_in_seconds -= 60
-    time_hours = f"{int(hours):02d}"
-    time_mins = f"{int(mins):02d}"
-    time_secs = f"{time_stamp_in_seconds:05.02f}"
-    fi_time_stamp = time_hours + ":" + time_mins + ":" + time_secs
-
-    return fi_time_stamp
-
-
-def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration):
-    """
-    function: get the timestamp of a frame, 在均匀抽帧时用。
-
-    num_frames: 总帧数
-    frameid_list: 被抽帧的帧的索引
-    duration: 视频的总时长
-    return: timestamp; xx:xx:xx (str)
-    """
-    time_stamp = duration * 1.0 * frame_id / num_frames
-
-    return time_stamp
-
-
-def render_frame_timestamp(frame, timestamp, font_rate=0.1):
-    """
-    函数功能, 给frame, 按照顺序将 index 渲染上去
-    逻辑思路: 把index渲染到图片的左上方
-
-    frame: 帧，PIL.Image object
-    timestamp: 时间戳，单位是秒
-    font_rate: 字体大小占 min(wi, hei)的比率
-    """
-
-    time_stamp = "time: " + timestamp_converting(timestamp)
-    new_frame = render_single_image_with_timestamp(frame, time_stamp, font_rate)
-
-    return new_frame
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/video_utils.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/video_utils.py
deleted file mode 100644
index a4769ca8ecc..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/video_utils.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import io
-import os
-from tempfile import NamedTemporaryFile as ntf
-
-import decord
-
-try:
-    # moviepy 1.0
-    import moviepy.editor as mp
-except:
-    # moviepy 2.0
-    import moviepy as mp
-
-
-def is_gif(data: bytes) -> bool:
-    """
-    check if a bytes is a gif based on the magic head
-    """
-    return data[:6] in (b"GIF87a", b"GIF89a")
-
-
-class VideoReaderWrapper(decord.VideoReader):
-    """
-    Solving memory leak bug
-
-    https://github.com/dmlc/decord/issues/208
-    """
-
-    def __init__(self, video_path, *args, **kwargs):
-        with ntf(delete=True, suffix=".gif") as gif_file:
-            gif_input = None
-            self.original_file = None
-            if isinstance(video_path, str):
-                self.original_file = video_path
-                if video_path.lower().endswith(".gif"):
-                    gif_input = video_path
-            elif isinstance(video_path, bytes):
-                if is_gif(video_path):
-                    gif_file.write(video_path)
-                    gif_input = gif_file.name
-            elif isinstance(video_path, io.BytesIO):
-                video_path.seek(0)
-                tmp_bytes = video_path.read()
-                video_path.seek(0)
-                if is_gif(tmp_bytes):
-                    gif_file.write(tmp_bytes)
-                    gif_input = gif_file.name
-
-            if gif_input is not None:
-                clip = mp.VideoFileClip(gif_input)
-                mp4_file = ntf(delete=False, suffix=".mp4")
-                clip.write_videofile(mp4_file.name, verbose=False, logger=None)
-                clip.close()
-                video_path = mp4_file.name
-                self.original_file = video_path
-
-            super().__init__(video_path, *args, **kwargs)
-            self.seek(0)
-
-    def __getitem__(self, key):
-        frames = super().__getitem__(key)
-        self.seek(0)
-        return frames
-
-    def __del__(self):
-        if self.original_file and os.path.exists(self.original_file):
-            os.remove(self.original_file)
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/__init__.py b/fastdeploy/input/v1/paddleocr_vl_processor/__init__.py
deleted file mode 100644
index 8f79e65d634..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .paddleocr_vl_processor import PaddleOCRVLProcessor
-from .process import DataProcessor
-
-__all__ = ["DataProcessor", "PaddleOCRVLProcessor"]
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/image_processor.py b/fastdeploy/input/v1/paddleocr_vl_processor/image_processor.py
deleted file mode 100644
index 8e333d5bf96..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/image_processor.py
+++ /dev/null
@@ -1,275 +0,0 @@
-"""
-# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-"""Image processor class for Keye."""
-
-# TODO: Support videos
-
-import json
-import logging
-import math
-from pathlib import Path
-from typing import Dict, List, Optional, Union
-
-import numpy as np
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_utils import (
-    ImageInput,
-    is_valid_image,
-    make_list_of_images,
-    to_numpy_array,
-)
-
-_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
-_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
-
-
-def make_batched_images(images) -> List[List[ImageInput]]:
-    """
-    Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
-    Args:
-        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
-            The input image.
-
-    Returns:
-        list: A list of images.
-    """
-    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
-        return [img for img_list in images for img in img_list]
-
-    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
-        return images
-
-    elif is_valid_image(images):
-        return [images]
-
-    raise ValueError(f"Could not make batched images from {images}")
-
-
-def adjust_size(size, patch_size):
-    num_patches = size // patch_size
-    if num_patches % 2 != 0:
-        num_patches -= 1
-    return num_patches * patch_size
-
-
-def smart_resize(
-    height: int,
-    width: int,
-    factor: int = 28,
-    min_pixels: int = 28 * 28 * 130,
-    max_pixels: int = 28 * 28 * 1280,
-):
-    """Rescales the image so that the following conditions are met:
-
-    1. Both dimensions (height and width) are divisible by 'factor'.
-
-    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
-    3. The aspect ratio of the image is maintained as closely as possible.
-
-    """
-    # if height < factor or width < factor:
-    #    raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
-    # if int(height < factor//4) + int(width < factor//4):
-    #     raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
-
-    if height < factor:
-        logging.debug(f"smart_resize: height={height} < factor={factor}, reset height=factor")
-        width = round((width * factor) / height)
-        height = factor
-
-    if width < factor:
-        logging.debug(f"smart_resize: width={width} < factor={factor}, reset width=factor")
-        height = round((height * factor) / width)
-        width = factor
-
-    if max(height, width) / min(height, width) > 200:
-        raise ValueError(
-            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
-        )
-    h_bar = round(height / factor) * factor
-    w_bar = round(width / factor) * factor
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = math.floor(height / beta / factor) * factor
-        w_bar = math.floor(width / beta / factor) * factor
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = math.ceil(height * beta / factor) * factor
-        w_bar = math.ceil(width * beta / factor) * factor
-    return h_bar, w_bar
-
-
-class ImageProcessor(BaseImageProcessor):
-    model_input_names = [
-        "pixel_values",
-        "image_grid_thw",
-        "pixel_values_videos",
-        "video_grid_thw",
-    ]
-
-    def __init__(
-        self,
-        do_resize: bool = True,
-        resample: int = 3,
-        do_rescale: bool = True,
-        rescale_factor: Union[int, float] = 1 / 255,
-        do_normalize: bool = True,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: bool = True,
-        min_pixels: int = 28 * 28 * 130,
-        max_pixels: int = 28 * 28 * 1280,
-        patch_size: int = 14,
-        temporal_patch_size: int = 1,
-        merge_size: int = 2,
-        **kwargs,
-    ) -> None:
-        super().__init__()
-        self.do_resize = do_resize
-        self.resample = resample
-        self.do_rescale = do_rescale
-        self.rescale_factor = rescale_factor
-        self.do_normalize = do_normalize
-        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
-        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
-        self.min_pixels = min_pixels
-        self.max_pixels = max_pixels
-        self.patch_size = patch_size
-        self.temporal_patch_size = temporal_patch_size
-        self.merge_size = merge_size
-        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
-        self.do_convert_rgb = do_convert_rgb
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_dir):
-        pretrained_model_dir = Path(pretrained_model_dir)
-        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
-        with open(image_processor_config_path, "r", encoding="utf-8") as f:
-            image_processor_config = json.load(f)
-        return cls(**image_processor_config)
-
-    def _preprocess(
-        self,
-        images,
-        do_resize: Optional[bool] = None,
-        do_rescale: Optional[bool] = None,
-        rescale_factor: Optional[float] = None,
-        do_normalize: Optional[bool] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: Optional[bool] = None,
-    ):
-        images = make_list_of_images(images)
-
-        if do_convert_rgb:
-            images = [image.convert("RGB") for image in images]
-
-        width, height = images[0].size
-        resized_height, resized_width = height, width
-        processed_images = []
-
-        for image in images:
-            if do_resize:
-                resized_height, resized_width = smart_resize(
-                    height,
-                    width,
-                    factor=self.patch_size * self.merge_size,
-                    min_pixels=self.min_pixels,
-                    max_pixels=self.max_pixels,
-                )
-
-                image = image.resize((resized_width, resized_height), resample=self.resample)
-
-            image = to_numpy_array(image)
-
-            if do_rescale:
-                image = (image * rescale_factor).astype(np.float32)
-
-            if do_normalize:
-                image = image.astype(np.float32)
-                image -= np.array(image_mean, dtype=np.float32)
-                image /= np.array(image_std, dtype=np.float32)
-
-            processed_images.append(image)
-
-        patches = np.array(processed_images)
-        patches = patches.transpose(0, 3, 1, 2)
-        if patches.shape[0] == 1:
-            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
-        channel = patches.shape[1]
-        grid_t = patches.shape[0] // self.temporal_patch_size
-        grid_h, grid_w = (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-
-        patches = patches.reshape(
-            grid_t,
-            self.temporal_patch_size,
-            channel,
-            grid_h,
-            self.patch_size,
-            grid_w,
-            self.patch_size,
-        )
-        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
-        assert self.temporal_patch_size == 1
-        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
-        return flatten_patches, np.array([grid_t, grid_h, grid_w])
-
-    def preprocess(
-        self,
-        images,
-        videos=None,
-        do_resize: Optional[bool] = None,
-        size: Optional[Dict[str, int]] = None,
-        do_rescale: Optional[bool] = None,
-        rescale_factor: Optional[float] = None,
-        do_normalize: Optional[bool] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        do_convert_rgb: Optional[bool] = None,
-        return_tensors=None,
-    ):
-        do_resize = do_resize if do_resize is not None else self.do_resize
-        size = size if size is not None else self.size
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-
-        if videos is not None:
-            raise NotImplementedError("Videos are not yet supported")
-
-        patches, image_grid_thw = self._preprocess(
-            images,
-            do_resize=do_resize,
-            do_rescale=do_rescale,
-            rescale_factor=rescale_factor,
-            do_normalize=do_normalize,
-            image_mean=image_mean,
-            image_std=image_std,
-            do_convert_rgb=do_convert_rgb,
-        )
-        pixel_values = np.array(patches)
-        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
-        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/paddleocr_vl_processor.py b/fastdeploy/input/v1/paddleocr_vl_processor/paddleocr_vl_processor.py
deleted file mode 100644
index f7d7cfbd2b1..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/paddleocr_vl_processor.py
+++ /dev/null
@@ -1,322 +0,0 @@
-"""
-# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import numpy as np
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.text_processor import DataProcessor as TextProcessor
-from fastdeploy.utils import data_processor_logger
-
-from .process import DataProcessor
-
-_SAMPLING_EPS = 1e-5
-from fastdeploy.input.utils import process_stop_token_ids
-
-
-class PaddleOCRVLProcessor(TextProcessor):
-    """
-    PaddleOCR Vision-Language processor for handling multimodal inputs.
-
-    This processor extends TextProcessor to support:
-    - Image processing
-    - Multimodal feature extraction
-    - Tokenization and position encoding
-    - Request processing and model input generation
-
-    Attributes:
-        processor (DataProcessor): Underlying data processor instance
-        tokenizer: Text tokenizer instance
-        limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
-    """
-
-    def __init__(
-        self,
-        config,
-        model_name_or_path,
-        limit_mm_per_prompt=None,
-        mm_processor_kwargs=None,
-        reasoning_parser_obj=None,
-        tool_parser_obj=None,
-        enable_processor_cache=False,
-    ):
-        """
-        Initialize PaddleOCRVLProcessor instance.
-
-        Args:
-            config: Model configuration object
-            model_name_or_path (str): Pretrained model name or path
-            limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
-            mm_processor_kwargs (dict, optional): Multimodal processor arguments
-            reasoning_parser_obj: Reasoning parser instance
-            tool_parser_obj: Tool parser instance
-        """
-        super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)
-        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
-        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
-        self.processor = DataProcessor(
-            model_path=model_name_or_path,
-            enable_processor_cache=enable_processor_cache,
-            tokens_per_second=config.vision_config.tokens_per_second,
-            tokenizer=self.tokenizer,
-            **processor_kwargs,
-        )
-        self.image_patch_id = self.processor.image_patch_id
-        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
-
-    def process_request(self, request, max_model_len=None, **kwargs):
-        """
-        Process incoming request and generate model inputs.
-
-        Args:
-            request: Input request object
-            max_model_len (int, optional): Maximum context length
-            **kwargs: Additional processing parameters
-
-        Returns:
-            Request: Processed request with model inputs
-        """
-        task = request.to_dict()
-        task["enable_thinking"] = kwargs.get("enable_thinking", False)
-        self.process_request_dict(task, max_model_len)
-        request = Request.from_dict(task)
-        request = self._apply_default_parameters(request)
-        return request
-
-    def _parse_processor_kwargs(self, kwargs):
-        """
-        Parse and validate multimodal processor arguments.
-
-        Args:
-            kwargs (dict): Processor configuration arguments
-
-        Returns:
-            dict: Validated processor arguments
-
-        Raises:
-            ValueError: If arguments format is invalid
-        """
-        if not kwargs:
-            return {}
-
-        try:
-            if not isinstance(kwargs, dict):
-                raise ValueError("mm-processor-kwargs must be a dictionary")
-
-            # Validate kwargs types against expected schema
-            data_processor_logger.info(f"Processing kwargs: {kwargs}")
-            expected_types = {
-                "video_max_frames": int,  # Maximum video frames parameter
-                "video_min_frames": int,  # Minimum video frames parameter
-            }
-
-            for key, value in kwargs.items():
-                if key in expected_types and not isinstance(value, expected_types[key]):
-                    raise ValueError(
-                        f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
-                    )
-
-            return kwargs
-
-        except Exception as e:
-            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
-            return {}
-
-    def _parse_limits(self, limits):
-        """
-        Parse and validate multimodal input limits.
-
-        Args:
-            limits (dict): Input limits configuration
-
-        Returns:
-            dict: Validated limits with defaults
-
-        Raises:
-            ValueError: If limits format is invalid
-        """
-        DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}
-
-        if not limits:
-            return DEFAULT_LIMITS
-
-        try:
-            if not isinstance(limits, dict):
-                raise ValueError("limit-mm-per-prompt must be a dictionary")
-            data_processor_logger.info(f"_parse_limits:{limits}")
-            return {**DEFAULT_LIMITS, **limits}
-        except Exception as e:
-            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
-            return DEFAULT_LIMITS
-
-    def _check_mm_limits(self, item):
-        """
-        Validate multimodal inputs against configured limits.
-
-        Args:
-            item: Input request item to validate
-
-        Raises:
-            ValueError: If input exceeds configured limits
-        """
-        if isinstance(item, dict):
-            # 请求包含prompt和multi_modal_data
-            mm_data = item
-        else:
-            # 请求包含messages
-            mm_data = {"image": [], "video": []}
-
-            for message in item:
-                if isinstance(message.get("content"), list):
-                    for part in message["content"]:
-                        if part.get("type") in ["image_url", "image"]:
-                            mm_data["image"].append(part)
-                        elif part.get("type") in ["video_url", "video"]:
-                            mm_data["video"].append(part)
-
-        for modality, data in mm_data.items():
-            if modality in self.limit_mm_per_prompt:
-                limit = self.limit_mm_per_prompt[modality]
-                if len(data) > limit:
-                    raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
-
-    def process_request_dict(self, request, max_model_len=None, **kwargs):
-        """
-        Process request dictionary into model inputs.
-
-        Args:
-            request (dict): Input request dictionary
-            max_model_len (int, optional): Maximum context length
-
-        Returns:
-            dict: Processed request with model inputs
-
-        Raises:
-            ValueError: If request format is invalid
-        """
-
-        request = self._apply_default_parameters(request)
-        if not request.eos_token_ids:
-            request.eos_token_ids = self.eos_token_ids
-
-        # processing stop_sequences and stop_token_ids
-        process_stop_token_ids(request, self.update_stop_seq)
-
-        if request.prompt:
-            multimodal_data = request.multimodal_data
-            if multimodal_data is None:
-                multimodal_data = {}
-            self._check_mm_limits(multimodal_data)
-            images = multimodal_data.get("image", None)
-            videos = multimodal_data.get("video", None)
-            outputs = self.processor.text2ids(request.prompt, images, videos)
-
-        elif request.messages:
-            messages = request.messages
-            self._check_mm_limits(messages)
-            outputs = self.processor.request2ids(request)
-
-        else:
-            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
-        metadata = request.metadata
-        # Handle continuation of previous generation by appending existing tokens
-        if metadata and metadata.get("generated_token_ids"):
-            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
-        outputs = self.pack_outputs(outputs)
-
-        request.prompt_token_ids = outputs["input_ids"].tolist()
-        request.prompt_token_ids_len = len(request.prompt_token_ids)
-        request.multimodal_inputs = outputs
-
-        # Handle prompt truncation if exceeds model context length
-        if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
-            request.prompt_token_ids = request.prompt_token_ids[
-                : max_model_len - 1
-            ]  # Leave space for at least 1 new token
-
-        # Set default max_tokens if not specified
-        max_tokens = max_model_len - len(request.prompt_token_ids)
-        if getattr(request.sampling_params, "max_tokens", None) is None:
-            request.sampling_params.max_tokens = max(1, max_tokens)
-        else:
-            request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
-
-        if request.sampling_params.top_p is not None and request.sampling_params.top_p < _SAMPLING_EPS:
-            request.sampling_params.top_p = _SAMPLING_EPS
-            request.sampling_params.top_k = 1
-
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
-            parts = request.request_id.split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.get("n", 1)
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request.request_id] = model_status
-            request.enable_thinking = model_status == "think_start"
-
-        return request
-
-    def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
-        """
-        Append generated tokens to existing outputs.
-
-        Args:
-            outputs: Current model outputs
-            generated_token_ids: Generated tokens to append
-        """
-        num_tokens = len(generated_token_ids)
-        multimodal_inputs["input_ids"].extend(generated_token_ids)
-        multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
-
-        pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
-        multimodal_inputs["position_ids"].append(pos_ids)
-        multimodal_inputs["cur_position"] += num_tokens
-
-    def pack_outputs(self, outputs):
-        """
-        Prepare final output dictionary for model.
-
-        Args:
-            outputs: Intermediate processing outputs
-
-        Returns:
-            dict: Packed output dictionary with all required fields
-        """
-        if not outputs["images"]:
-            outputs["images"] = None  # No images case
-            outputs["grid_thw"] = None  # No spatial dimensions
-            outputs["image_type_ids"] = None  # No type IDs
-        else:
-            outputs["images"] = np.vstack(outputs["images"])  # Stack image features vertically
-            outputs["grid_thw"] = np.vstack(outputs["grid_thw"])  # Stack spatial dimensions
-            outputs["image_type_ids"] = np.array(outputs["image_type_ids"])  # Convert to numpy array
-
-        # Convert all outputs to numpy arrays with appropriate types
-        outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)  # Token IDs as int64
-        outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)  # Type IDs as int64
-        outputs["position_ids"] = np.concatenate(
-            outputs["position_ids"], axis=1, dtype=np.int64
-        )  # Concatenate position ID
-
-        outputs["image_patch_id"] = self.processor.image_token_id
-        outputs["video_patch_id"] = self.processor.video_token_id
-        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
-        outputs["mm_num_token_func"] = self.processor.mm_num_tokens
-        return outputs
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/process.py b/fastdeploy/input/v1/paddleocr_vl_processor/process.py
deleted file mode 100644
index ea6b63ee9d8..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/process.py
+++ /dev/null
@@ -1,622 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import pickle
-from typing import Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import paddle
-import zmq
-from paddleformers.transformers import AutoTokenizer
-from PIL import Image
-
-from fastdeploy.engine.request import ImagePosition, Request
-from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
-from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.multimodal.hasher import MultimodalHasher
-from fastdeploy.utils import data_processor_logger
-
-from .image_processor import ImageProcessor
-from .process_video import sample_frames
-
-
-class DataProcessor(MMBaseDataProcessor):
-    """
-    Processes multimodal inputs (text, images, videos) into model-ready formats.
-
-    Handles:
-    - Tokenization of text with special tokens for visual content
-    - Image and video preprocessing
-    - Generation of 3D positional embeddings
-    - Conversion of chat messages to model inputs
-
-    Attributes:
-        tokenizer: Text tokenizer instance
-        image_processor: Image/video preprocessor
-        image_token: Special token for image placeholders
-        video_token: Special token for video placeholders
-        vision_start: Token marking start of visual content
-    """
-
-    def __init__(
-        self,
-        model_path: str,
-        enable_processor_cache: bool = False,
-        video_min_frames: int = 4,
-        video_max_frames: int = 768,
-        video_target_frames: int = -1,
-        video_fps: int = -1,
-        tokens_per_second: int = 2,
-        tokenizer=None,
-        **kwargs,
-    ) -> None:
-        """
-        Initialize the data processor.
-
-        Args:
-            model_path: Path to pretrained model
-            video_min_frames: Minimum frames to sample from videos
-            video_max_frames: Maximum frames to sample from videos
-            tokens_per_second: Temporal resolution for positional embeddings
-            **kwargs: Additional configuration
-        """
-        super().__init__()
-        self.min_frames = video_min_frames
-        self.max_frames = video_max_frames
-        self.target_frames = video_target_frames
-        self.fps = video_fps
-
-        # Initialize tokenizer with left padding and fast tokenizer
-        if tokenizer is None:
-            self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True)
-            self.tokenizer.ignored_index = -100  # Set ignored index for loss calculation
-        else:
-            self.tokenizer = tokenizer
-        self.image_processor = ImageProcessor.from_pretrained(model_path)  # Initialize image processor
-        self.enable_processor_cache = enable_processor_cache
-
-        # Convolution sizes for patch aggregation
-        self.spatial_conv_size = self.image_processor.merge_size
-        self.temporal_conv_size = self.image_processor.temporal_patch_size
-
-        # Special tokens and IDs
-        self.image_token = "<|IMAGE_PLACEHOLDER|>"
-        self.video_token = "<|video_pad|>"
-
-        self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
-        self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
-        self.image_patch_id = self.image_token_id
-
-        self.vision_start = "<|IMAGE_START|>"
-        self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start)
-
-        self.tokens_per_second = tokens_per_second
-
-        self.role_prefixes = {
-            "system": "",
-            "user": "User: ",
-            "bot": "Assistant: ",
-            "assistant": "Assistant: ",
-        }
-
-    @staticmethod
-    def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
-        """
-        Calculate the number of tokens in the multimodal input.
-        """
-        if isinstance(grid_thw, paddle.Tensor):
-            grid_thw = grid_thw.numpy()
-
-        if len(grid_thw) == 0:
-            return 0
-
-        def calc_one(thw):
-            t, h, w = map(int, thw)
-            return t * h * w // 4
-
-        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
-            return [calc_one(x) for x in grid_thw]
-
-        return calc_one(grid_thw)
-
-    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
-        """
-        Convert text with image/video placeholders into model inputs.
-
-        Args:
-            text: Input text with <|image@placeholder|> and <|video@placeholder|> markers
-            images: List of PIL Images corresponding to image placeholders
-            videos: List of video data corresponding to video placeholders
-            image_uuid: List of unique identifiers for each image, used for caching or hashing.
-            video_uuid: List of unique identifiers for each video, used for caching or hashing.
-
-        Returns:
-            Dict containing:
-                - input_ids: Token IDs
-                - token_type_ids: Type identifiers (text/image/video)
-                - position_ids: 3D positional embeddings
-                - images: Preprocessed visual features
-                - grid_thw: Spatial/temporal dimensions
-                - image_type_ids: Visual content type (0=image, 1=video)
-        """
-
-        outputs = {
-            "input_ids": [],
-            "token_type_ids": [],
-            "position_ids": [],
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "labels": [],
-            "cur_position": 0,
-            "video_cnt": 0,
-            "num_input_image_tokens": 0,
-            "num_input_video_tokens": 0,
-            "fps": [],
-            "mm_positions": [],
-            "mm_hashes": [],
-            "vit_seqlen": [],
-            "vit_position_ids": [],
-        }
-
-        # Define placeholders and their lengths
-        IMAGE_PLACEHOLDER = self.image_token
-        VIDEO_PLACEHOLDER = self.video_token
-        IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
-        VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
-
-        # Initialize tracking variables for text parsing
-        st, image_idx, video_idx = 0, 0, 0  # Start position, image counter, video counter
-        while st < len(text):
-            # Find next image or video placeholder in text
-            image_pos = text.find(IMAGE_PLACEHOLDER, st)
-            image_pos = len(text) if image_pos == -1 else image_pos  # Set to end if not found
-            video_pos = text.find(VIDEO_PLACEHOLDER, st)
-            video_pos = len(text) if video_pos == -1 else video_pos  # Set to end if not found
-            ed = min(image_pos, video_pos)  # End position is first placeholder found
-
-            self._add_text(text[st:ed], outputs)
-            if ed == len(text):
-                break
-
-            if ed == image_pos:
-                image = images[image_idx]
-                uuid = image_uuid[image_idx] if image_uuid else None
-                if not isinstance(image, tuple):
-                    self._add_image(image, outputs, uuid)
-                else:
-                    self._add_processed_image(image, outputs, uuid)
-                image_idx += 1
-                st = ed + IMAGE_PLACEHOLDER_LEN
-            else:
-                item = videos[video_idx]
-                uuid = video_uuid[video_idx] if video_uuid else None
-                if not isinstance(item, tuple):
-                    if isinstance(item, dict):
-                        frames, meta = self._load_and_process_video(item["video"], item)
-                    else:
-                        frames, meta = self._load_and_process_video(item, {})
-                    self._add_video(frames, meta, outputs, uuid)
-                else:
-                    # cached frames are already processed
-                    self._add_processed_video(item, outputs, uuid)
-                video_idx += 1
-                st = ed + VIDEO_PLACEHOLDER_LEN
-
-        return outputs
-
-    def request2ids(
-        self, request: Request, tgts: List[str] = None
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
-        """
-        Convert chat request with multimodal messages into model inputs.
-
-        Args:
-            request: Dictionary containing:
-                - messages: List of chat messages with text/image/video content
-                - request_id: Unique identifier for logging
-            tgts: Optional target sequences
-
-        Returns:
-            Dict with same structure as text2ids() output
-        """
-
-        # Parse and validate chat messages
-        messages = parse_chat_messages(request.messages)
-        mm_items = []
-        for msg in messages:
-            role = msg.get("role")
-            assert role in self.role_prefixes, f"Unsupported role: {role}"
-
-            # Normalize content to list format
-            content = msg.get("content")
-            if not isinstance(content, list):
-                content = [content]
-            # Collect all visual content items
-            for item in content:
-                if item.get("type") in ["image", "video"]:
-                    mm_items.append(item)
-
-        missing_hashes, missing_idx = [], []
-        for idx, item in enumerate(mm_items):
-            if not item.get("data"):
-                # raw data not provided, should be retrieved from processor cache
-                missing_hashes.append(item.get("uuid"))
-                missing_idx.append(idx)
-
-        if len(missing_hashes) > 0 and not self.enable_processor_cache:
-            raise ValueError("Missing items cannot be retrieved without processor cache.")
-
-        if self.enable_processor_cache:
-            context = zmq.Context()
-            dealer = context.socket(zmq.DEALER)
-            dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
-            missing_items = self.get_processor_cache(dealer, missing_hashes)
-            for idx in range(len(missing_items)):
-                if not missing_items[idx]:
-                    raise ValueError(f"Missing item {idx} not found in processor cache")
-                mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
-        images, videos = [], []
-        image_uuid, video_uuid = [], []
-        for item in mm_items:
-            if item.get("type") == "image":
-                images.append(item["data"])
-                image_uuid.append(item["uuid"])
-            elif item.get("type") == "video":
-                videos.append(item["data"])
-                video_uuid.append(item["uuid"])
-            else:
-                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
-
-        if self.tokenizer.chat_template is None:
-            raise ValueError("This model does not support chat template.")
-
-        chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
-        prompt = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True,
-            **chat_template_kwargs,
-        )
-        request.prompt_tokens = prompt
-
-        outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
-
-        if self.enable_processor_cache:
-            missing_idx = set(missing_idx)
-            hashes_to_cache, items_to_cache = [], []
-            for idx in range(len(mm_items)):
-                if idx in missing_idx:
-                    continue
-                meta = {}
-                t, h, w = outputs["grid_thw"][idx]
-                meta["thw"] = (t, h, w)
-                meta["fps"] = outputs["fps"][idx]
-                hashes_to_cache.append(outputs["mm_hashes"][idx])
-                items_to_cache.append((outputs["images"][idx], meta))
-            self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
-        return outputs
-
-    def _add_text(self, tokens, outputs: Dict) -> None:
-        """
-        Add text tokens to model inputs dictionary.
-
-        Args:
-            tokens: Text string or already tokenized IDs
-            outputs: Dictionary accumulating model inputs
-
-        Note:
-            - Handles both raw text and pre-tokenized inputs
-            - Updates position IDs for 3D embeddings
-        """
-        if not tokens:
-            return None
-
-        if isinstance(tokens, str):
-            tokens_str = self.tokenizer.tokenize(tokens)
-            tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
-
-        num_tokens = len(tokens)
-        outputs["input_ids"].extend(tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
-
-        pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-    def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray:
-        """
-        Generate 3D positional embeddings for text tokens.
-
-        Args:
-            start_pos: Starting position index
-            num_tokens: Number of tokens to generate positions for
-
-        Returns:
-            numpy.ndarray: 3D position IDs shaped (3, num_tokens)
-        """
-        text_array = np.arange(num_tokens).reshape(1, -1)
-        text_index = np.broadcast_to(text_array, (3, num_tokens))
-        position = text_index + start_pos
-        return position
-
-    def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
-        """
-        Add image data to model inputs dictionary.
-
-        Args:
-            img: PIL Image to process
-            outputs: Dictionary accumulating model inputs
-
-        Note:
-            - Preprocesses image and calculates spatial dimensions
-            - Adds image token IDs and type markers
-            - Generates appropriate position embeddings
-        """
-        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
-        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
-        grid_thw = ret["grid_thw"].tolist()
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-        outputs["num_input_image_tokens"] += int(num_tokens)
-
-        outputs["images"].append(ret["pixel_values"])
-        if not uuid:
-            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-        else:
-            outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(grid_thw)
-        outputs["image_type_ids"].append(0)
-
-        # position_ids
-        t, h, w = grid_thw
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-        outputs["fps"].append(0)
-        numel = h * w
-        outputs["vit_seqlen"].append(numel)
-        outputs["vit_position_ids"].append(np.arange(numel) % numel)
-
-    def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
-        img, meta = img_cache
-        num_tokens = img.shape[0] // self.image_processor.merge_size**2
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
-        _, h, w = meta["thw"]
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["images"].append(img)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[1, h, w]]))
-        outputs["image_type_ids"].append(0)
-
-        outputs["fps"].append(0)
-
-    def _add_video(self, frames, meta: Dict, outputs: Dict, uuid: Optional[str]) -> None:
-        """
-        Add video data to model inputs dictionary.
-
-        Args:
-            frames: Video frames as numpy array
-            meta: Video metadata containing fps/duration
-            outputs: Dictionary accumulating model inputs
-
-        Note:
-            - Handles temporal dimension in position embeddings
-            - Uses video-specific token IDs and type markers
-        """
-        ret = self.image_processor.preprocess(images=frames)
-
-        num_tokens = ret["image_grid_thw"].prod() // self.image_processor.merge_size**2
-        grid_thw = ret["image_grid_thw"].tolist()
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.video_token_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["num_input_video_tokens"] += int(num_tokens)
-
-        outputs["images"].append(ret["pixel_values"])
-        if not uuid:
-            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-        else:
-            outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(grid_thw)
-        outputs["image_type_ids"].extend([1] * grid_thw[0])
-
-        fps = meta["fps"]
-        second_per_grid_t = self.temporal_conv_size / fps
-        t, h, w = grid_thw
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
-
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-        outputs["fps"].append(fps)
-        numel = h * w
-        outputs["vit_seqlen"].append(numel)
-        outputs["vit_position_ids"].append(np.arange(numel) % numel)
-
-    def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
-        frames, meta = frames_cache
-        num_tokens = frames.shape[0] // self.image_processor.merge_size**2
-
-        t, h, w = meta["thw"]
-        outputs["images"].append(frames)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[t, h, w]]))
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["image_type_ids"].extend([1] * t)
-
-        fps = meta["fps"]
-        second_per_grid_t = self.temporal_conv_size / fps
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["fps"].append(fps)
-
-    def _compute_vision_positions(
-        self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float
-    ) -> np.ndarray:
-        """
-        Generate 3D position IDs for visual inputs.
-
-        Args:
-            start_pos: Base position in sequence
-            t: Temporal patches (1 for images)
-            h: Height in patches
-            w: Width in patches
-            second_per_grid_t: Time per temporal patch
-
-        Returns:
-            np.ndarray: Position IDs for [t,h,w] dimensions
-        """
-        h //= self.spatial_conv_size
-        w //= self.spatial_conv_size
-
-        tn = np.arange(t).reshape(-1, 1)
-        tn = np.broadcast_to(tn, (t, h * w))
-        tn = tn * int(second_per_grid_t) * self.tokens_per_second
-        t_index = tn.flatten()
-
-        hn = np.arange(h).reshape(1, -1, 1)
-        h_index = np.broadcast_to(hn, (t, h, w)).flatten()
-
-        wn = np.arange(w).reshape(1, 1, -1)
-        w_index = np.broadcast_to(wn, (t, h, w)).flatten()
-
-        position = np.stack([t_index, h_index, w_index]) + start_pos
-        return position
-
-    def _load_and_process_video(self, url: str, item: Dict) -> Tuple[np.ndarray, Dict]:
-        """
-        Load and preprocess video into frames.
-
-        Args:
-            url: Video file path or bytes
-            item: Dictionary containing processing parameters
-
-        Returns:
-            tuple: (frames, metadata) where:
-                - frames: Processed video frames as numpy array
-                - metadata: Updated video metadata dictionary
-        """
-        reader, meta, _ = read_video_decord(url, save_to_disk=False)
-
-        # Apply frame sampling if fps or target_frames specified
-        fps = item.get("fps", self.fps)
-        num_frames = item.get("target_frames", self.target_frames)
-
-        frame_indices = list(range(meta["num_of_frame"]))
-        if fps > 0 or num_frames > 0:
-            # Get frame sampling constraints
-            min_frames = item.get("min_frames", self.min_frames)
-            max_frames = item.get("max_frames", self.max_frames)
-
-            # Sample frames according to specifications
-            frame_indices = sample_frames(
-                frame_factor=self.temporal_conv_size,  # Ensure divisible by temporal patch size
-                min_frames=min_frames,
-                max_frames=max_frames,
-                metadata=meta,
-                fps=fps,
-                num_frames=num_frames,
-            )
-
-            # Update metadata with new frame count and fps
-            meta["num_of_frame"] = len(frame_indices)
-            if fps is not None:
-                meta["fps"] = fps  # Use specified fps
-                meta["duration"] = len(frame_indices) / fps
-            else:
-                meta["fps"] = len(frame_indices) / meta["duration"]  # Calculate fps from sampled frames
-
-        frames = []
-        for idx in frame_indices:
-            frame = reader[idx].asnumpy()
-            image = Image.fromarray(frame, "RGB")
-            frames.append(image)
-        frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-
-        return frames, meta
-
-    def get_processor_cache(self, socket, mm_hashes: list[str]) -> list:
-        """
-        get cache correspond to given hash values
-        """
-        req = pickle.dumps(mm_hashes)
-        socket.send_multipart([b"", req])
-        _, resp = socket.recv_multipart()
-        mm_items = pickle.loads(resp)
-        data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
-
-        return mm_items
-
-    def update_processor_cache(self, socket, mm_hashes: list[str], mm_items):
-        """
-        update cache data
-        """
-        req = pickle.dumps((mm_hashes, mm_items))
-        socket.send_multipart([b"", req])
-        data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
-
-    def apply_chat_template(self, request):
-        """
-        Apply chat template to convert messages into token sequence.
-
-        Args:
-            request: Dictionary containing chat messages
-
-        Returns:
-            List of token IDs
-
-        Raises:
-            ValueError: If model doesn't support chat templates
-        """
-        if self.tokenizer.chat_template is None:
-            raise ValueError("This model does not support chat_template.")
-
-        raw_prompt = self.tokenizer.apply_chat_template(
-            request["messages"],
-            tokenize=False,
-            add_generation_prompt=request.get("add_generation_prompt", True),
-            chat_template=request.get("chat_template", None),
-        )
-        prompt_token_str = raw_prompt.replace(self.image_token, "").replace(self.video_token, "")
-        request["text_after_process"] = raw_prompt
-
-        tokens = self.tokenizer.tokenize(prompt_token_str)
-        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-        data_processor_logger.info(
-            f"req_id:{request.get('request_id', ''), } prompt: {raw_prompt} tokens: {tokens}, token_ids: {token_ids}"
-        )
-        return token_ids
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/process_video.py b/fastdeploy/input/v1/paddleocr_vl_processor/process_video.py
deleted file mode 100644
index c7089d26dc2..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/process_video.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import math
-from typing import Optional, Union
-
-import numpy as np
-
-
-def sample_frames(
-    frame_factor: int,
-    min_frames: int,
-    max_frames: int,
-    metadata: Optional[dict] = None,
-    fps: Optional[Union[int, float]] = None,
-    num_frames: Optional[int] = None,
-):
-    """
-    Sample frames from video according to specified criteria.
-
-    Args:
-        frame_factor: Ensure sampled frames are multiples of this factor
-        min_frames: Minimum number of frames to sample
-        max_frames: Maximum number of frames to sample
-        metadata: Video metadata containing fps information
-        fps: Target frames per second for sampling
-        num_frames: Exact number of frames to sample
-
-    Returns:
-        np.ndarray: Sampled video frames
-
-    Raises:
-        ValueError: If both fps and num_frames are specified,
-                   or if required metadata is missing,
-                   or if requested frames exceed available frames
-    """
-    if fps > 0 and num_frames > 0:
-        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
-
-    total_num_frames = metadata["num_of_frame"]
-
-    # If num_frames is not given but fps is, calculate num_frames from fps
-    if num_frames > 0:
-        num_frames = round(num_frames / frame_factor) * frame_factor
-    elif fps > 0:
-        if metadata is None:
-            raise ValueError(
-                "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
-                "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
-            )
-        max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
-        num_frames = total_num_frames / metadata["fps"] * fps
-        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
-        num_frames = math.floor(num_frames / frame_factor) * frame_factor
-    if num_frames > total_num_frames:
-        raise ValueError(
-            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
-            "Decrease `num_frames` or `fps` for sampling."
-        )
-
-    # Calculate frame indices based on sampling strategy
-    if num_frames > 0:
-        # Evenly spaced sampling for target frame count
-        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
-    else:
-        # Keep all frames if no sampling requested
-        indices = np.arange(0, total_num_frames).astype(np.int32)
-
-    return indices
diff --git a/fastdeploy/input/v1/qwen3_vl_processor/__init__.py b/fastdeploy/input/v1/qwen3_vl_processor/__init__.py
deleted file mode 100644
index 9f959610c8d..00000000000
--- a/fastdeploy/input/v1/qwen3_vl_processor/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .process import DataProcessor
-from .qwen3_vl_processor import Qwen3VLProcessor
-
-__all__ = [
-    "DataProcessor",
-    "Qwen3VLProcessor",
-]
diff --git a/fastdeploy/input/v1/qwen3_vl_processor/image_processor.py b/fastdeploy/input/v1/qwen3_vl_processor/image_processor.py
deleted file mode 100644
index 167f3e340db..00000000000
--- a/fastdeploy/input/v1/qwen3_vl_processor/image_processor.py
+++ /dev/null
@@ -1,413 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import math
-from typing import List, Optional, Union
-
-import numpy as np
-import paddle
-import PIL
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_transforms import (
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
-)
-from paddleformers.transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    get_image_size,
-    infer_channel_dimension_format,
-    make_list_of_images,
-    to_numpy_array,
-    valid_images,
-)
-from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
-from PIL import Image
-
-from fastdeploy.utils import data_processor_logger
-
-IMAGE_MEAN = [0.5, 0.5, 0.5]
-IMAGE_STD = [0.5, 0.5, 0.5]
-
-MIN_PIXELS = 65536
-MAX_PIXELS = 16777216
-
-
-VideoInput = Union[
-    List["PIL.Image.Image"],
-    "np.ndarray",
-    "paddle.Tensor",
-    List["np.ndarray"],
-    List["paddle.Tensor"],
-    List[List["PIL.Image.Image"]],
-    List[List["np.ndarray"]],
-    List[List["paddle.Tensor"]],
-]
-
-
-def round_by_factor(number: int, factor: int) -> int:
-    return round(number / factor) * factor
-
-
-def ceil_by_factor(number: int, factor: int) -> int:
-    return math.ceil(number / factor) * factor
-
-
-def floor_by_factor(number: int, factor: int) -> int:
-    return math.floor(number / factor) * factor
-
-
-def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
-    """
-    Smart image resizing that maintains aspect ratio and respects constraints.
-
-    Args:
-        height: Original image height
-        width: Original image width
-        factor: Patch size factor
-        min_pixels: Minimum allowed pixels
-        max_pixels: Maximum allowed pixels
-        max_ratio: Maximum allowed aspect ratio
-
-    Returns:
-        tuple: (new_height, new_width)
-
-    Raises:
-        ValueError: If calculated dimensions are invalid
-    """
-    if max(height, width) / min(height, width) > max_ratio:
-        if height > width:
-            new_width = max(factor, round_by_factor(width, factor))
-            new_height = floor_by_factor(new_width * max_ratio, factor)
-        else:
-            new_height = max(factor, round_by_factor(height, factor))
-            new_width = floor_by_factor(new_height * max_ratio, factor)
-
-        data_processor_logger.info(
-            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
-              resize to {max(new_height, new_width) / min(new_height, new_width)}"
-        )
-
-        height = new_height
-        width = new_width
-
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-
-    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
-        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
-
-    return h_bar, w_bar
-
-
-def is_scaled_image(image: np.ndarray) -> bool:
-    """
-    Check if image pixel values are already normalized to [0, 1] range.
-
-    Args:
-        image: Input image array
-
-    Returns:
-        bool: True if image is already scaled
-    """
-    if image.dtype == np.uint8:
-        return False
-
-    # It's possible the image has pixel values in [0, 255] but is of floating type
-    return np.min(image) >= 0 and np.max(image) <= 1
-
-
-class ImageProcessor(BaseImageProcessor):
-    """
-    Adaptive image processor for dynamic image resizing and preprocessing.
-
-    This processor handles image resizing, rescaling, normalization and format conversion.
-    It dynamically adjusts image dimensions based on original size and specified constraints.
-    """
-
-    def __init__(
-        self,
-        patch_size: int = 16,
-        merge_size: int = 2,
-        temporal_patch_size: int = 2,
-        min_pixels: int = MIN_PIXELS,
-        max_pixels: int = MAX_PIXELS,
-        image_mean: Union[float, List[float]] = IMAGE_MEAN,
-        image_std: Union[float, List[float]] = IMAGE_STD,
-        rescale_factor: float = 1 / 255,
-        do_rescale: bool = True,
-        do_normalize: bool = True,
-        resample: PILImageResampling = PILImageResampling.BICUBIC,
-        **kwargs,
-    ) -> None:
-        """
-        Initialize image processor with configuration parameters.
-
-        Args:
-            patch_size (int): Spatial patch size for vision encoder
-            merge_size (int): Merge size between vision and LLM encoders
-            temporal_patch_size (int): Temporal patch size for video processing
-            min_pixels (int): Minimum allowed pixels in resized image
-            max_pixels (int): Maximum allowed pixels in resized image
-            image_mean (float/list): Mean values for normalization per channel
-            image_std (float/list): Std values for normalization per channel
-            rescale_factor (float): Scaling factor for pixel values (default 1/255)
-            do_rescale (bool): Whether to rescale images
-            do_normalize (bool): Whether to normalize images
-            resample: Resampling method for image resizing
-            **kwargs: Additional base class arguments
-        """
-        super().__init__(**kwargs)
-        self.patch_size = patch_size
-        self.merge_size = merge_size
-        self.temporal_patch_size = temporal_patch_size
-
-        self.min_pixels = min_pixels
-        self.max_pixels = max_pixels
-
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.rescale_factor = rescale_factor
-        self.do_rescale = do_rescale
-        self.do_normalize = do_normalize
-
-        self.resample = resample
-
-    def _preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        min_pixels: int,
-        max_pixels: int,
-        image_mean: Optional[Union[float, List[float]]],
-        image_std: Optional[Union[float, List[float]]],
-        rescale_factor: float,
-        do_rescale: bool,
-        do_normalize: bool,
-        resample: PILImageResampling,
-        data_format: Optional[ChannelDimension],
-        input_data_format: Optional[Union[str, ChannelDimension]],
-    ):
-        """
-        Internal method for image preprocessing pipeline.
-
-        Args:
-            images: Input image or batch of images
-            min_pixels: Minimum allowed pixels in output
-            max_pixels: Maximum allowed pixels in output
-            image_mean: Normalization mean values
-            image_std: Normalization std values
-            rescale_factor: Pixel value scaling factor
-            do_rescale: Whether to rescale pixel values
-            do_normalize: Whether to normalize pixel values
-            resample: Resampling method
-            data_format: Output channel format
-            input_data_format: Input channel format
-
-        Returns:
-            tuple: (flatten_patches, grid_dimensions)
-                - flatten_patches: Flattened image patches
-                - grid_dimensions: Grid dimensions [t, h, w]
-        """
-        images = make_list_of_images(images)
-
-        # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
-
-        if is_scaled_image(images[0]) and do_rescale:
-            data_processor_logger.warning(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
-            )
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images[0])
-
-        # Get original dimensions and calculate optimal resize dimensions
-        height, width = get_image_size(images[0], channel_dim=input_data_format)
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
-
-        processed_images = []
-        for image in images:
-            if height != resized_height or width != resized_width:
-                # Convert to uint8 before resizing to avoid double scaling
-                image = image.astype("uint8")
-                # Convert to PIL Image and resize
-                image = Image.fromarray(image)
-                image = resize(
-                    image,
-                    size=(resized_height, resized_width),
-                    resample=resample,
-                    data_format=input_data_format,
-                )
-
-            if do_rescale and do_normalize:
-                # Adjust mean and std for combined rescale+normalize
-                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
-                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
-                do_rescale = False  # Skip separate rescale step
-
-            # mutual exclusion and upper branch
-            if do_rescale:
-                image = image.astype(np.float32)
-                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
-
-            if do_normalize:
-                image = image.astype(np.float32)
-                image = normalize(
-                    image=image,
-                    mean=image_mean,
-                    std=image_std,
-                    data_format=input_data_format,
-                )
-
-            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
-            processed_images.append(image)
-
-        # Convert processed images to numpy array
-        patches = np.array(processed_images)
-
-        # Pad temporal dimension if needed
-        if patches.shape[0] % self.temporal_patch_size != 0:
-            repeats = np.repeat(
-                patches[-1][np.newaxis],
-                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
-                axis=0,
-            )
-            patches = np.concatenate([patches, repeats], axis=0)
-
-        # Convert to channels-first format if needed
-        if data_format == ChannelDimension.LAST:
-            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
-
-        grid_t, channel = patches.shape[:2]
-        grid_t = grid_t // self.temporal_patch_size
-
-        grid_h, grid_w = (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-        # Reshape into hierarchical patch structure
-        patches = patches.reshape(
-            [
-                grid_t,
-                self.temporal_patch_size,
-                channel,
-                grid_h // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-                grid_w // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-            ]
-        )
-        # Reorder dimensions for better memory access pattern
-        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
-        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
-
-        flatten_patches = patches.reshape(
-            [
-                grid_t * grid_h * grid_w,
-                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
-            ]
-        )
-
-        return flatten_patches, np.array([grid_t, grid_h, grid_w])
-
-    def preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        rescale_factor: Optional[float] = None,
-        do_rescale: Optional[bool] = None,
-        do_normalize: Optional[bool] = None,
-        resample: Optional[PILImageResampling] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
-    ):
-        """
-        Main preprocessing method for images/videos.
-
-        Args:
-            images: Input image/video data
-            min_pixels: Override for minimum pixels
-            max_pixels: Override for maximum pixels
-            image_mean: Override for normalization mean
-            image_std: Override for normalization std
-            rescale_factor: Override for rescaling factor
-            do_rescale: Override for rescaling flag
-            do_normalize: Override for normalization flag
-            resample: Override for resampling method
-            return_tensors: Desired output tensor format
-            data_format: Output channel dimension format
-            input_data_format: Input channel dimension format
-
-        Returns:
-            BatchFeature: Processed features containing:
-                - pixel_values: Preprocessed pixel data
-                - grid_thw: Grid dimensions [temporal, height, width]
-
-        Raises:
-            ValueError: For invalid image types or dimensions
-        """
-        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
-        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        resample = resample if resample is not None else self.resample
-
-        if images is not None and not valid_images(images):
-            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
-
-        pixel_values, grid_thw = self._preprocess(
-            images,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-            image_mean=image_mean,
-            image_std=image_std,
-            rescale_factor=rescale_factor,
-            do_rescale=do_rescale,
-            do_normalize=do_normalize,
-            resample=resample,
-            data_format=data_format,
-            input_data_format=input_data_format,
-        )
-        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
-        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/v1/qwen3_vl_processor/process.py b/fastdeploy/input/v1/qwen3_vl_processor/process.py
deleted file mode 100644
index 0731d04972d..00000000000
--- a/fastdeploy/input/v1/qwen3_vl_processor/process.py
+++ /dev/null
@@ -1,814 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import pickle
-from typing import Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import paddle
-import zmq
-from paddleformers.transformers import AutoTokenizer
-from PIL import Image
-
-from fastdeploy.engine.request import ImagePosition, Request
-from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
-from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.multimodal.hasher import MultimodalHasher
-from fastdeploy.utils import data_processor_logger
-
-from .image_processor import ImageProcessor, ceil_by_factor, floor_by_factor
-
-VIDEO_MIN_PIXELS = 128 * 28 * 28
-VIDEO_MAX_PIXELS = 768 * 28 * 28
-FRAME_FACTOR = 2
-FPS = 2.0
-FPS_MIN_FRAMES = 4
-FPS_MAX_FRAMES = 768
-
-
-def sample_frames(
-    frame_factor: int,
-    min_frames: int,
-    max_frames: int,
-    metadata: Optional[dict] = None,
-    fps: Optional[Union[int, float]] = -1,
-    num_frames: Optional[int] = -1,
-):
-    """
-    Sample frames from video according to specified criteria.
-
-    Args:
-        frame_factor: Ensure sampled frames are multiples of this factor
-        min_frames: Minimum number of frames to sample
-        max_frames: Maximum number of frames to sample
-        metadata: Video metadata containing fps information
-        fps: Target frames per second for sampling
-        num_frames: Exact number of frames to sample
-
-    Returns:
-        np.ndarray: Sampled video frames
-
-    Raises:
-        ValueError: If both fps and num_frames are specified,
-                   or if required metadata is missing,
-                   or if requested frames exceed available frames
-    """
-    if fps > 0 and num_frames > 0:
-        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
-
-    total_num_frames = metadata["num_of_frame"]
-
-    # If num_frames is not given but fps is, calculate num_frames from fps
-    if num_frames > 0:
-        num_frames = round(num_frames / frame_factor) * frame_factor
-    elif fps > 0:
-        if metadata is None:
-            raise ValueError(
-                "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
-                "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
-            )
-        # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
-        min_frames = ceil_by_factor(min_frames, frame_factor)
-        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
-
-        num_frames = total_num_frames / metadata["fps"] * fps
-
-        if num_frames > total_num_frames:
-            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
-
-        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
-        num_frames = floor_by_factor(num_frames, frame_factor)
-
-    if num_frames > total_num_frames:
-        raise ValueError(
-            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
-            "Decrease `num_frames` or `fps` for sampling."
-        )
-
-    # Hack code ensures that num_frames can always be divided by 4
-    # due to sched/resource_manager_v1.py 中 grid_thw.extend([[2, h, w]] * (t // 2))
-    if num_frames > 2 and num_frames % 4 != 0:
-        num_frames = (num_frames // 4) * 4  # 向下取整到 4 的倍数
-        total_num_frames = (total_num_frames // 4) * 4
-        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
-
-    # Calculate frame indices based on sampling strategy
-    if num_frames > 0:
-        # Evenly spaced sampling for target frame count
-        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
-    else:
-        # Keep all frames if no sampling requested
-        indices = np.arange(0, total_num_frames).astype(np.int32)
-
-    return indices
-
-
-class DataProcessor(MMBaseDataProcessor):
-    """
-    Processes multimodal inputs (text, images, videos) into model-ready formats.
-
-    Handles:
-    - Tokenization of text with special tokens for visual content
-    - Image and video preprocessing
-    - Generation of 3D positional embeddings
-    - Conversion of chat messages to model inputs
-
-    Attributes:
-        tokenizer: Text tokenizer instance
-        image_processor: Image/video preprocessor
-        image_token: Special token for image placeholders
-        video_token: Special token for video placeholders
-        vision_start: Token marking start of visual content
-    """
-
-    def __init__(
-        self,
-        model_path: str,
-        enable_processor_cache: bool = False,
-        video_min_frames: int = FPS_MIN_FRAMES,
-        video_max_frames: int = FPS_MAX_FRAMES,
-        video_target_frames: int = -1,
-        video_fps: int = FPS,
-        tokens_per_second: int = 2,
-        tokenizer=None,
-        **kwargs,
-    ) -> None:
-        """
-        Initialize the data processor.
-
-        Args:
-            model_path: Path to pretrained model
-            video_min_frames: Minimum frames to sample from videos
-            video_max_frames: Maximum frames to sample from videos
-            tokens_per_second: Temporal resolution for positional embeddings
-            **kwargs: Additional configuration
-        """
-        super().__init__()
-        self.min_frames = video_min_frames
-        self.max_frames = video_max_frames
-        self.target_frames = video_target_frames
-        self.fps = video_fps
-        self.frame_factor = FRAME_FACTOR
-
-        # Initialize tokenizer with left padding and fast tokenizer
-        if tokenizer is None:
-            self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True)
-            self.tokenizer.ignored_index = -100  # Set ignored index for loss calculation
-        else:
-            self.tokenizer = tokenizer
-
-        self.image_processor = ImageProcessor.from_pretrained(model_path)  # Initialize image processor
-        self.enable_processor_cache = enable_processor_cache
-
-        # Convolution sizes for patch aggregation
-        self.spatial_conv_size = self.image_processor.merge_size
-        self.temporal_conv_size = self.image_processor.temporal_patch_size
-
-        # Special tokens and IDs
-        self.image_token = "<|image_pad|>"
-        self.video_token = "<|video_pad|>"
-
-        self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
-        self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
-
-        self.vision_start = "<|vision_start|>"
-        self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start)
-
-        self.tokens_per_second = tokens_per_second
-
-        self.role_prefixes = {
-            "system": "",
-            "user": "User: ",
-            "bot": "Assistant: ",
-            "assistant": "Assistant: ",
-        }
-
-    @staticmethod
-    def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
-        """
-        Calculate the number of tokens in the multimodal input.
-        """
-        if isinstance(grid_thw, paddle.Tensor):
-            grid_thw = grid_thw.numpy()
-
-        if len(grid_thw) == 0:
-            return 0
-
-        def calc_one(thw):
-            t, h, w = map(int, thw)
-            return t * h * w // 4
-
-        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
-            return [calc_one(x) for x in grid_thw]
-
-        return calc_one(grid_thw)
-
-    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
-        """
-        Convert text with image/video placeholders into model inputs.
-
-        Args:
-            text: Input text with <|image@placeholder|> and <|video@placeholder|> markers
-            images: List of PIL Images corresponding to image placeholders
-            videos: List of video data corresponding to video placeholders
-            image_uuid: List of unique identifiers for each image, used for caching or hashing.
-            video_uuid: List of unique identifiers for each video, used for caching or hashing.
-
-        Returns:
-            Dict containing:
-                - input_ids: Token IDs
-                - token_type_ids: Type identifiers (text/image/video)
-                - position_ids: 3D positional embeddings
-                - images: Preprocessed visual features
-                - grid_thw: Spatial/temporal dimensions
-                - image_type_ids: Visual content type (0=image, 1=video)
-        """
-
-        outputs = {
-            "input_ids": [],
-            "token_type_ids": [],
-            "position_ids": [],
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "labels": [],
-            "cur_position": 0,
-            "video_cnt": 0,
-            "num_input_image_tokens": 0,
-            "num_input_video_tokens": 0,
-            "fps": [],
-            "mm_positions": [],
-            "mm_hashes": [],
-        }
-
-        # Define placeholders and their lengths
-        IMAGE_PLACEHOLDER = "<|image_pad|>"
-        VIDEO_PLACEHOLDER = "<|video_pad|>"
-        IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
-        VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
-
-        # Initialize tracking variables for text parsing
-        st, image_idx, video_idx = 0, 0, 0  # Start position, image counter, video counter
-        while st < len(text):
-            # Find next image or video placeholder in text
-            image_pos = text.find(IMAGE_PLACEHOLDER, st)
-            image_pos = len(text) if image_pos == -1 else image_pos  # Set to end if not found
-            video_pos = text.find(VIDEO_PLACEHOLDER, st)
-            video_pos = len(text) if video_pos == -1 else video_pos  # Set to end if not found
-            ed = min(image_pos, video_pos)  # End position is first placeholder found
-
-            self._add_text(text[st:ed], outputs)
-            if ed == len(text):
-                break
-
-            if ed == image_pos:
-                image = images[image_idx]
-                uuid = image_uuid[image_idx] if image_uuid else None
-                if not isinstance(image, tuple):
-                    self._add_image(image, outputs, uuid)
-                else:
-                    self._add_processed_image(image, outputs, uuid)
-                image_idx += 1
-                st = ed + IMAGE_PLACEHOLDER_LEN
-            else:
-                item = videos[video_idx]
-                uuid = video_uuid[video_idx] if video_uuid else None
-                if not isinstance(item, tuple):
-                    if isinstance(item, dict):
-                        frames, meta = self._load_and_process_video(item["video"], item)
-                    else:
-                        frames, meta = self._load_and_process_video(item, {})
-                    self._add_video(frames, meta, outputs, uuid)
-                else:
-                    # cached frames are already processed
-                    self._add_processed_video(item, outputs, uuid)
-                video_idx += 1
-                st = ed + VIDEO_PLACEHOLDER_LEN
-
-        return outputs
-
-    def prompt_token_ids2outputs(
-        self, request: Request, tgts: List[str] = None
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
-        outputs = {
-            "input_ids": [],
-            "token_type_ids": [],
-            "position_ids": [],
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "labels": [],
-            "cur_position": 0,
-            "video_cnt": 0,
-            "num_input_image_tokens": 0,
-            "num_input_video_tokens": 0,
-            "fps": [],
-            "mm_positions": [],
-            "mm_hashes": [],
-        }
-        prompt_token_ids = request.prompt_token_ids if request.prompt_token_ids else []
-        prompt_token_ids_len = len(prompt_token_ids)
-
-        if not request.messages:
-            self._add_text(prompt_token_ids, outputs)
-            return outputs
-
-        messages = parse_chat_messages(request.messages)
-        mm_items = []
-        for msg in messages:
-            role = msg.get("role")
-            assert role in self.role_prefixes, f"Unsupported role: {role}"
-
-            content = msg.get("content")
-            if not isinstance(content, list):
-                content = [content]
-            for item in content:
-                if item.get("type") in ["image", "video"]:
-                    mm_items.append(item)
-
-        missing_hashes, missing_idx = [], []
-        for idx, item in enumerate(mm_items):
-            if not item.get("data"):
-                missing_hashes.append(item.get("uuid"))
-                missing_idx.append(idx)
-
-        if len(missing_hashes) > 0 and not self.enable_processor_cache:
-            raise ValueError("Missing items cannot be retrieved without processor cache.")
-
-        dealer = None
-        if self.enable_processor_cache:
-            context = zmq.Context()
-            dealer = context.socket(zmq.DEALER)
-            dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
-            missing_items = self.get_processor_cache(dealer, missing_hashes)
-            for idx in range(len(missing_items)):
-                if not missing_items[idx]:
-                    raise ValueError(f"Missing item {idx} not found in processor cache")
-                mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
-        st, mm_idx = 0, 0
-        while st < prompt_token_ids_len:
-            if prompt_token_ids[st] != self.image_token_id:
-                cur_idx = st
-                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_token_id:
-                    cur_idx += 1
-                self._add_text(prompt_token_ids[st:cur_idx], outputs)
-                st = cur_idx
-                continue
-
-            if mm_idx >= len(mm_items):
-                raise ValueError("prompt token ids has more multimodal placeholder than in messages")
-
-            cur_idx = st
-            while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] == self.image_token_id:
-                cur_idx += 1
-
-            item = mm_items[mm_idx]
-            uuid = item.get("uuid")
-            token_len = cur_idx - st
-            if item.get("type") == "image":
-                image = item.get("data")
-                if not isinstance(image, tuple):
-                    self._add_image(image, outputs, uuid, token_len)
-                else:
-                    self._add_processed_image(image, outputs, uuid, token_len)
-            elif item.get("type") == "video":
-                video = item.get("data")
-                if not isinstance(video, tuple):
-                    if isinstance(video, dict):
-                        frames, meta = self._load_and_process_video(video["video"], video)
-                    else:
-                        frames, meta = self._load_and_process_video(video, {})
-                    self._add_video(frames, meta, outputs, uuid, token_len)
-                else:
-                    self._add_processed_video(video, outputs, uuid, token_len)
-            else:
-                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
-            mm_idx += 1
-            st = cur_idx
-
-        if mm_idx != len(mm_items):
-            raise ValueError("number of multimodal items does not match prompt token ids")
-
-        if self.enable_processor_cache:
-            missing_idx = set(missing_idx)
-            hashes_to_cache, items_to_cache = [], []
-            for idx in range(len(mm_items)):
-                if idx in missing_idx:
-                    continue
-                meta = {}
-                grid_thw = np.asarray(outputs["grid_thw"][idx])
-                if grid_thw.ndim > 1:
-                    t, h, w = grid_thw[0]
-                else:
-                    t, h, w = grid_thw
-                meta["thw"] = (int(t), int(h), int(w))
-                meta["fps"] = outputs["fps"][idx]
-                hashes_to_cache.append(outputs["mm_hashes"][idx])
-                items_to_cache.append((outputs["images"][idx], meta))
-            if hashes_to_cache:
-                self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
-        return outputs
-
-    def request2ids(
-        self, request: Request, tgts: List[str] = None
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
-        """
-        Convert chat request with multimodal messages into model inputs.
-
-        Args:
-            request: Request containing:
-                - messages: List of chat messages with text/image/video content
-                - request_id: Unique identifier for logging
-            tgts: Optional target sequences
-
-        Returns:
-            Dict with same structure as text2ids() output
-        """
-
-        messages = parse_chat_messages(request.messages)
-        mm_items = []
-        for msg in messages:
-            role = msg.get("role")
-            assert role in self.role_prefixes, f"Unsupported role: {role}"
-
-            content = msg.get("content")
-            if not isinstance(content, list):
-                content = [content]
-            for item in content:
-                if item.get("type") in ["image", "video"]:
-                    mm_items.append(item)
-
-        missing_hashes, missing_idx = [], []
-        for idx, item in enumerate(mm_items):
-            if not item.get("data"):
-                missing_hashes.append(item.get("uuid"))
-                missing_idx.append(idx)
-
-        if len(missing_hashes) > 0 and not self.enable_processor_cache:
-            raise ValueError("Missing items cannot be retrieved without processor cache.")
-
-        if self.enable_processor_cache:
-            context = zmq.Context()
-            dealer = context.socket(zmq.DEALER)
-            dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
-            missing_items = self.get_processor_cache(dealer, missing_hashes)
-            for idx in range(len(missing_items)):
-                if not missing_items[idx]:
-                    raise ValueError(f"Missing item {idx} not found in processor cache")
-                mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
-        images, videos = [], []
-        image_uuid, video_uuid = [], []
-        for item in mm_items:
-            if item.get("type") == "image":
-                images.append(item["data"])
-                image_uuid.append(item["uuid"])
-            elif item.get("type") == "video":
-                videos.append(item["data"])
-                video_uuid.append(item["uuid"])
-            else:
-                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
-
-        if self.tokenizer.chat_template is None:
-            raise ValueError("This model does not support chat template.")
-
-        chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
-        prompt = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True,
-            **chat_template_kwargs,
-        )
-        request.prompt_tokens = prompt
-
-        outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
-
-        if self.enable_processor_cache:
-            missing_idx = set(missing_idx)
-            hashes_to_cache, items_to_cache = [], []
-            for idx in range(len(mm_items)):
-                if idx in missing_idx:
-                    continue
-                meta = {}
-                grid_thw = np.asarray(outputs["grid_thw"][idx])
-                if grid_thw.ndim > 1:
-                    t, h, w = grid_thw[0]
-                else:
-                    t, h, w = grid_thw
-                meta["thw"] = (int(t), int(h), int(w))
-                meta["fps"] = outputs["fps"][idx]
-                hashes_to_cache.append(outputs["mm_hashes"][idx])
-                items_to_cache.append((outputs["images"][idx], meta))
-            if hashes_to_cache:
-                self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
-        return outputs
-
-    def _add_text(self, tokens, outputs: Dict) -> None:
-        """
-        Add text tokens to model inputs dictionary.
-
-        Args:
-            tokens: Text string or already tokenized IDs
-            outputs: Dictionary accumulating model inputs
-
-        Note:
-            - Handles both raw text and pre-tokenized inputs
-            - Updates position IDs for 3D embeddings
-        """
-        if not tokens:
-            return None
-
-        if isinstance(tokens, str):
-            tokens_str = self.tokenizer.tokenize(tokens)
-            tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
-
-        num_tokens = len(tokens)
-        outputs["input_ids"].extend(tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
-
-        pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-    def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray:
-        """
-        Generate 3D positional embeddings for text tokens.
-
-        Args:
-            start_pos: Starting position index
-            num_tokens: Number of tokens to generate positions for
-
-        Returns:
-            numpy.ndarray: 3D position IDs shaped (3, num_tokens)
-        """
-        text_array = np.arange(num_tokens).reshape(1, -1)
-        text_index = np.broadcast_to(text_array, (3, num_tokens))
-        position = text_index + start_pos
-        return position
-
-    def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len: Optional[int] = None) -> None:
-        """
-        Add image data to model inputs dictionary.
-
-        Args:
-            img: PIL Image to process
-            outputs: Dictionary accumulating model inputs
-
-        Note:
-            - Preprocesses image and calculates spatial dimensions
-            - Adds image token IDs and type markers
-            - Generates appropriate position embeddings
-        """
-        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
-        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
-        grid_thw = ret["grid_thw"].tolist()
-        if token_len is not None and token_len != num_tokens:
-            raise ValueError("image tokens num not match the size")
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-        outputs["num_input_image_tokens"] += int(num_tokens)
-
-        outputs["images"].append(ret["pixel_values"])
-        if not uuid:
-            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-        else:
-            outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(grid_thw)
-        outputs["image_type_ids"].append(0)
-
-        t, h, w = grid_thw
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
-
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["fps"].append(0)
-
-    def _add_processed_image(
-        self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: Optional[int] = None
-    ) -> None:
-        img, meta = img_cache
-        num_tokens = img.shape[0] // self.image_processor.merge_size**2
-        if token_len is not None and token_len != num_tokens:
-            raise ValueError("image tokens num not match the size")
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
-        _, h, w = meta["thw"]
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["images"].append(img)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[1, h, w]]))
-        outputs["image_type_ids"].append(0)
-
-        outputs["fps"].append(0)
-
-    def _add_video(
-        self, frames, meta: Dict, outputs: Dict, uuid: Optional[str], token_len: Optional[int] = None
-    ) -> None:
-        """
-        Add video data to model inputs dictionary.
-
-        Args:
-            frames: Video frames as numpy array
-            meta: Video metadata containing fps/duration
-            outputs: Dictionary accumulating model inputs
-
-        Note:
-            - Handles temporal dimension in position embeddings
-            - Uses video-specific token IDs and type markers
-        """
-        ret = self.image_processor.preprocess(
-            images=frames,
-            min_pixels=VIDEO_MIN_PIXELS,
-            max_pixels=VIDEO_MAX_PIXELS,
-        )
-
-        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
-        grid_thw = ret["grid_thw"].tolist()
-        if token_len is not None and token_len != num_tokens:
-            raise ValueError("video tokens num not match the size")
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        # Hack code. In order to adapt to the framework, only image_token can be passed
-        # The correct way should be to use [self.video_token_id] * num_tokens
-        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["num_input_video_tokens"] += int(num_tokens)
-
-        outputs["images"].append(ret["pixel_values"])
-        if not uuid:
-            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-        else:
-            outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(grid_thw)
-        outputs["image_type_ids"].extend([1] * grid_thw[0])
-
-        fps = meta["fps"]
-        second_per_grid_t = self.temporal_conv_size / fps
-        t, h, w = grid_thw
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
-
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["fps"].append(fps)
-
-    def _add_processed_video(
-        self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: Optional[int] = None
-    ) -> None:
-        frames, meta = frames_cache
-        num_tokens = frames.shape[0] // self.image_processor.merge_size**2
-        if token_len is not None and token_len != num_tokens:
-            raise ValueError("video tokens num not match the size")
-
-        t, h, w = meta["thw"]
-        outputs["images"].append(frames)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[t, h, w]]))
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["image_type_ids"].extend([1] * t)
-
-        fps = meta["fps"]
-        second_per_grid_t = self.temporal_conv_size / fps
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["fps"].append(fps)
-
-    def _compute_vision_positions(
-        self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float
-    ) -> np.ndarray:
-        """
-        Generate 3D position IDs for visual inputs.
-
-        Args:
-            start_pos: Base position in sequence
-            t: Temporal patches (1 for images)
-            h: Height in patches
-            w: Width in patches
-            second_per_grid_t: Time per temporal patch
-
-        Returns:
-            np.ndarray: Position IDs for [t,h,w] dimensions
-        """
-        h //= self.spatial_conv_size
-        w //= self.spatial_conv_size
-
-        tn = np.arange(t).reshape(-1, 1)
-        tn = np.broadcast_to(tn, (t, h * w))
-        tn = tn * int(second_per_grid_t) * self.tokens_per_second
-        t_index = tn.flatten()
-
-        hn = np.arange(h).reshape(1, -1, 1)
-        h_index = np.broadcast_to(hn, (t, h, w)).flatten()
-
-        wn = np.arange(w).reshape(1, 1, -1)
-        w_index = np.broadcast_to(wn, (t, h, w)).flatten()
-
-        position = np.stack([t_index, h_index, w_index]) + start_pos
-        return position
-
-    def _load_and_process_video(self, url: str, item: Dict) -> Tuple[np.ndarray, Dict]:
-        """
-        Load and preprocess video into frames.
-
-        Args:
-            url: Video file path or bytes
-            item: Dictionary containing processing parameters
-
-        Returns:
-            tuple: (frames, metadata) where:
-                - frames: Processed video frames as numpy array
-                - metadata: Updated video metadata dictionary
-        """
-        reader, meta, _ = read_video_decord(url, save_to_disk=False)
-
-        # Apply frame sampling if fps or target_frames specified
-        fps = item.get("fps", self.fps)
-        num_frames = item.get("target_frames", self.target_frames)
-
-        frame_indices = list(range(meta["num_of_frame"]))
-        if fps > 0 or num_frames > 0:
-            # Get frame sampling constraints
-            min_frames = item.get("min_frames", self.min_frames)
-            max_frames = item.get("max_frames", self.max_frames)
-
-            # Sample frames according to specifications
-            frame_indices = sample_frames(
-                frame_factor=self.frame_factor,  # Ensure divisible by temporal patch size
-                min_frames=min_frames,
-                max_frames=max_frames,
-                metadata=meta,
-                fps=fps,
-                num_frames=num_frames,
-            )
-
-            # Update metadata with new frame count and fps
-            meta["num_of_frame"] = len(frame_indices)
-            if fps is not None:
-                meta["fps"] = fps  # Use specified fps
-                meta["duration"] = len(frame_indices) / fps
-            else:
-                meta["fps"] = len(frame_indices) / meta["duration"]  # Calculate fps from sampled frames
-
-        frames = []
-        for idx in frame_indices:
-            frame = reader[idx].asnumpy()
-            image = Image.fromarray(frame, "RGB")
-            frames.append(image)
-        frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-
-        return frames, meta
-
-    def get_processor_cache(self, socket, mm_hashes: list[str]) -> list:
-        """
-        get cache correspond to given hash values
-        """
-        req = pickle.dumps(mm_hashes)
-        socket.send_multipart([b"", req])
-        _, resp = socket.recv_multipart()
-        mm_items = pickle.loads(resp)
-        data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
-
-        return mm_items
-
-    def update_processor_cache(self, socket, mm_hashes: list[str], mm_items):
-        """
-        update cache data
-        """
-        req = pickle.dumps((mm_hashes, mm_items))
-        socket.send_multipart([b"", req])
-        data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
diff --git a/fastdeploy/input/v1/qwen3_vl_processor/qwen3_vl_processor.py b/fastdeploy/input/v1/qwen3_vl_processor/qwen3_vl_processor.py
deleted file mode 100644
index c72de49bcc3..00000000000
--- a/fastdeploy/input/v1/qwen3_vl_processor/qwen3_vl_processor.py
+++ /dev/null
@@ -1,341 +0,0 @@
-"""
-# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import numpy as np
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.text_processor import DataProcessor as TextProcessor
-from fastdeploy.utils import data_processor_logger
-
-from .process import DataProcessor
-
-
-class Qwen3VLProcessor(TextProcessor):
-    """
-    Qwen Vision-Language processor for handling multimodal inputs.
-
-    This processor extends TextProcessor to support:
-    - Image and video processing
-    - Multimodal feature extraction
-    - Tokenization and position encoding
-    - Request processing and model input generation
-
-    Attributes:
-        processor (DataProcessor): Underlying data processor instance
-        tokenizer: Text tokenizer instance
-        limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
-    """
-
-    def __init__(
-        self,
-        config,
-        model_name_or_path,
-        limit_mm_per_prompt=None,
-        mm_processor_kwargs=None,
-        reasoning_parser_obj=None,
-        tool_parser_obj=None,
-        enable_processor_cache=False,
-    ):
-        """
-        Initialize QwenVLProcessor instance.
-
-        Args:
-            config: Model configuration object
-            model_name_or_path (str): Pretrained model name or path
-            limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
-            mm_processor_kwargs (dict, optional): Multimodal processor arguments
-            reasoning_parser_obj: Reasoning parser instance
-            tool_parser_obj: Tool parser instance
-        """
-        super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)
-
-        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
-        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
-        self.processor = DataProcessor(
-            model_path=model_name_or_path,
-            enable_processor_cache=enable_processor_cache,
-            # tokens_per_second=config.vision_config.tokens_per_second,
-            tokenizer=self.tokenizer,
-            **processor_kwargs,
-        )
-        self.image_patch_id = self.processor.image_token_id
-        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
-
-    def _parse_processor_kwargs(self, kwargs):
-        """
-        Parse and validate multimodal processor arguments.
-
-        Args:
-            kwargs (dict): Processor configuration arguments
-
-        Returns:
-            dict: Validated processor arguments
-
-        Raises:
-            ValueError: If arguments format is invalid
-        """
-        if not kwargs:
-            return {}
-
-        try:
-            if not isinstance(kwargs, dict):
-                raise ValueError("mm-processor-kwargs must be a dictionary")
-
-            # Validate kwargs types against expected schema
-            data_processor_logger.info(f"Processing kwargs: {kwargs}")
-            expected_types = {
-                "video_max_frames": int,  # Maximum video frames parameter
-                "video_min_frames": int,  # Minimum video frames parameter
-            }
-
-            for key, value in kwargs.items():
-                if key in expected_types and not isinstance(value, expected_types[key]):
-                    raise ValueError(
-                        f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
-                    )
-
-            return kwargs
-
-        except Exception as e:
-            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
-            return {}
-
-    def _parse_limits(self, limits):
-        """
-        Parse and validate multimodal input limits.
-
-        Args:
-            limits (dict): Input limits configuration
-
-        Returns:
-            dict: Validated limits with defaults
-
-        Raises:
-            ValueError: If limits format is invalid
-        """
-        DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}
-
-        if not limits:
-            return DEFAULT_LIMITS
-
-        try:
-            if not isinstance(limits, dict):
-                raise ValueError("limit-mm-per-prompt must be a dictionary")
-            data_processor_logger.info(f"_parse_limits:{limits}")
-            return {**DEFAULT_LIMITS, **limits}
-        except Exception as e:
-            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
-            return DEFAULT_LIMITS
-
-    def _check_mm_limits(self, item):
-        """
-        Validate multimodal inputs against configured limits.
-
-        Args:
-            item: Input request item to validate
-
-        Raises:
-            ValueError: If input exceeds configured limits
-        """
-        if isinstance(item, dict):
-            # 请求包含prompt和multi_modal_data
-            mm_data = item
-        else:
-            # 请求包含messages
-            mm_data = {"image": [], "video": []}
-
-            for message in item:
-                if isinstance(message.get("content"), list):
-                    for part in message["content"]:
-                        if part.get("type") in ["image_url", "image"]:
-                            mm_data["image"].append(part)
-                        elif part.get("type") in ["video_url", "video"]:
-                            mm_data["video"].append(part)
-
-        for modality, data in mm_data.items():
-            if modality in self.limit_mm_per_prompt:
-                limit = self.limit_mm_per_prompt[modality]
-                if len(data) > limit:
-                    raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
-
-    def process_request(self, request, max_model_len=None, **kwargs):
-        """
-        Process incoming request and generate model inputs.
-
-        Args:
-            request: Input request object
-            max_model_len (int, optional): Maximum context length
-            **kwargs: Additional processing parameters
-
-        Returns:
-            Request: Processed request with model inputs
-        """
-        task = request.to_dict()
-        task["enable_thinking"] = kwargs.get("enable_thinking", False)
-        self.process_request_dict(task, max_model_len)
-        request = Request.from_dict(task)
-        request = self._apply_default_parameters(request)
-        return request
-
-    def process_request_dict(self, request, max_model_len=None, **kwargs):
-        """
-        Process request dictionary into model inputs.
-
-        Args:
-            request Request: Input request dictionary
-            max_model_len (int, optional): Maximum context length
-
-        Returns:
-            Request: Processed request with model inputs
-
-        Raises:
-            ValueError: If request format is invalid
-        """
-
-        request = self._apply_default_parameters(request)
-        if not request.eos_token_ids:
-            request.eos_token_ids = self.eos_token_ids
-
-        stop_sequences = request.sampling_params.stop
-        if stop_sequences:
-            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
-            request.sampling_params.stop_token_ids = stop_seqs
-            request.sampling_params.stop_seqs_len = stop_seqs_len
-
-        bad_words = request.sampling_params.bad_words
-        bad_words_token_ids = request.sampling_params.bad_words_token_ids
-        if bad_words:
-            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
-            request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
-        if request.prompt_token_ids:
-            messages = request.messages
-            if messages:
-                self._check_mm_limits(messages)
-            if request.enable_thinking is None:
-                request.enable_thinking = kwargs.get("enable_thinking", False)
-            outputs = self.processor.prompt_token_ids2outputs(request)
-
-        elif request.prompt:
-            multimodal_data = request.multimodal_data
-            if multimodal_data is None:
-                multimodal_data = {}
-            self._check_mm_limits(multimodal_data)
-            images = multimodal_data.get("image", None)
-            videos = multimodal_data.get("video", None)
-            outputs = self.processor.text2ids(request.prompt, images, videos)
-
-        elif request.messages:
-            messages = request.messages
-            self._check_mm_limits(messages)
-            chat_template_kwargs = request.chat_template_kwargs
-            if chat_template_kwargs:
-                if isinstance(chat_template_kwargs, dict):
-                    for k, v in chat_template_kwargs.items():
-                        if getattr(request, k, None) is None:
-                            setattr(request, k, v)
-                else:
-                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-            if request.enable_thinking is None:
-                request.enable_thinking = kwargs.get("enable_thinking", False)
-            outputs = self.processor.request2ids(request)
-            delattr(request, "chat_template_kwargs")
-        else:
-            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
-        # Handle continuation of previous generation by appending existing tokens
-        if request.completion_token_ids:
-            self.append_completion_tokens(outputs, request.completion_token_ids)
-
-        # qwen25_vl not support thinking
-        request.enable_thinking = False
-
-        outputs = self.pack_outputs(outputs)
-
-        request.prompt_token_ids = (
-            outputs["input_ids"].tolist()
-            if not getattr(request, "prompt_token_ids", None)
-            else request.prompt_token_ids
-        )
-        request.prompt_token_ids_len = len(request.prompt_token_ids)
-        request.multimodal_inputs = outputs
-
-        # Handle prompt truncation if exceeds model context length
-        if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
-            request.prompt_token_ids = request.prompt_token_ids[
-                : max_model_len - 1
-            ]  # Leave space for at least 1 new token
-
-        # Set default max_tokens if not specified
-        max_tokens = max_model_len - len(request.prompt_token_ids)
-        if getattr(request.sampling_params, "max_tokens", None) is None:
-            request.sampling_params.max_tokens = max(1, max_tokens)
-        else:
-            request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
-        data_processor_logger.info(f"Processed request {request}")
-
-        return request
-
-    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
-        """
-        Append completion tokens to existing outputs.
-
-        Args:
-            outputs: Current model outputs
-            completion_token_ids: completion tokens to append
-        """
-
-        num_tokens = len(completion_token_ids)
-        multimodal_inputs["input_ids"].extend(completion_token_ids)
-        multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
-
-        pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
-        multimodal_inputs["position_ids"].append(pos_ids)
-        multimodal_inputs["cur_position"] += num_tokens
-
-    def pack_outputs(self, outputs):
-        """
-        Prepare final output dictionary for model.
-
-        Args:
-            outputs: Intermediate processing outputs
-
-        Returns:
-            dict: Packed output dictionary with all required fields
-        """
-        if not outputs["images"]:
-            outputs["images"] = None  # No images case
-            outputs["grid_thw"] = None  # No spatial dimensions
-            outputs["image_type_ids"] = None  # No type IDs
-        else:
-            outputs["images"] = np.vstack(outputs["images"])  # Stack image features vertically
-            outputs["grid_thw"] = np.vstack(outputs["grid_thw"])  # Stack spatial dimensions
-            outputs["image_type_ids"] = np.array(outputs["image_type_ids"])  # Convert to numpy array
-
-        # Convert all outputs to numpy arrays with appropriate types
-        outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)  # Token IDs as int64
-        outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)  # Type IDs as int64
-        outputs["position_ids"] = np.concatenate(
-            outputs["position_ids"], axis=1, dtype=np.int64
-        )  # Concatenate position ID
-
-        outputs["image_patch_id"] = self.processor.image_token_id
-        outputs["video_patch_id"] = self.processor.video_token_id
-        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
-
-        outputs["mm_num_token_func"] = self.processor.mm_num_tokens
-
-        return outputs
diff --git a/fastdeploy/input/v1/qwen_vl_processor/__init__.py b/fastdeploy/input/v1/qwen_vl_processor/__init__.py
deleted file mode 100644
index c876cde7125..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .process import DataProcessor
-from .qwen_vl_processor import QwenVLProcessor
-
-__all__ = [
-    "DataProcessor",
-    "QwenVLProcessor",
-]
diff --git a/fastdeploy/input/v1/qwen_vl_processor/image_processor.py b/fastdeploy/input/v1/qwen_vl_processor/image_processor.py
deleted file mode 100644
index b6a1db19bc5..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/image_processor.py
+++ /dev/null
@@ -1,442 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import math
-from typing import List, Optional, Union
-
-import numpy as np
-import paddle
-import PIL
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_transforms import (
-    normalize,
-    rescale,
-    resize,
-    to_channel_dimension_format,
-)
-from paddleformers.transformers.image_utils import (
-    ChannelDimension,
-    ImageInput,
-    PILImageResampling,
-    get_image_size,
-    infer_channel_dimension_format,
-    make_list_of_images,
-    to_numpy_array,
-    valid_images,
-)
-from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
-from PIL import Image
-
-from fastdeploy.utils import data_processor_logger
-
-OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
-OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
-
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-
-
-VideoInput = Union[
-    List["PIL.Image.Image"],
-    "np.ndarray",
-    "paddle.Tensor",
-    List["np.ndarray"],
-    List["paddle.Tensor"],
-    List[List["PIL.Image.Image"]],
-    List[List["np.ndarray"]],
-    List[List["paddle.Tensor"]],
-]
-
-
-def round_by_factor(number: int, factor: int) -> int:
-    """
-    Round number to nearest multiple of factor.
-
-    Args:
-        number: Input number to round
-        factor: Rounding factor
-
-    Returns:
-        int: Rounded number
-    """
-    return round(number / factor) * factor
-
-
-def ceil_by_factor(number: int, factor: int) -> int:
-    """
-    Round number up to nearest multiple of factor.
-
-    Args:
-        number: Input number to round
-        factor: Rounding factor
-
-    Returns:
-        int: Rounded number
-    """
-    return math.ceil(number / factor) * factor
-
-
-def floor_by_factor(number: int, factor: int) -> int:
-    """
-    Round number down to nearest multiple of factor.
-
-    Args:
-        number: Input number to round
-        factor: Rounding factor
-
-    Returns:
-        int: Rounded number
-    """
-    return math.floor(number / factor) * factor
-
-
-def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
-    """
-    Smart image resizing that maintains aspect ratio and respects constraints.
-
-    Args:
-        height: Original image height
-        width: Original image width
-        factor: Patch size factor
-        min_pixels: Minimum allowed pixels
-        max_pixels: Maximum allowed pixels
-        max_ratio: Maximum allowed aspect ratio
-
-    Returns:
-        tuple: (new_height, new_width)
-
-    Raises:
-        ValueError: If calculated dimensions are invalid
-    """
-    if max(height, width) / min(height, width) > max_ratio:
-        if height > width:
-            new_width = max(factor, round_by_factor(width, factor))
-            new_height = floor_by_factor(new_width * max_ratio, factor)
-        else:
-            new_height = max(factor, round_by_factor(height, factor))
-            new_width = floor_by_factor(new_height * max_ratio, factor)
-
-        data_processor_logger.info(
-            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
-              resize to {max(new_height, new_width) / min(new_height, new_width)}"
-        )
-
-        height = new_height
-        width = new_width
-
-    h_bar = max(factor, round_by_factor(height, factor))
-    w_bar = max(factor, round_by_factor(width, factor))
-    if h_bar * w_bar > max_pixels:
-        beta = math.sqrt((height * width) / max_pixels)
-        h_bar = floor_by_factor(height / beta, factor)
-        w_bar = floor_by_factor(width / beta, factor)
-    elif h_bar * w_bar < min_pixels:
-        beta = math.sqrt(min_pixels / (height * width))
-        h_bar = ceil_by_factor(height * beta, factor)
-        w_bar = ceil_by_factor(width * beta, factor)
-
-    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
-        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
-
-    return h_bar, w_bar
-
-
-def is_scaled_image(image: np.ndarray) -> bool:
-    """
-    Check if image pixel values are already normalized to [0, 1] range.
-
-    Args:
-        image: Input image array
-
-    Returns:
-        bool: True if image is already scaled
-    """
-    if image.dtype == np.uint8:
-        return False
-
-    # It's possible the image has pixel values in [0, 255] but is of floating type
-    return np.min(image) >= 0 and np.max(image) <= 1
-
-
-class ImageProcessor(BaseImageProcessor):
-    """
-    Adaptive image processor for dynamic image resizing and preprocessing.
-
-    This processor handles image resizing, rescaling, normalization and format conversion.
-    It dynamically adjusts image dimensions based on original size and specified constraints.
-    """
-
-    def __init__(
-        self,
-        patch_size: int = 14,
-        merge_size: int = 2,
-        temporal_patch_size: int = 2,
-        min_pixels: int = MIN_PIXELS,
-        max_pixels: int = MAX_PIXELS,
-        image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
-        image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
-        rescale_factor: float = 1 / 255,
-        do_rescale: bool = True,
-        do_normalize: bool = True,
-        resample: PILImageResampling = PILImageResampling.BICUBIC,
-        **kwargs,
-    ) -> None:
-        """
-        Initialize image processor with configuration parameters.
-
-        Args:
-            patch_size (int): Spatial patch size for vision encoder
-            merge_size (int): Merge size between vision and LLM encoders
-            temporal_patch_size (int): Temporal patch size for video processing
-            min_pixels (int): Minimum allowed pixels in resized image
-            max_pixels (int): Maximum allowed pixels in resized image
-            image_mean (float/list): Mean values for normalization per channel
-            image_std (float/list): Std values for normalization per channel
-            rescale_factor (float): Scaling factor for pixel values (default 1/255)
-            do_rescale (bool): Whether to rescale images
-            do_normalize (bool): Whether to normalize images
-            resample: Resampling method for image resizing
-            **kwargs: Additional base class arguments
-        """
-        super().__init__(**kwargs)
-        self.patch_size = patch_size
-        self.merge_size = merge_size
-        self.temporal_patch_size = temporal_patch_size
-
-        self.min_pixels = min_pixels
-        self.max_pixels = max_pixels
-
-        self.image_mean = image_mean
-        self.image_std = image_std
-        self.rescale_factor = rescale_factor
-        self.do_rescale = do_rescale
-        self.do_normalize = do_normalize
-
-        self.resample = resample
-
-    def _preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        min_pixels: int,
-        max_pixels: int,
-        image_mean: Optional[Union[float, List[float]]],
-        image_std: Optional[Union[float, List[float]]],
-        rescale_factor: float,
-        do_rescale: bool,
-        do_normalize: bool,
-        resample: PILImageResampling,
-        data_format: Optional[ChannelDimension],
-        input_data_format: Optional[Union[str, ChannelDimension]],
-    ):
-        """
-        Internal method for image preprocessing pipeline.
-
-        Args:
-            images: Input image or batch of images
-            min_pixels: Minimum allowed pixels in output
-            max_pixels: Maximum allowed pixels in output
-            image_mean: Normalization mean values
-            image_std: Normalization std values
-            rescale_factor: Pixel value scaling factor
-            do_rescale: Whether to rescale pixel values
-            do_normalize: Whether to normalize pixel values
-            resample: Resampling method
-            data_format: Output channel format
-            input_data_format: Input channel format
-
-        Returns:
-            tuple: (flatten_patches, grid_dimensions)
-                - flatten_patches: Flattened image patches
-                - grid_dimensions: Grid dimensions [t, h, w]
-        """
-        images = make_list_of_images(images)
-
-        # All transformations expect numpy arrays.
-        images = [to_numpy_array(image) for image in images]
-
-        if is_scaled_image(images[0]) and do_rescale:
-            data_processor_logger.warning(
-                "It looks like you are trying to rescale already rescaled images. If the input"
-                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
-            )
-        if input_data_format is None:
-            # We assume that all images have the same channel dimension format.
-            input_data_format = infer_channel_dimension_format(images[0])
-
-        # Get original dimensions and calculate optimal resize dimensions
-        height, width = get_image_size(images[0], channel_dim=input_data_format)
-        resized_height, resized_width = smart_resize(
-            height,
-            width,
-            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-        )
-
-        processed_images = []
-        for image in images:
-            if height != resized_height or width != resized_width:
-                # Convert to uint8 before resizing to avoid double scaling
-                image = image.astype("uint8")
-                # Convert to PIL Image and resize
-                image = Image.fromarray(image)
-                image = resize(
-                    image,
-                    size=(resized_height, resized_width),
-                    resample=resample,
-                    data_format=input_data_format,
-                )
-
-            if do_rescale and do_normalize:
-                # Adjust mean and std for combined rescale+normalize
-                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
-                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
-                do_rescale = False  # Skip separate rescale step
-
-            if do_rescale:
-                image = image.astype(np.float32)
-                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
-
-            if do_normalize:
-                image = image.astype(np.float32)
-                image = normalize(
-                    image=image,
-                    mean=image_mean,
-                    std=image_std,
-                    data_format=input_data_format,
-                )
-
-            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
-            processed_images.append(image)
-
-        # Convert processed images to numpy array
-        patches = np.array(processed_images)
-
-        # Pad temporal dimension if needed
-        if patches.shape[0] % self.temporal_patch_size != 0:
-            repeats = np.repeat(
-                patches[-1][np.newaxis],
-                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
-                axis=0,
-            )
-            patches = np.concatenate([patches, repeats], axis=0)
-
-        # Convert to channels-first format if needed
-        if data_format == ChannelDimension.LAST:
-            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
-
-        grid_t, channel = patches.shape[:2]
-        grid_t = grid_t // self.temporal_patch_size
-
-        grid_h, grid_w = (
-            resized_height // self.patch_size,
-            resized_width // self.patch_size,
-        )
-        # Reshape into hierarchical patch structure
-        patches = patches.reshape(
-            [
-                grid_t,
-                self.temporal_patch_size,
-                channel,
-                grid_h // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-                grid_w // self.merge_size,
-                self.merge_size,
-                self.patch_size,
-            ]
-        )
-        # Reorder dimensions for better memory access pattern
-        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
-        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
-
-        flatten_patches = patches.reshape(
-            [
-                grid_t * grid_h * grid_w,
-                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
-            ]
-        )
-
-        return flatten_patches, np.array([grid_t, grid_h, grid_w])
-
-    def preprocess(
-        self,
-        images: Union[ImageInput, VideoInput],
-        min_pixels: Optional[int] = None,
-        max_pixels: Optional[int] = None,
-        image_mean: Optional[Union[float, List[float]]] = None,
-        image_std: Optional[Union[float, List[float]]] = None,
-        rescale_factor: Optional[float] = None,
-        do_rescale: Optional[bool] = None,
-        do_normalize: Optional[bool] = None,
-        resample: Optional[PILImageResampling] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
-        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
-    ):
-        """
-        Main preprocessing method for images/videos.
-
-        Args:
-            images: Input image/video data
-            min_pixels: Override for minimum pixels
-            max_pixels: Override for maximum pixels
-            image_mean: Override for normalization mean
-            image_std: Override for normalization std
-            rescale_factor: Override for rescaling factor
-            do_rescale: Override for rescaling flag
-            do_normalize: Override for normalization flag
-            resample: Override for resampling method
-            return_tensors: Desired output tensor format
-            data_format: Output channel dimension format
-            input_data_format: Input channel dimension format
-
-        Returns:
-            BatchFeature: Processed features containing:
-                - pixel_values: Preprocessed pixel data
-                - grid_thw: Grid dimensions [temporal, height, width]
-
-        Raises:
-            ValueError: For invalid image types or dimensions
-        """
-        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
-        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
-        image_mean = image_mean if image_mean is not None else self.image_mean
-        image_std = image_std if image_std is not None else self.image_std
-        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
-        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
-        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
-        resample = resample if resample is not None else self.resample
-
-        if images is not None and not valid_images(images):
-            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
-
-        pixel_values, grid_thw = self._preprocess(
-            images,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels,
-            image_mean=image_mean,
-            image_std=image_std,
-            rescale_factor=rescale_factor,
-            do_rescale=do_rescale,
-            do_normalize=do_normalize,
-            resample=resample,
-            data_format=data_format,
-            input_data_format=input_data_format,
-        )
-        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
-        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/v1/qwen_vl_processor/process.py b/fastdeploy/input/v1/qwen_vl_processor/process.py
deleted file mode 100644
index 3aaade025cf..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/process.py
+++ /dev/null
@@ -1,591 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import pickle
-from typing import Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import paddle
-import zmq
-from paddleformers.transformers import AutoTokenizer
-from PIL import Image
-
-from fastdeploy.engine.request import ImagePosition, Request
-from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
-from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.multimodal.hasher import MultimodalHasher
-from fastdeploy.utils import data_processor_logger
-
-from .image_processor import ImageProcessor
-from .process_video import sample_frames
-
-FRAME_FACTOR = 2
-FPS = 2.0
-FPS_MIN_FRAMES = 4
-FPS_MAX_FRAMES = 768
-
-
-class DataProcessor(MMBaseDataProcessor):
-    """
-    Processes multimodal inputs (text, images, videos) into model-ready formats.
-
-    Handles:
-    - Tokenization of text with special tokens for visual content
-    - Image and video preprocessing
-    - Generation of 3D positional embeddings
-    - Conversion of chat messages to model inputs
-
-    Attributes:
-        tokenizer: Text tokenizer instance
-        image_processor: Image/video preprocessor
-        image_token: Special token for image placeholders
-        video_token: Special token for video placeholders
-        vision_start: Token marking start of visual content
-    """
-
-    def __init__(
-        self,
-        model_path: str,
-        enable_processor_cache: bool = False,
-        video_min_frames: int = FPS_MIN_FRAMES,
-        video_max_frames: int = FPS_MAX_FRAMES,
-        video_target_frames: int = -1,
-        video_fps: int = FPS,
-        tokens_per_second: int = 2,
-        tokenizer=None,
-        **kwargs,
-    ) -> None:
-        """
-        Initialize the data processor.
-
-        Args:
-            model_path: Path to pretrained model
-            video_min_frames: Minimum frames to sample from videos
-            video_max_frames: Maximum frames to sample from videos
-            tokens_per_second: Temporal resolution for positional embeddings
-            **kwargs: Additional configuration
-        """
-        super().__init__()
-        self.min_frames = video_min_frames
-        self.max_frames = video_max_frames
-        self.target_frames = video_target_frames
-        self.fps = video_fps
-        self.frame_factor = FRAME_FACTOR
-
-        # Initialize tokenizer with left padding and fast tokenizer
-        if tokenizer is None:
-            self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True)
-            self.tokenizer.ignored_index = -100  # Set ignored index for loss calculation
-        else:
-            self.tokenizer = tokenizer
-        self.image_processor = ImageProcessor.from_pretrained(model_path)  # Initialize image processor
-        self.enable_processor_cache = enable_processor_cache
-
-        # Convolution sizes for patch aggregation
-        self.spatial_conv_size = self.image_processor.merge_size
-        self.temporal_conv_size = self.image_processor.temporal_patch_size
-
-        # Special tokens and IDs
-        self.image_token = "<|image_pad|>"
-        self.video_token = "<|video_pad|>"
-
-        self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
-        self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
-
-        self.vision_start = "<|vision_start|>"
-        self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start)
-
-        self.tokens_per_second = tokens_per_second
-
-        self.role_prefixes = {
-            "system": "",
-            "user": "User: ",
-            "bot": "Assistant: ",
-            "assistant": "Assistant: ",
-        }
-
-    @staticmethod
-    def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
-        """
-        Calculate the number of tokens in the multimodal input.
-        """
-        if isinstance(grid_thw, paddle.Tensor):
-            grid_thw = grid_thw.numpy()
-
-        if len(grid_thw) == 0:
-            return 0
-
-        def calc_one(thw):
-            t, h, w = map(int, thw)
-            return t * h * w // 4
-
-        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
-            return [calc_one(x) for x in grid_thw]
-
-        return calc_one(grid_thw)
-
-    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
-        """
-        Convert text with image/video placeholders into model inputs.
-
-        Args:
-            text: Input text with <|image@placeholder|> and <|video@placeholder|> markers
-            images: List of PIL Images corresponding to image placeholders
-            videos: List of video data corresponding to video placeholders
-            image_uuid: List of unique identifiers for each image, used for caching or hashing.
-            video_uuid: List of unique identifiers for each video, used for caching or hashing.
-
-        Returns:
-            Dict containing:
-                - input_ids: Token IDs
-                - token_type_ids: Type identifiers (text/image/video)
-                - position_ids: 3D positional embeddings
-                - images: Preprocessed visual features
-                - grid_thw: Spatial/temporal dimensions
-                - image_type_ids: Visual content type (0=image, 1=video)
-        """
-
-        outputs = {
-            "input_ids": [],
-            "token_type_ids": [],
-            "position_ids": [],
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "labels": [],
-            "cur_position": 0,
-            "video_cnt": 0,
-            "num_input_image_tokens": 0,
-            "num_input_video_tokens": 0,
-            "fps": [],
-            "mm_positions": [],
-            "mm_hashes": [],
-        }
-
-        # Define placeholders and their lengths
-        IMAGE_PLACEHOLDER = "<|image_pad|>"
-        VIDEO_PLACEHOLDER = "<|video_pad|>"
-        IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
-        VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
-
-        # Initialize tracking variables for text parsing
-        st, image_idx, video_idx = 0, 0, 0  # Start position, image counter, video counter
-        while st < len(text):
-            # Find next image or video placeholder in text
-            image_pos = text.find(IMAGE_PLACEHOLDER, st)
-            image_pos = len(text) if image_pos == -1 else image_pos  # Set to end if not found
-            video_pos = text.find(VIDEO_PLACEHOLDER, st)
-            video_pos = len(text) if video_pos == -1 else video_pos  # Set to end if not found
-            ed = min(image_pos, video_pos)  # End position is first placeholder found
-
-            self._add_text(text[st:ed], outputs)
-            if ed == len(text):
-                break
-
-            if ed == image_pos:
-                image = images[image_idx]
-                uuid = image_uuid[image_idx] if image_uuid else None
-                if not isinstance(image, tuple):
-                    self._add_image(image, outputs, uuid)
-                else:
-                    self._add_processed_image(image, outputs, uuid)
-                image_idx += 1
-                st = ed + IMAGE_PLACEHOLDER_LEN
-            else:
-                item = videos[video_idx]
-                uuid = video_uuid[video_idx] if video_uuid else None
-                if not isinstance(item, tuple):
-                    if isinstance(item, dict):
-                        frames, meta = self._load_and_process_video(item["video"], item)
-                    else:
-                        frames, meta = self._load_and_process_video(item, {})
-                    self._add_video(frames, meta, outputs, uuid)
-                else:
-                    # cached frames are already processed
-                    self._add_processed_video(item, outputs, uuid)
-                video_idx += 1
-                st = ed + VIDEO_PLACEHOLDER_LEN
-
-        return outputs
-
-    def request2ids(
-        self, request: Request, tgts: List[str] = None
-    ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
-        """
-        Convert chat request with multimodal messages into model inputs.
-
-        Args:
-            request: Dictionary containing:
-                - messages: List of chat messages with text/image/video content
-                - request_id: Unique identifier for logging
-            tgts: Optional target sequences
-
-        Returns:
-            Dict with same structure as text2ids() output
-        """
-
-        # Parse and validate chat messages
-        messages = parse_chat_messages(request.messages)
-        mm_items = []
-        for msg in messages:
-            role = msg.get("role")
-            assert role in self.role_prefixes, f"Unsupported role: {role}"
-
-            # Normalize content to list format
-            content = msg.get("content")
-            if not isinstance(content, list):
-                content = [content]
-            # Collect all visual content items
-            for item in content:
-                if item.get("type") in ["image", "video"]:
-                    mm_items.append(item)
-
-        missing_hashes, missing_idx = [], []
-        for idx, item in enumerate(mm_items):
-            if not item.get("data"):
-                # raw data not provided, should be retrieved from processor cache
-                missing_hashes.append(item.get("uuid"))
-                missing_idx.append(idx)
-
-        if len(missing_hashes) > 0 and not self.enable_processor_cache:
-            raise ValueError("Missing items cannot be retrieved without processor cache.")
-
-        if self.enable_processor_cache:
-            context = zmq.Context()
-            dealer = context.socket(zmq.DEALER)
-            dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
-            missing_items = self.get_processor_cache(dealer, missing_hashes)
-            for idx in range(len(missing_items)):
-                if not missing_items[idx]:
-                    raise ValueError(f"Missing item {idx} not found in processor cache")
-                mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
-        images, videos = [], []
-        image_uuid, video_uuid = [], []
-        for item in mm_items:
-            if item.get("type") == "image":
-                images.append(item["data"])
-                image_uuid.append(item["uuid"])
-            elif item.get("type") == "video":
-                videos.append(item["data"])
-                video_uuid.append(item["uuid"])
-            else:
-                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
-
-        if self.tokenizer.chat_template is None:
-            raise ValueError("This model does not support chat template.")
-
-        chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
-        prompt = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True,
-            **chat_template_kwargs,
-        )
-        request.prompt_tokens = prompt
-
-        outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
-
-        if self.enable_processor_cache:
-            missing_idx = set(missing_idx)
-            hashes_to_cache, items_to_cache = [], []
-            for idx in range(len(mm_items)):
-                if idx in missing_idx:
-                    continue
-                meta = {}
-                t, h, w = outputs["grid_thw"][idx]
-                meta["thw"] = (t, h, w)
-                meta["fps"] = outputs["fps"][idx]
-                hashes_to_cache.append(outputs["mm_hashes"][idx])
-                items_to_cache.append((outputs["images"][idx], meta))
-            self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
-        return outputs
-
-    def _add_text(self, tokens, outputs: Dict) -> None:
-        """
-        Add text tokens to model inputs dictionary.
-
-        Args:
-            tokens: Text string or already tokenized IDs
-            outputs: Dictionary accumulating model inputs
-
-        Note:
-            - Handles both raw text and pre-tokenized inputs
-            - Updates position IDs for 3D embeddings
-        """
-        if not tokens:
-            return None
-
-        if isinstance(tokens, str):
-            tokens_str = self.tokenizer.tokenize(tokens)
-            tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
-
-        num_tokens = len(tokens)
-        outputs["input_ids"].extend(tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
-
-        pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-    def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray:
-        """
-        Generate 3D positional embeddings for text tokens.
-
-        Args:
-            start_pos: Starting position index
-            num_tokens: Number of tokens to generate positions for
-
-        Returns:
-            numpy.ndarray: 3D position IDs shaped (3, num_tokens)
-        """
-        text_array = np.arange(num_tokens).reshape(1, -1)
-        text_index = np.broadcast_to(text_array, (3, num_tokens))
-        position = text_index + start_pos
-        return position
-
-    def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
-        """
-        Add image data to model inputs dictionary.
-
-        Args:
-            img: PIL Image to process
-            outputs: Dictionary accumulating model inputs
-
-        Note:
-            - Preprocesses image and calculates spatial dimensions
-            - Adds image token IDs and type markers
-            - Generates appropriate position embeddings
-        """
-        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
-        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
-        grid_thw = ret["grid_thw"].tolist()
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-        outputs["num_input_image_tokens"] += int(num_tokens)
-
-        outputs["images"].append(ret["pixel_values"])
-        if not uuid:
-            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-        else:
-            outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(grid_thw)
-        outputs["image_type_ids"].append(0)
-
-        t, h, w = grid_thw
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
-
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["fps"].append(0)
-
-    def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
-        img, meta = img_cache
-        num_tokens = img.shape[0] // self.image_processor.merge_size**2
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
-        _, h, w = meta["thw"]
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["images"].append(img)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[1, h, w]]))
-        outputs["image_type_ids"].append(0)
-
-        outputs["fps"].append(0)
-
-    def _add_video(self, frames, meta: Dict, outputs: Dict, uuid: Optional[str]) -> None:
-        """
-        Add video data to model inputs dictionary.
-
-        Args:
-            frames: Video frames as numpy array
-            meta: Video metadata containing fps/duration
-            outputs: Dictionary accumulating model inputs
-
-        Note:
-            - Handles temporal dimension in position embeddings
-            - Uses video-specific token IDs and type markers
-        """
-        ret = self.image_processor.preprocess(images=frames)
-
-        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
-        grid_thw = ret["grid_thw"].tolist()
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        # Hack code. In order to adapt to the framework, only image_token can be passed
-        # The correct way should be to use [self.video_token_id] * num_tokens
-        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["num_input_video_tokens"] += int(num_tokens)
-
-        outputs["images"].append(ret["pixel_values"])
-        if not uuid:
-            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
-        else:
-            outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(grid_thw)
-        outputs["image_type_ids"].extend([1] * grid_thw[0])
-
-        fps = meta["fps"]
-        second_per_grid_t = self.temporal_conv_size / fps
-        t, h, w = grid_thw
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
-
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["fps"].append(fps)
-
-    def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
-        frames, meta = frames_cache
-        num_tokens = frames.shape[0] // self.image_processor.merge_size**2
-
-        t, h, w = meta["thw"]
-        outputs["images"].append(frames)
-        outputs["mm_hashes"].append(uuid)
-        outputs["grid_thw"].append(np.array([[t, h, w]]))
-
-        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
-        outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
-        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
-        outputs["image_type_ids"].extend([1] * t)
-
-        fps = meta["fps"]
-        second_per_grid_t = self.temporal_conv_size / fps
-        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
-        outputs["position_ids"].append(pos_ids)
-        outputs["cur_position"] = pos_ids.max() + 1
-
-        outputs["fps"].append(fps)
-
-    def _compute_vision_positions(
-        self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float
-    ) -> np.ndarray:
-        """
-        Generate 3D position IDs for visual inputs.
-
-        Args:
-            start_pos: Base position in sequence
-            t: Temporal patches (1 for images)
-            h: Height in patches
-            w: Width in patches
-            second_per_grid_t: Time per temporal patch
-
-        Returns:
-            np.ndarray: Position IDs for [t,h,w] dimensions
-        """
-        h //= self.spatial_conv_size
-        w //= self.spatial_conv_size
-
-        tn = np.arange(t).reshape(-1, 1)
-        tn = np.broadcast_to(tn, (t, h * w))
-        tn = tn * int(second_per_grid_t) * self.tokens_per_second
-        t_index = tn.flatten()
-
-        hn = np.arange(h).reshape(1, -1, 1)
-        h_index = np.broadcast_to(hn, (t, h, w)).flatten()
-
-        wn = np.arange(w).reshape(1, 1, -1)
-        w_index = np.broadcast_to(wn, (t, h, w)).flatten()
-
-        position = np.stack([t_index, h_index, w_index]) + start_pos
-        return position
-
-    def _load_and_process_video(self, url: str, item: Dict) -> Tuple[np.ndarray, Dict]:
-        """
-        Load and preprocess video into frames.
-
-        Args:
-            url: Video file path or bytes
-            item: Dictionary containing processing parameters
-
-        Returns:
-            tuple: (frames, metadata) where:
-                - frames: Processed video frames as numpy array
-                - metadata: Updated video metadata dictionary
-        """
-        reader, meta, _ = read_video_decord(url, save_to_disk=False)
-
-        # Apply frame sampling if fps or target_frames specified
-        fps = item.get("fps", self.fps)
-        num_frames = item.get("target_frames", self.target_frames)
-
-        frame_indices = list(range(meta["num_of_frame"]))
-        if fps > 0 or num_frames > 0:
-            # Get frame sampling constraints
-            min_frames = item.get("min_frames", self.min_frames)
-            max_frames = item.get("max_frames", self.max_frames)
-
-            # Sample frames according to specifications
-            frame_indices = sample_frames(
-                frame_factor=self.frame_factor,  # Ensure divisible by temporal patch size
-                min_frames=min_frames,
-                max_frames=max_frames,
-                metadata=meta,
-                fps=fps,
-                num_frames=num_frames,
-            )
-
-            # Update metadata with new frame count and fps
-            meta["num_of_frame"] = len(frame_indices)
-            if fps is not None:
-                meta["fps"] = fps  # Use specified fps
-                meta["duration"] = len(frame_indices) / fps
-            else:
-                meta["fps"] = len(frame_indices) / meta["duration"]  # Calculate fps from sampled frames
-
-        frames = []
-        for idx in frame_indices:
-            frame = reader[idx].asnumpy()
-            image = Image.fromarray(frame, "RGB")
-            frames.append(image)
-        frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-
-        return frames, meta
-
-    def get_processor_cache(self, socket, mm_hashes: list[str]) -> list:
-        """
-        get cache correspond to given hash values
-        """
-        req = pickle.dumps(mm_hashes)
-        socket.send_multipart([b"", req])
-        _, resp = socket.recv_multipart()
-        mm_items = pickle.loads(resp)
-        data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
-
-        return mm_items
-
-    def update_processor_cache(self, socket, mm_hashes: list[str], mm_items):
-        """
-        update cache data
-        """
-        req = pickle.dumps((mm_hashes, mm_items))
-        socket.send_multipart([b"", req])
-        data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
diff --git a/fastdeploy/input/v1/qwen_vl_processor/process_video.py b/fastdeploy/input/v1/qwen_vl_processor/process_video.py
deleted file mode 100644
index 891f272033b..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/process_video.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from typing import Optional, Union
-
-import numpy as np
-
-from fastdeploy.utils import data_processor_logger
-
-from .image_processor import ceil_by_factor, floor_by_factor
-
-
-def sample_frames(
-    frame_factor: int,
-    min_frames: int,
-    max_frames: int,
-    metadata: Optional[dict] = None,
-    fps: Optional[Union[int, float]] = -1,
-    num_frames: Optional[int] = -1,
-):
-    """
-    Sample frames from video according to specified criteria.
-
-    Args:
-        frame_factor: Ensure sampled frames are multiples of this factor
-        min_frames: Minimum number of frames to sample
-        max_frames: Maximum number of frames to sample
-        metadata: Video metadata containing fps information
-        fps: Target frames per second for sampling
-        num_frames: Exact number of frames to sample
-
-    Returns:
-        np.ndarray: Sampled video frames
-
-    Raises:
-        ValueError: If both fps and num_frames are specified,
-                   or if required metadata is missing,
-                   or if requested frames exceed available frames
-    """
-    if fps > 0 and num_frames > 0:
-        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
-
-    total_num_frames = metadata["num_of_frame"]
-
-    # If num_frames is not given but fps is, calculate num_frames from fps
-    if num_frames > 0:
-        num_frames = round(num_frames / frame_factor) * frame_factor
-    elif fps > 0:
-        if metadata is None:
-            raise ValueError(
-                "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
-                "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
-            )
-        # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
-        min_frames = ceil_by_factor(min_frames, frame_factor)
-        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
-
-        num_frames = total_num_frames / metadata["fps"] * fps
-
-        if num_frames > total_num_frames:
-            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
-
-        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
-        num_frames = floor_by_factor(num_frames, frame_factor)
-
-    if num_frames > total_num_frames:
-        raise ValueError(
-            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
-            "Decrease `num_frames` or `fps` for sampling."
-        )
-
-    # Hack code ensures that num_frames can always be divided by 4
-    # due to sched/resource_manager_v1.py 中 grid_thw.extend([[2, h, w]] * (t // 2))
-    if num_frames > 2 and num_frames % 4 != 0:
-        num_frames = (num_frames // 4) * 4  # 向下取整到 4 的倍数
-        total_num_frames = (total_num_frames // 4) * 4
-        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
-
-    # Calculate frame indices based on sampling strategy
-    if num_frames > 0:
-        # Evenly spaced sampling for target frame count
-        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
-    else:
-        # Keep all frames if no sampling requested
-        indices = np.arange(0, total_num_frames).astype(np.int32)
-
-    return indices
diff --git a/fastdeploy/input/v1/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/v1/qwen_vl_processor/qwen_vl_processor.py
deleted file mode 100644
index e0d846d53a2..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/qwen_vl_processor.py
+++ /dev/null
@@ -1,338 +0,0 @@
-"""
-# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import numpy as np
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.utils import process_stop_token_ids
-from fastdeploy.input.v1.text_processor import DataProcessor as TextProcessor
-from fastdeploy.utils import data_processor_logger
-
-from .process import DataProcessor
-
-
-class QwenVLProcessor(TextProcessor):
-    """
-    Qwen Vision-Language processor for handling multimodal inputs.
-
-    This processor extends TextProcessor to support:
-    - Image and video processing
-    - Multimodal feature extraction
-    - Tokenization and position encoding
-    - Request processing and model input generation
-
-    Attributes:
-        processor (DataProcessor): Underlying data processor instance
-        tokenizer: Text tokenizer instance
-        limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
-    """
-
-    def __init__(
-        self,
-        config,
-        model_name_or_path,
-        limit_mm_per_prompt=None,
-        mm_processor_kwargs=None,
-        reasoning_parser_obj=None,
-        tool_parser_obj=None,
-        enable_processor_cache=False,
-    ):
-        """
-        Initialize QwenVLProcessor instance.
-
-        Args:
-            config: Model configuration object
-            model_name_or_path (str): Pretrained model name or path
-            limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
-            mm_processor_kwargs (dict, optional): Multimodal processor arguments
-            reasoning_parser_obj: Reasoning parser instance
-            tool_parser_obj: Tool parser instance
-        """
-        super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)
-
-        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
-        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
-        self.processor = DataProcessor(
-            model_path=model_name_or_path,
-            enable_processor_cache=enable_processor_cache,
-            tokens_per_second=config.vision_config.tokens_per_second,
-            tokenizer=self.tokenizer,
-            **processor_kwargs,
-        )
-        self.image_patch_id = self.processor.image_token_id
-        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
-
-    def _parse_processor_kwargs(self, kwargs):
-        """
-        Parse and validate multimodal processor arguments.
-
-        Args:
-            kwargs (dict): Processor configuration arguments
-
-        Returns:
-            dict: Validated processor arguments
-
-        Raises:
-            ValueError: If arguments format is invalid
-        """
-        if not kwargs:
-            return {}
-
-        try:
-            if not isinstance(kwargs, dict):
-                raise ValueError("mm-processor-kwargs must be a dictionary")
-
-            # Validate kwargs types against expected schema
-            data_processor_logger.info(f"Processing kwargs: {kwargs}")
-            expected_types = {
-                "video_max_frames": int,  # Maximum video frames parameter
-                "video_min_frames": int,  # Minimum video frames parameter
-            }
-
-            for key, value in kwargs.items():
-                if key in expected_types and not isinstance(value, expected_types[key]):
-                    raise ValueError(
-                        f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
-                    )
-
-            return kwargs
-
-        except Exception as e:
-            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
-            return {}
-
-    def _parse_limits(self, limits):
-        """
-        Parse and validate multimodal input limits.
-
-        Args:
-            limits (dict): Input limits configuration
-
-        Returns:
-            dict: Validated limits with defaults
-
-        Raises:
-            ValueError: If limits format is invalid
-        """
-        DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}
-
-        if not limits:
-            return DEFAULT_LIMITS
-
-        try:
-            if not isinstance(limits, dict):
-                raise ValueError("limit-mm-per-prompt must be a dictionary")
-            data_processor_logger.info(f"_parse_limits:{limits}")
-            return {**DEFAULT_LIMITS, **limits}
-        except Exception as e:
-            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
-            return DEFAULT_LIMITS
-
-    def _check_mm_limits(self, item):
-        """
-        Validate multimodal inputs against configured limits.
-
-        Args:
-            item: Input request item to validate
-
-        Raises:
-            ValueError: If input exceeds configured limits
-        """
-        if isinstance(item, dict):
-            # 请求包含prompt和multi_modal_data
-            mm_data = item
-        else:
-            # 请求包含messages
-            mm_data = {"image": [], "video": []}
-
-            for message in item:
-                if isinstance(message.get("content"), list):
-                    for part in message["content"]:
-                        if part.get("type") in ["image_url", "image"]:
-                            mm_data["image"].append(part)
-                        elif part.get("type") in ["video_url", "video"]:
-                            mm_data["video"].append(part)
-
-        for modality, data in mm_data.items():
-            if modality in self.limit_mm_per_prompt:
-                limit = self.limit_mm_per_prompt[modality]
-                if len(data) > limit:
-                    raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
-
-    def process_request(self, request, max_model_len=None, **kwargs):
-        """
-        Process incoming request and generate model inputs.
-
-        Args:
-            request: Input request object
-            max_model_len (int, optional): Maximum context length
-            **kwargs: Additional processing parameters
-
-        Returns:
-            Request: Processed request with model inputs
-        """
-        task = request.to_dict()
-        task["enable_thinking"] = kwargs.get("enable_thinking", False)
-        self.process_request_dict(task, max_model_len)
-        request = Request.from_dict(task)
-        request = self._apply_default_parameters(request)
-        return request
-
-    def process_request_dict(self, request, max_model_len=None, **kwargs):
-        """
-        Process request dictionary into model inputs.
-
-        Args:
-            request (dict): Input request dictionary
-            max_model_len (int, optional): Maximum context length
-
-        Returns:
-            dict: Processed request with model inputs
-
-        Raises:
-            ValueError: If request format is invalid
-        """
-
-        request = self._apply_default_parameters(request)
-        if not request.eos_token_ids:
-            request.eos_token_ids = self.eos_token_ids
-
-        # processing stop_sequences and stop_token_ids
-        process_stop_token_ids(request, self.update_stop_seq)
-
-        bad_words = request.sampling_params.bad_words
-        bad_words_token_ids = request.sampling_params.bad_words_token_ids
-        if bad_words:
-            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
-            request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
-        if request.prompt:
-            multimodal_data = request.multimodal_data
-            if multimodal_data is None:
-                multimodal_data = {}
-            self._check_mm_limits(multimodal_data)
-            images = multimodal_data.get("image", None)
-            videos = multimodal_data.get("video", None)
-            outputs = self.processor.text2ids(request.prompt, images, videos)
-
-        elif request.messages:
-            messages = request.messages
-            self._check_mm_limits(messages)
-            chat_template_kwargs = request.chat_template_kwargs
-            if chat_template_kwargs:
-                if isinstance(chat_template_kwargs, dict):
-                    for k, v in chat_template_kwargs.items():
-                        if getattr(request, k, v):
-                            setattr(request, k, v)
-                else:
-                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-            if getattr(request, "enable_thinking") is None:
-                setattr(request, "enable_thinking", True)
-            outputs = self.processor.request2ids(request)
-            delattr(request, "chat_template_kwargs")
-        else:
-            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
-        # Handle continuation of previous generation by appending existing tokens
-        if request.completion_token_ids:
-            self.append_completion_tokens(outputs, request.completion_token_ids)
-
-        # qwen25_vl not support thinking
-        request.enable_thinking = False
-
-        outputs = self.pack_outputs(outputs)
-
-        request.prompt_token_ids = outputs["input_ids"].tolist()
-        request.prompt_token_ids_len = len(request.prompt_token_ids)
-        request.multimodal_inputs = outputs
-
-        # Handle prompt truncation if exceeds model context length
-        if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
-            request.prompt_token_ids = request.prompt_token_ids[
-                : max_model_len - 1
-            ]  # Leave space for at least 1 new token
-
-        # Set default max_tokens if not specified
-        max_tokens = max_model_len - len(request.prompt_token_ids)
-        if getattr(request.sampling_params, "max_tokens", None) is None:
-            request.sampling_params.max_tokens = max(1, max_tokens)
-        else:
-            request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
-            parts = request.request_id.split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.sampling_params.n or 1
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request.request_id] = model_status
-            request.enable_thinking = model_status == "think_start"
-        data_processor_logger.info(f"Processed request {request}")
-
-        return request
-
-    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
-        """
-        Append completion tokens to existing outputs.
-
-        Args:
-            outputs: Current model outputs
-            completion_token_ids: completion tokens to append
-        """
-
-        num_tokens = len(completion_token_ids)
-        multimodal_inputs["input_ids"].extend(completion_token_ids)
-        multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
-
-        pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
-        multimodal_inputs["position_ids"].append(pos_ids)
-        multimodal_inputs["cur_position"] += num_tokens
-
-    def pack_outputs(self, outputs):
-        """
-        Prepare final output dictionary for model.
-
-        Args:
-            outputs: Intermediate processing outputs
-
-        Returns:
-            dict: Packed output dictionary with all required fields
-        """
-        if not outputs["images"]:
-            outputs["images"] = None  # No images case
-            outputs["grid_thw"] = None  # No spatial dimensions
-            outputs["image_type_ids"] = None  # No type IDs
-        else:
-            outputs["images"] = np.vstack(outputs["images"])  # Stack image features vertically
-            outputs["grid_thw"] = np.vstack(outputs["grid_thw"])  # Stack spatial dimensions
-            outputs["image_type_ids"] = np.array(outputs["image_type_ids"])  # Convert to numpy array
-
-        # Convert all outputs to numpy arrays with appropriate types
-        outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)  # Token IDs as int64
-        outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)  # Type IDs as int64
-        outputs["position_ids"] = np.concatenate(
-            outputs["position_ids"], axis=1, dtype=np.int64
-        )  # Concatenate position ID
-
-        outputs["image_patch_id"] = self.processor.image_token_id
-        outputs["video_patch_id"] = self.processor.video_token_id
-        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
-
-        outputs["mm_num_token_func"] = self.processor.mm_num_tokens
-        return outputs
diff --git a/fastdeploy/input/v1/text_processor.py b/fastdeploy/input/v1/text_processor.py
deleted file mode 100644
index f83a0e0f12e..00000000000
--- a/fastdeploy/input/v1/text_processor.py
+++ /dev/null
@@ -1,925 +0,0 @@
-"""
-# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from abc import ABC, abstractmethod
-from collections import OrderedDict
-
-import numpy as np
-from paddleformers.generation import GenerationConfig
-from paddleformers.transformers import Llama3Tokenizer, LlamaTokenizer
-
-from fastdeploy import envs
-from fastdeploy.input.utils import process_stop_token_ids
-from fastdeploy.utils import data_processor_logger
-
-_SAMPLING_EPS = 1e-5
-
-
-class BaseDataProcessor(ABC):
-    """base class for data processor"""
-
-    def __init__(self):
-        """
-        Returns:
-            None
-        """
-        self.tokenizer = self._load_tokenizer()
-        self.tokenizer.bos_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.bos_token)
-        self.tokenizer.cls_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.cls_token)
-        self.tokenizer.sep_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.sep_token)
-        self.tokenizer.eos_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.eos_token)
-        self.tokenizer.mask_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.mask_token)
-        data_processor_logger.info(
-            (
-                f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, ",
-                f"cls_token is {self.tokenizer.cls_token}, {self.tokenizer.cls_token_id}, "
-                f"sep_token is {self.tokenizer.sep_token}, {self.tokenizer.sep_token_id}, "
-                f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}, "
-                f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}",
-            )
-        )
-        self._tokenize_cache = OrderedDict()
-        self._tokenize_cache_capacity = 128
-
-    def _apply_default_parameters(self, request):
-        """
-        Apply default value for parameters in request
-        """
-
-        def set_value(req, key, value):
-            value = getattr(self.generation_config, key, value)
-            if getattr(req.sampling_params, key) is None:
-                setattr(req.sampling_params, key, value)
-
-        set_value(request, "top_p", 0.7)
-        set_value(request, "temperature", 1.0)
-        set_value(request, "repetition_penalty", 1.0)
-        set_value(request, "frequency_penalty", 0.0)
-        set_value(request, "presence_penalty", 0.0)
-        return request
-
-    @abstractmethod
-    def process_request_dict(self, request, **kwargs):
-        """
-        Preprocess the request
-
-        Args:
-            request Request: may contain text and messages fields
-            **kwargs: others
-
-        Returns:
-            bool: Whether preprocessing is successful
-            str: error message
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def process_response_dict(self, response_obj):
-        """
-        Preprocess the response
-
-        Args:
-            response_obj RequestOutput: response for engine, contain ids fields
-
-        Returns:
-            RequestOutput: response contain text fields
-        """
-        raise NotImplementedError
-
-    def text2ids(self, text, max_model_len=None):
-        """
-        text to token ids
-
-        Args:
-            text (str): text
-
-        Returns:
-            List[int]: token ids list
-        """
-        raise NotImplementedError
-
-    def encode_with_cache(self, text, max_model_len=None, add_special_tokens=False):
-        """
-        Encode text into token ids with a small LRU cache.
-        """
-        if not hasattr(self, "_tokenize_cache"):
-            self._tokenize_cache = OrderedDict()
-            self._tokenize_cache_capacity = getattr(self, "_tokenize_cache_capacity", 128)
-        key = (text, bool(add_special_tokens))
-        cached = self._tokenize_cache.get(key)
-        if cached is not None:
-            self._tokenize_cache.move_to_end(key)
-            return cached
-        token_ids = self.text2ids(text, max_model_len, add_special_tokens=add_special_tokens)
-        if hasattr(token_ids, "tolist"):
-            token_ids = token_ids.tolist()
-        elif not isinstance(token_ids, list):
-            token_ids = list(token_ids)
-        self._tokenize_cache[key] = token_ids
-        if len(self._tokenize_cache) > self._tokenize_cache_capacity:
-            self._tokenize_cache.popitem(last=False)
-        return token_ids
-
-    def _encode_literal_text_with_cache(self, text):
-        if not hasattr(self, "_tokenize_cache"):
-            self._tokenize_cache = OrderedDict()
-            self._tokenize_cache_capacity = getattr(self, "_tokenize_cache_capacity", 128)
-        key = ("literal_text", text)
-        cached = self._tokenize_cache.get(key)
-        if cached is not None:
-            self._tokenize_cache.move_to_end(key)
-            return cached
-        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
-        if hasattr(token_ids, "tolist"):
-            token_ids = token_ids.tolist()
-        elif not isinstance(token_ids, list):
-            token_ids = list(token_ids)
-        self._tokenize_cache[key] = token_ids
-        if len(self._tokenize_cache) > self._tokenize_cache_capacity:
-            self._tokenize_cache.popitem(last=False)
-        return token_ids
-
-    def messages2ids(self, messages):
-        """
-        Convert multi-turn messages into ID sequences.
-
-        Args:
-            messages (List[List[Dict[str, Any]]]): multi-turn messages.
-
-        Returns:
-            List[int]: ID sequences
-        """
-        raise NotImplementedError
-
-    def _get_think_token_ids(self):
-        think_token_ids = getattr(self, "_think_token_ids", None)
-        if think_token_ids is not None:
-            return think_token_ids
-        tokenizer = getattr(self, "tokenizer", None)
-        vocab = tokenizer.get_vocab() if tokenizer is not None else {}
-        think_start_id = vocab.get("<think>", -1)
-        think_end_id = vocab.get("</think>", -1)
-        self._think_token_ids = (think_start_id, think_end_id)
-        return self._think_token_ids
-
-    def _prepare_think_stop_sentence(self, logits_processors_args, max_model_len=None):
-        if not isinstance(logits_processors_args, dict):
-            return logits_processors_args
-        think_stop_sentence = logits_processors_args.get("think_stop_sentence")
-        if isinstance(think_stop_sentence, str) and think_stop_sentence:
-            sentence_token_ids = self._encode_literal_text_with_cache(think_stop_sentence)
-            logits_processors_args["think_stop_sentence_token_ids"] = sentence_token_ids
-            logits_processors_args.pop("think_stop_sentence", None)
-        return logits_processors_args
-
-    def _update_thinking_prompt_state(self, prompt_token_ids, logits_processors_args):
-        if not isinstance(logits_processors_args, dict):
-            return logits_processors_args
-        thinking_budget = logits_processors_args.get("thinking_budget")
-        if thinking_budget is None or not isinstance(thinking_budget, int) or thinking_budget < 0:
-            return logits_processors_args
-        if logits_processors_args.get("think_prompt_checked"):
-            return logits_processors_args
-        if prompt_token_ids is None:
-            return logits_processors_args
-        token_len = getattr(prompt_token_ids, "size", None) or len(prompt_token_ids)
-        if token_len == 0:
-            return logits_processors_args
-        think_start_id, think_end_id = self._get_think_token_ids()
-        if think_start_id < 0 or think_end_id < 0:
-            return logits_processors_args
-
-        if hasattr(prompt_token_ids, "tolist"):
-            token_list = prompt_token_ids.tolist()
-        else:
-            token_list = list(prompt_token_ids)
-
-        started = False
-        ended = False
-        tokens_after_start = 0
-        last_token_id = None
-        in_thinking = False
-        for token_id in token_list:
-            if token_id == think_start_id:
-                started = True
-                ended = False
-                in_thinking = True
-            elif token_id == think_end_id and in_thinking:
-                ended = True
-                in_thinking = False
-        if started and token_list:
-            # Align with operator-level reasoning_max_tokens: prompt-side tokens
-            # inside <think> do not consume thinking budget.
-            last_token_id = int(token_list[-1])
-
-        logits_processors_args["think_prompt_checked"] = True
-        logits_processors_args["think_prompt_started"] = started
-        logits_processors_args["think_prompt_ended"] = ended
-        logits_processors_args["think_prompt_tokens_after_start"] = tokens_after_start
-        if last_token_id is not None:
-            logits_processors_args["think_prompt_last_token_id"] = last_token_id
-        else:
-            logits_processors_args.pop("think_prompt_last_token_id", None)
-        return logits_processors_args
-
-    def ids2tokens(self, token_id, task_id=None):
-        """
-        token ids to strings
-
-        Args:
-            token_id (List[int]): token id
-                        task_id (str): task id
-
-        Returns:
-            List[str]: strings
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def _load_tokenizer(self):
-        """
-        load tokenizer
-
-        Returns:
-            tokenizer (AutoTokenizer)
-        """
-        raise NotImplementedError
-
-
-class DataProcessor(BaseDataProcessor):
-    def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None):
-        """
-            Initializes the DecodeStatus object.
-
-        Args:
-            model_name_or_path (str): The name or path of the pre-trained model to be loaded.
-                Can also be a path to a directory containing the pre-trained model file.
-
-        Returns:
-            None.
-
-        Raises:
-            None.
-        """
-
-        self.model_name_or_path = model_name_or_path
-
-        # Generation config
-        try:
-            self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path)
-        except Exception as e:
-            data_processor_logger.warning(
-                f"Can't find generation config: {e}, so it will not use generation_config field in the model config"
-            )
-            self.generation_config = None
-
-        self.decode_status = dict()
-        self.model_status_dict = dict()
-        self.tool_parser_dict = dict()
-        self.tokenizer = self._load_tokenizer()
-        self._tokenize_cache = OrderedDict()
-        self._tokenize_cache_capacity = 128
-        data_processor_logger.info(
-            f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, \
-                                eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
-        )
-
-        try:
-            from paddleformers.trl.llm_utils import get_eos_token_id
-        except Exception:
-            from paddleformers.cli.utils.llm_utils import get_eos_token_id
-
-        self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config)
-        data_processor_logger.info(
-            f"The eos_token_ids obtained by merging tokenizer and generation_config is {self.eos_token_ids}"
-        )
-        self.eos_token_id_len = len(self.eos_token_ids)
-        self.pad_token_id = self.get_pad_id()
-        self.reasoning_parser = None
-        self.tool_parser_obj = tool_parser_obj
-        if reasoning_parser_obj:
-            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
-        self.tokenizer.pad_token_id = self.pad_token_id
-
-    def process_request(self, request, max_model_len=None, **kwargs):
-        """
-        Preprocess the request
-
-        Args:
-            request (Dict): may contain text and messages fields
-
-        Returns:
-            bool: Whether preprocessing is successful
-            str: error message
-        """
-        data_processor_logger.info(f"Start processing request: {request}")
-        request = self._apply_default_parameters(request)
-        if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
-            request.eos_token_ids = self.eos_token_ids
-
-        # processing stop_sequences and stop_token_ids
-        process_stop_token_ids(request, self.update_stop_seq)
-
-        # processing bad_words
-        bad_words = request.get("bad_words")
-        bad_words_token_ids = request.get("bad_words_token_ids")
-        if bad_words:
-            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
-            request["bad_words_token_ids"] = bad_words_token_ids
-
-        logits_processors_args = self._prepare_think_stop_sentence(
-            request.get("logits_processors_args") or {}, max_model_len
-        )
-        request["logits_processors_args"] = logits_processors_args
-
-        # processing prompt_token_ids
-        if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
-            if request.prompt is not None:
-                prompt = request.prompt
-                add_special_tokens = request.get("add_special_tokens", False)
-                assert isinstance(prompt, str) or (
-                    isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
-                ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
-                if isinstance(prompt, list):  # if prompt is a token id list
-                    request.prompt_token_ids = prompt
-                else:
-                    request.prompt_token_ids = self.text2ids(
-                        request.prompt, max_model_len, add_special_tokens=add_special_tokens
-                    )
-            elif request.messages is not None:
-                if self.tokenizer.chat_template is None:
-                    raise ValueError("This model does not support chat_template.")
-                task = request.to_dict()
-                chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
-                if chat_template_kwargs:
-                    if isinstance(chat_template_kwargs, dict):
-                        for k, v in chat_template_kwargs.items():
-                            if k not in task or task[k] is None:
-                                task[k] = v
-                    else:
-                        raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-                task.setdefault("enable_thinking", True)
-                request.prompt_token_ids = self.messages2ids(task, **chat_template_kwargs)
-            else:
-                raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")
-
-        if len(request.prompt_token_ids) == 0:
-            raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
-
-        # truncate prompts that exceed the length limit
-        if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
-            request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
-
-        logits_processors_args = request.get("logits_processors_args") or {}
-        logits_processors_args = self._update_thinking_prompt_state(request.prompt_token_ids, logits_processors_args)
-        request["logits_processors_args"] = logits_processors_args
-
-        max_tokens = max_model_len - len(request.prompt_token_ids)
-        if request.get("max_tokens") is None:
-            request.set("max_tokens", max(1, max_tokens))
-        else:
-            request.set("max_tokens", min(max_tokens, request.get("max_tokens")))
-        if request.get("temperature") < _SAMPLING_EPS:
-            # zero temperature means greedy decoding: set top_k=1 to force argmax
-            request.set("temperature", 1)
-            request.set("top_k", 1)
-        if request.get("top_p") < _SAMPLING_EPS:
-            request.set("top_p", _SAMPLING_EPS)
-            request.set("top_k", 1)
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
-            parts = request.request_id.split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.get("n", 1)
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request.request_id] = model_status
-            request.enable_thinking = model_status == "think_start"
-
-        data_processor_logger.info(f"Processed request: {request}")
-        return request
-
-    def process_request_dict(self, request, max_model_len=None, **kwargs):
-        """
-        Preprocess the request
-
-        Args:
-            request Request: may contain text and messages fields
-
-        Returns:
-            bool: Whether preprocessing is successful
-            str: error message
-        """
-        data_processor_logger.info(f"Start processing request: {request}")
-        request = self._apply_default_parameters(request)
-        if not request.eos_token_ids:
-            request.eos_token_ids = self.eos_token_ids
-
-        # processing stop_sequences and stop_token_ids
-        process_stop_token_ids(request, self.update_stop_seq)
-
-        # processing bad_words
-        bad_words = request.sampling_params.bad_words
-        bad_words_token_ids = request.sampling_params.bad_words_token_ids
-        if bad_words:
-            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
-            request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
-        logits_processors_args = self._prepare_think_stop_sentence(
-            getattr(request.sampling_params, "logits_processors_args", None) or {}, max_model_len
-        )
-        request.sampling_params.logits_processors_args = logits_processors_args
-
-        # processing prompt_token_ids
-        if not request.prompt_token_ids:
-            if request.prompt:
-                prompt = request.prompt
-                add_special_tokens = getattr(request, "add_special_tokens", None) or False
-                assert isinstance(prompt, str) or (
-                    isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
-                ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
-                if isinstance(prompt, list):  # if prompt is a token id list
-                    request.prompt_token_ids = prompt
-                else:
-                    request.prompt_token_ids = self.text2ids(
-                        request.prompt, max_model_len, add_special_tokens=add_special_tokens
-                    ).tolist()
-            elif request.messages:
-                if self.tokenizer.chat_template is None:
-                    raise ValueError("This model does not support chat_template.")
-                chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
-                if not chat_template_kwargs:
-                    chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
-                if chat_template_kwargs:
-                    if isinstance(chat_template_kwargs, dict):
-                        for k, v in chat_template_kwargs.items():
-                            if not getattr(request, k, None):
-                                setattr(request, k, v)
-                    else:
-                        raise ValueError("Invalid input: chat_template_kwargs must be a dict")
-                if getattr(request, "enable_thinking") is None:
-                    setattr(request, "enable_thinking", True)
-                request.prompt_token_ids = self.messages2ids(request, **chat_template_kwargs)
-                delattr(request, "chat_template_kwargs")
-            else:
-                raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
-
-        if len(request.prompt_token_ids) == 0:
-            raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
-
-        # truncate prompts that exceed the length limit
-        if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
-            request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
-        logits_processors_args = getattr(request.sampling_params, "logits_processors_args", None) or {}
-        logits_processors_args = self._update_thinking_prompt_state(request.prompt_token_ids, logits_processors_args)
-        request.sampling_params.logits_processors_args = logits_processors_args
-
-        max_tokens = max_model_len - len(request.prompt_token_ids)
-        if getattr(request.sampling_params, "max_tokens", None) is None:
-            request.sampling_params.max_tokens = max(1, max_tokens)
-        else:
-            request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
-
-        if request.sampling_params.temperature < _SAMPLING_EPS:
-            # zero temperature means greedy decoding: set top_k=1 to force argmax
-            request.sampling_params.temperature = 1
-            request.sampling_params.top_k = 1
-        if request.sampling_params.top_p < _SAMPLING_EPS:
-            request.sampling_params.top_p = _SAMPLING_EPS
-            request.sampling_params.top_k = 1
-        if self.reasoning_parser:
-            model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
-            parts = request.request_id.split("_")
-            if len(parts) > 1:
-                real_req_id = parts[0]
-                index = int(parts[1])
-                n = request.sampling_params.n or 1
-                for idx in range(index * n, (index + 1) * n):
-                    self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
-            else:
-                self.model_status_dict[request.request_id] = model_status
-            request.enable_thinking = model_status == "think_start"
-
-        data_processor_logger.info(f"Processed request: {request}")
-        return request
-
-    def process_logprob_response(self, token_ids, **kwargs):
-        full_text = self.tokenizer.decode(token_ids, **kwargs)
-        return full_text
-
-    def process_response(self, response_dict, **kwargs):
-        """
-        Preprocess the response
-
-        Args:
-            response_dict (Dict): response for engine, contain ids fields
-
-        Returns:
-            Dict: response contain text fields
-        """
-        req_id = response_dict.request_id
-        token_ids = response_dict.outputs.token_ids
-        if token_ids[-1] == self.tokenizer.eos_token_id:
-            token_ids = token_ids[:-1]
-        full_text = self.tokenizer.decode(token_ids)
-        response_dict.outputs.text = full_text
-        if self.reasoning_parser:
-            reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                full_text, response_dict, self.model_status_dict[req_id]
-            )
-            response_dict.outputs.text = text
-            response_dict.outputs.reasoning_content = reasoning_content
-        if self.tool_parser_obj:
-            tool_parser = self.tool_parser_obj(self.tokenizer)
-            tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
-            if tool_call_info.tools_called:
-                response_dict.outputs.tool_calls = tool_call_info.tool_calls
-                response_dict.outputs.text = tool_call_info.content
-        if req_id in self.model_status_dict:
-            del self.model_status_dict[req_id]
-        data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
-
-        return response_dict
-
-    def process_response_obj_normal(self, response_obj, **kwargs):
-        """
-        Preprocess the response
-
-        Args:
-            response_obj :response for engine, contain ids fields
-
-        Returns:
-            RequestOutput: response contain text fields
-        """
-        output = response_obj.outputs
-        token_ids = output.token_ids
-        is_end = response_obj.finished
-        req_id = response_obj.request_id
-        request = kwargs.get("request", None)
-        if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"):
-            if token_ids[-1] in self.eos_token_ids:
-                token_ids = token_ids[:-1]
-        delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
-        if is_end:
-            full_text = previous_texts + delta_text
-            response_obj.outputs.completion_tokens = full_text
-            response_obj.outputs.text = full_text
-            if self.reasoning_parser:
-                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                    full_text,
-                    request,
-                    self.model_status_dict[req_id],
-                )
-                response_obj.outputs.text = text
-                response_obj.outputs.reasoning_content = reasoning_content
-                reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
-                response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
-            if self.tool_parser_obj:
-                tool_parser = self.tool_parser_obj(self.tokenizer)
-                tool_call_info = tool_parser.extract_tool_calls(full_text, request)
-                if tool_call_info.tools_called:
-                    response_obj.outputs.tool_calls = tool_call_info.tool_calls
-                    response_obj.outputs.text = tool_call_info.content
-            data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
-            del self.decode_status[req_id]
-            if req_id in self.model_status_dict:
-                del self.model_status_dict[req_id]
-        return response_obj
-
-    def process_response_obj_streaming(self, response_obj, **kwargs):
-        """
-        Preprocess the response
-
-        Args:
-            response_obj : response for engine, contain ids fields
-
-        Returns:
-            RequestOutput: response contain text fields
-        """
-        output = response_obj.outputs
-        token_ids = output.token_ids
-        is_end = response_obj.finished
-        req_id = response_obj.request_id
-        request = kwargs.get("request", None)
-
-        if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"):
-            if token_ids[-1] in self.eos_token_ids:
-                token_ids = token_ids[:-1]
-        delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
-        response_obj.outputs.completion_tokens = delta_text
-        if self.reasoning_parser:
-            reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
-                previous_texts,
-                previous_texts + delta_text,
-                delta_text,
-                previous_token_ids,
-                previous_token_ids + token_ids,
-                token_ids,
-                self.model_status_dict[req_id],
-            )
-            response_obj.outputs.delta_message = reasoning_delta_message
-            reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
-            reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
-            response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
-        if self.tool_parser_obj:
-            if req_id not in self.tool_parser_dict:
-                self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer)
-            tool_parser = self.tool_parser_dict[req_id]
-            tool_call = tool_parser.extract_tool_calls_streaming(
-                previous_texts,
-                previous_texts + delta_text,
-                delta_text,
-                previous_token_ids,
-                previous_token_ids + token_ids,
-                token_ids,
-                request,
-            )
-            if tool_call is None or tool_call.tool_calls:
-                response_obj.outputs.delta_message = tool_call
-        response_obj.outputs.text = delta_text
-        if is_end:
-            data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
-            del self.decode_status[req_id]
-            if req_id in self.tool_parser_dict:
-                del self.tool_parser_dict[req_id]
-            if req_id in self.model_status_dict:
-                del self.model_status_dict[req_id]
-        return response_obj
-
-    def process_response_dict(self, response_dict, **kwargs):
-        """
-        Preprocess the response
-
-        Args:
-            response_obj: response for engine, contain ids fields
-
-        Returns:
-            Dict: response contain text fields
-        """
-        stream = kwargs.get("stream", True)
-        if stream:
-            return self.process_response_obj_streaming(response_dict, **kwargs)
-        else:
-            return self.process_response_obj_normal(
-                response_dict,
-                **kwargs,
-            )
-
-    def text2ids(self, text, max_model_len, **kwargs):
-        """
-        text to token ids
-
-        Args:
-            text (str): text
-
-        Returns:
-            List[int]: token ids list
-        """
-
-        add_special_tokens = kwargs.get("add_special_tokens")
-        if envs.FD_USE_HF_TOKENIZER:
-            tokens = self.tokenizer(
-                text,
-                return_tensors="np",
-                padding=True,
-                truncation=True,
-            )
-        else:
-            text = [text] if isinstance(text, str) else text
-
-            tokens = self.tokenizer(
-                text,
-                return_tensors="np",
-                padding=True,
-                truncation=True,
-                max_length=max_model_len,
-                add_special_tokens=add_special_tokens,
-            )
-
-        return tokens["input_ids"][0]
-
-    def messages2ids(self, request, **kwargs):
-        """
-        Convert multi-turn messages into ID sequences.
-
-        Args:
-            messages (List[List[Dict[str, Any]]]): multi-turn messages.
-
-        Returns:
-            List[int]: ID sequences
-        """
-        message_dict = {
-            key: getattr(request, key, None)
-            for key in ["messages", "tools", "documents", "enable_thinking", "system"]
-            if getattr(request, key, None) is not None
-        }
-        if "add_generation_prompt" not in kwargs:
-            kwargs["add_generation_prompt"] = (
-                request.add_generation_prompt if request.add_generation_prompt is not None else True
-            )
-        spliced_message = self.tokenizer.apply_chat_template(
-            message_dict,
-            tokenize=False,
-            split_special_tokens=False,
-            add_special_tokens=False,
-            **kwargs,
-        )
-        request.prompt_tokens = spliced_message
-        tokens = self.tokenizer.tokenize(spliced_message)
-        req_id = getattr(request, "request_id", None)
-        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-        data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
-        return token_ids
-
-    def ids2tokens(self, token_id, task_id):
-        """
-        token ids to strings
-
-        Args:
-            token_ids (List[int]): token ids
-                        task_id (str): task id
-
-        Returns:
-            List[str]: strings
-        """
-        if envs.FD_USE_HF_TOKENIZER:
-            if task_id not in self.decode_status:
-                # history token ids & history token strings & befer decode str
-                self.decode_status[task_id] = [[], [], ""]
-
-            previous_token_ids = self.decode_status[task_id][0]
-            decode_str = self.tokenizer.batch_decode(
-                [previous_token_ids + token_id],
-                skip_special_tokens=True,
-                clean_up_tokenization_spaces=False,
-            )
-            if isinstance(decode_str, list) and len(decode_str):
-                new_str = decode_str[0].replace(self.decode_status[task_id][2], "", 1)
-                self.decode_status[task_id][1].append(new_str)
-                self.decode_status[task_id][2] = decode_str[0]
-            else:
-                new_str = ""
-            self.decode_status[task_id][0] += token_id
-            return new_str
-        else:
-            if task_id not in self.decode_status:
-                # prefix offset & read offset & history token ids & history token strings
-                self.decode_status[task_id] = [0, 0, [], ""]
-
-            prefix_offset = self.decode_status[task_id][0]
-            read_offset = self.decode_status[task_id][1]
-            previous_token_ids = self.decode_status[task_id][2]
-            previous_texts = self.decode_status[task_id][3]
-            decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
-                previous_token_ids + token_id, prefix_offset, read_offset
-            )
-            self.decode_status[task_id][0] = prefix_offset
-            self.decode_status[task_id][1] = read_offset
-            self.decode_status[task_id][2] += token_id
-            self.decode_status[task_id][3] += decode_str
-
-            return decode_str, previous_token_ids, previous_texts
-
-    def _load_tokenizer(self):
-        """
-        load tokenizer
-
-        Returns:
-            tokenizer (AutoTokenizer)
-        """
-        if envs.FD_USE_HF_TOKENIZER:
-            from transformers import AutoTokenizer
-
-            return AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=False)
-        else:
-            from paddleformers.transformers import AutoTokenizer
-
-            return AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
-
-    def clear_request_status(self, task_id):
-        """
-        clear request status
-
-        Args:
-            task_id (str): task id
-
-        Returns:
-            results_all (str): all token strings
-        """
-        results_all = ""
-        if task_id in self.decode_status:
-            if envs.FD_USE_HF_TOKENIZER:
-                results_all = self.decode_status[task_id][2]
-            else:
-                results_all = "".join(self.decode_status[task_id][3])
-            del self.decode_status[task_id]
-        return results_all
-
-    def get_pad_id(self):
-        """
-        get pad_token_id, if not pad_token_id, use eos_token
-
-        Returns:
-            int: pad_token_id
-        """
-        if isinstance(self.tokenizer, (LlamaTokenizer, Llama3Tokenizer)) and not self.tokenizer.pad_token_id:
-            return self.tokenizer.eos_token
-        return self.tokenizer.pad_token_id
-
-    def pad_batch_data(
-        self,
-        insts,
-        pad_id=0,
-        return_seq_len=False,
-        return_array=True,
-        pad_style="right",
-    ):
-        """Pad the instances to the max sequence length in batch."""
-        if len(insts) == 0:
-            padded_insts = np.array([[]], dtype=np.int64) if return_array else [[]]
-            if return_seq_len:
-                seq_len = np.array([], dtype=np.int64) if return_array else []
-                return padded_insts, seq_len
-            return padded_insts
-
-        max_len = max(map(len, insts))
-        if pad_style == "left":
-            padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts]
-        else:
-            padded_insts = [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts]
-        if return_array:
-            padded_insts = np.array(padded_insts, dtype=np.int64).reshape([-1, max_len])
-
-        if return_seq_len:
-            seq_len = [len(inst) for inst in insts]
-            if return_array:
-                seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1)
-            return padded_insts, seq_len
-        return padded_insts
-
-    def update_stop_seq(self, stop_sequences):
-        """
-        Update stop sequences from request.
-        """
-        stop_seqs = []
-        for seq in stop_sequences:
-            if seq != self.tokenizer.eos_token_id:
-                stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq)))
-        stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False)
-        data_processor_logger.debug(f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
-        return stop_seqs, stop_seqs_len
-
-    def update_bad_words(self, bad_words, bad_words_token_ids):
-        """Support bad words"""
-
-        token_ids = bad_words_token_ids
-
-        if token_ids is None:
-            token_ids = []
-        for bad_word in bad_words:
-            # To prohibit words both at the beginning
-            # and in the middle of text
-            # (related to add_prefix_space tokenizer parameter)
-            for add_prefix_space in [False, True]:
-                prefix = " " if add_prefix_space else ""
-                prompt = prefix + bad_word.lstrip()
-                prompt_token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(prompt))
-
-                if len(prompt_token_ids) != 1:
-                    if not add_prefix_space:
-                        data_processor_logger.warning(
-                            f"Skip bad_words: <{prompt}>."
-                            f"Bad words should be a single token."
-                            f"Got tokens: {prompt_token_ids}."
-                        )
-                    continue
-
-                if prompt_token_ids[0] > self.tokenizer.vocab_size:
-                    if not add_prefix_space:
-                        data_processor_logger.warning(
-                            f"Skip bad_words: <{prompt}>."
-                            f"All token id values should be satisfying:"
-                            f" 0 <= token_id < {self.tokenizer.vocab_size}."
-                            f"Got token: {prompt_token_ids}."
-                        )
-                    continue
-
-                if prompt_token_ids not in token_ids:
-                    token_ids.extend(prompt_token_ids)
-        return token_ids
diff --git a/fastdeploy/inter_communicator/zmq_server.py b/fastdeploy/inter_communicator/zmq_server.py
index 7073edb48a5..c88fe96eb08 100644
--- a/fastdeploy/inter_communicator/zmq_server.py
+++ b/fastdeploy/inter_communicator/zmq_server.py
@@ -169,10 +169,7 @@ def pack_aggregated_data(self, data):
         if len(data) > 1:
             for response in data[1:]:
                 result.add(response)
-        if not envs.ENABLE_V1_DATA_PROCESSOR:
-            result = ForkingPickler.dumps([result.to_dict()])
-        else:
-            result = ForkingPickler.dumps([result])
+        result = ForkingPickler.dumps([result.to_dict()])
         return result
 
     def receive_json_once(self, block=False):
@@ -303,10 +300,7 @@ def _send_response_per_query(self, req_id, data):
                 if self.aggregate_send:
                     result = self.pack_aggregated_data(new_data)
                 else:
-                    if not envs.ENABLE_V1_DATA_PROCESSOR:
-                        result = ForkingPickler.dumps([response.to_dict() for response in new_data])
-                    else:
-                        result = ForkingPickler.dumps(new_data)
+                    result = ForkingPickler.dumps([response.to_dict() for response in new_data])
                 with self.response_token_lock:
 
                     _zmq_metrics_stats = ZMQMetricsStats()
@@ -349,13 +343,10 @@ def _send_batch_response(self, batch_data, worker_pid=None):
         metrics_address = self.address or self.worker_push_addresses.get(worker_pid, "unknown")
 
         try:
-            if not envs.ENABLE_V1_DATA_PROCESSOR:
-                result = msgpack.packb(
-                    [[output.to_dict() for output in outputs] for outputs in batch_data],
-                    default=_msgpack_default,
-                )
-            else:
-                result = ForkingPickler.dumps(batch_data)
+            result = msgpack.packb(
+                [[output.to_dict() for output in outputs] for outputs in batch_data],
+                default=_msgpack_default,
+            )
             result_len = len(result)
 
             # Only hold lock for the actual socket send
diff --git a/tests/engine/test_common_engine.py b/tests/engine/test_common_engine.py
index 5a6241c4433..69c6db2a753 100644
--- a/tests/engine/test_common_engine.py
+++ b/tests/engine/test_common_engine.py
@@ -1267,7 +1267,6 @@ def receive_json_once(self, block):
 
         with (
             patch("fastdeploy.engine.common_engine.main_process_metrics", DummyMetrics()),
-            patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_DATA_PROCESSOR", False),
             patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
             patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
         ):
@@ -3277,7 +3276,6 @@ def __init__(self):
         with (
             patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", True),
             patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
-            patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_DATA_PROCESSOR", False),
             patch("fastdeploy.engine.common_engine.main_process_metrics", DummyMetrics()),
             patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
         ):
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 8af369c396a..1b33405503f 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -20,7 +20,6 @@
 from types import SimpleNamespace
 from unittest.mock import AsyncMock, MagicMock, Mock, patch
 
-import numpy as np
 import paddle
 
 import fastdeploy.envs as envs
@@ -307,51 +306,29 @@ async def test_create_chat_completion_request_id_and_v1_stream(self):
         self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(
             side_effect=ParameterError("param", "bad")
         )
-        with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start") as mock_trace:
-                resp = await self.chat_completion_handler.create_chat_completion(
-                    ChatCompletionRequest(
-                        messages=[{"role": "user", "content": "Hello"}],
-                        request_id="abc",
-                        stream=False,
-                    )
+        with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start") as mock_trace:
+            resp = await self.chat_completion_handler.create_chat_completion(
+                ChatCompletionRequest(
+                    messages=[{"role": "user", "content": "Hello"}],
+                    request_id="abc",
+                    stream=False,
                 )
+            )
         self.assertEqual(resp.error.param, "param")
         self.assertIn("bad", resp.error.message)
         self.assertEqual(mock_trace.call_args.kwargs["rid"], "chatcmpl-abc")
 
         self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(side_effect=RuntimeError("boom"))
-        with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start"):
-                resp = await self.chat_completion_handler.create_chat_completion(
-                    ChatCompletionRequest(
-                        messages=[{"role": "user", "content": "Hello"}],
-                        request_id="err",
-                        stream=False,
-                    )
+        with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start"):
+            resp = await self.chat_completion_handler.create_chat_completion(
+                ChatCompletionRequest(
+                    messages=[{"role": "user", "content": "Hello"}],
+                    request_id="err",
+                    stream=False,
                 )
+            )
         self.assertIn("generator error", resp.error.message)
 
-        self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(return_value=np.array([1, 2]))
-        stream_mock = Mock(return_value="streamed")
-        with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
-            with patch(
-                "fastdeploy.entrypoints.openai.serving_chat.Request.from_generic_request",
-                return_value={"metrics": {}, "prompt_tokens": "pt", "max_tokens": 3},
-            ):
-                with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start") as mock_trace:
-                    with patch.object(self.chat_completion_handler, "chat_completion_stream_generator", stream_mock):
-                        result = await self.chat_completion_handler.create_chat_completion(
-                            ChatCompletionRequest(
-                                messages=[{"role": "user", "content": "Hello"}],
-                                user="user",
-                                stream=True,
-                            )
-                        )
-        self.assertEqual(result, "streamed")
-        self.assertTrue(mock_trace.call_args.kwargs["rid"].startswith("chatcmpl-user-"))
-        self.assertEqual(stream_mock.call_args.args[3], [1, 2])
-
     async def test_create_chat_completion_full_and_waiting_errors(self):
         """Test full generator error and waiting error handling."""
         self.chat_completion_handler.engine_client.is_master = True
@@ -361,15 +338,14 @@ async def test_create_chat_completion_full_and_waiting_errors(self):
         self.chat_completion_handler.engine_client.semaphore.status = Mock(return_value="ok")
 
         self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(return_value=[1, 2])
-        with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            with patch.object(
-                self.chat_completion_handler,
-                "chat_completion_full_generator",
-                AsyncMock(side_effect=RuntimeError("boom")),
-            ):
-                resp = await self.chat_completion_handler.create_chat_completion(
-                    ChatCompletionRequest(messages=[{"role": "user", "content": "Hello"}], stream=False)
-                )
+        with patch.object(
+            self.chat_completion_handler,
+            "chat_completion_full_generator",
+            AsyncMock(side_effect=RuntimeError("boom")),
+        ):
+            resp = await self.chat_completion_handler.create_chat_completion(
+                ChatCompletionRequest(messages=[{"role": "user", "content": "Hello"}], stream=False)
+            )
         self.assertIn("full generator error", resp.error.message)
 
         with patch(
diff --git a/tests/entrypoints/test_serving_completion.py b/tests/entrypoints/test_serving_completion.py
index b76d798afc5..9c2beb678df 100644
--- a/tests/entrypoints/test_serving_completion.py
+++ b/tests/entrypoints/test_serving_completion.py
@@ -20,7 +20,6 @@
 import numpy as np
 import paddle
 
-import fastdeploy.envs as envs
 import fastdeploy.metrics.trace as tracing
 from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from fastdeploy.utils import ErrorCode, ParameterError
@@ -124,40 +123,21 @@ async def test_create_completion_branches(self):
         ec = _make_engine_client()
         ec.format_and_add_data = AsyncMock(side_effect=ParameterError("max_tokens", "bad"))
         serving = OpenAIServingCompletion(ec, None, "pid", None, -1)
-        with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            res = await _assert_error(self, serving, _make_request(prompt_token_ids=[1, 2]), param="max_tokens")
+        res = await _assert_error(self, serving, _make_request(prompt_token_ids=[1, 2]), param="max_tokens")
         ec.semaphore.release.assert_called_once()
         ec = _make_engine_client()
-        ec.format_and_add_data = AsyncMock(side_effect=ValueError("bad"))
-        serving = OpenAIServingCompletion(ec, None, "pid", None, -1)
-
-        def fake_from_generic_request(_, request_id):
-            return {"prompt": "hi", "request_id": request_id, "prompt_tokens": [1], "max_tokens": 2, "metrics": {}}
-
-        with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
-            with patch(
-                "fastdeploy.entrypoints.openai.serving_completion.Request.from_generic_request",
-                side_effect=fake_from_generic_request,
-            ):
-                await _assert_error(self, serving, _make_request(prompt="hi"), code=ErrorCode.INVALID_VALUE)
-        ec = _make_engine_client()
         ec.format_and_add_data = AsyncMock(return_value=np.array([1, 2]))
         serving = OpenAIServingCompletion(ec, None, "pid", None, -1)
-        with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            with patch.object(serving, "completion_full_generator", AsyncMock(side_effect=RuntimeError("boom"))):
-                await _assert_error(
-                    self, serving, _make_request(prompt="hi"), contains="completion_full_generator error"
-                )
+        with patch.object(serving, "completion_full_generator", AsyncMock(side_effect=RuntimeError("boom"))):
+            await _assert_error(self, serving, _make_request(prompt="hi"), contains="completion_full_generator error")
         serving = OpenAIServingCompletion(_make_engine_client(), None, "pid", None, -1)
-        with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            with patch.object(serving, "completion_stream_generator", return_value="streamed"):
-                res = await serving.create_completion(_make_request(request_id="req123", stream=True))
+        with patch.object(serving, "completion_stream_generator", return_value="streamed"):
+            res = await serving.create_completion(_make_request(request_id="req123", stream=True))
         self.assertEqual(res, "streamed")
         serving = OpenAIServingCompletion(_make_engine_client(), None, "pid", None, -1)
-        with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            await _assert_error(
-                self, serving, _StreamRaiser(**_make_request().__dict__), contains="create_completion error"
-            )
+        await _assert_error(
+            self, serving, _StreamRaiser(**_make_request().__dict__), contains="create_completion error"
+        )
 
     async def test_completion_full_generator_branches(self):
         ec = _make_engine_client()
diff --git a/tests/input/test_preprocess.py b/tests/input/test_preprocess.py
index b4659261a8e..4196b729fbb 100644
--- a/tests/input/test_preprocess.py
+++ b/tests/input/test_preprocess.py
@@ -63,10 +63,8 @@ def test_create_processor_text_normal_path(self):
         mock_dp = MagicMock()
         with (
             patch.dict("sys.modules", {"fastdeploy.plugins": None, "fastdeploy.plugins.input_processor": None}),
-            patch("fastdeploy.input.preprocess.envs") as mock_envs,
             patch("fastdeploy.input.text_processor.DataProcessor", return_value=mock_dp),
         ):
-            mock_envs.ENABLE_V1_DATA_PROCESSOR = False
             pp.create_processor()
 
         self.assertIs(pp.processor, mock_dp)
diff --git a/tests/input/v1/test_ernie4_5_processor.py b/tests/input/v1/test_ernie4_5_processor.py
deleted file mode 100644
index 13cfdb10747..00000000000
--- a/tests/input/v1/test_ernie4_5_processor.py
+++ /dev/null
@@ -1,448 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-
-from fastdeploy.engine.request import Request, RequestOutput
-
-MODULE_PATH = "fastdeploy.input.v1.ernie4_5_processor"
-
-from fastdeploy.input.v1.ernie4_5_processor import _SAMPLING_EPS, Ernie4_5Processor
-
-
-class MockTokenizer:
-    """A simple mock tokenizer used to simulate tokenization behavior in unit tests."""
-
-    def __init__(self):
-        self.bos_token = "<bos>"
-        self.bos_token_id = 101
-        self.eos_token = "<eos>"
-        self.eos_token_id = 102
-        self.pad_token_id = 0
-        self.vocab_size = 200
-        # Non-None value indicates chat_template support
-        self.chat_template = "dummy"
-
-    def tokenize(self, text):
-        """Return multi-token output for 'multi*' to test branching; otherwise return single-token."""
-        if text.startswith("multi"):
-            return ["multi", "word"]
-        return [text]
-
-    def convert_tokens_to_ids(self, tokens):
-        """Map tokens to synthetic IDs for branch coverage."""
-        mapping = {
-            "bad": 5,
-            " bad": 6,
-            "multi": 7,
-            "word": 8,
-            "oov": 250,
-            " oov": 251,
-            "hello": 9,
-            "REASON": 42,
-        }
-        return [mapping.get(t, 1) for t in tokens]
-
-    def decode(self, token_ids, **kwargs):
-        """Simple decode implementation returning a space-separated string."""
-        return " ".join(str(t) for t in token_ids)
-
-    def decode_token(self, token_ids, prefix_offset, read_offset):
-        """Incremental decode used to test streaming behavior."""
-        new_tokens = token_ids[read_offset:]
-        decode_str = " ".join(str(t) for t in new_tokens)
-        new_read_offset = len(token_ids)
-        return decode_str, prefix_offset, new_read_offset
-
-    def apply_chat_template(self, request_or_messages, tokenize, split_special_tokens, add_special_tokens, **kwargs):
-        """Minimal chat template implementation used by messages2ids."""
-        if isinstance(request_or_messages, dict) and "messages" in request_or_messages:
-            return " | ".join(m["content"] for m in request_or_messages["messages"])
-        return str(request_or_messages)
-
-
-class ErnieX1ReasoningParser:
-    """Mock reasoning parser to trigger reasoning-related branches during streaming."""
-
-    def __init__(self, tokenizer):
-        self.tokenizer = tokenizer
-
-    def extract_reasoning_content(self, full_text, response_dict, model_status):
-        """Extract reasoning content for non-streaming responses."""
-
-        class ReasoningContent:
-            def __init__(self):
-                self.reasoning_content = "mock_reasoning"
-                self.content = "mock_content"
-
-        return ReasoningContent()
-
-    def extract_reasoning_content_streaming(
-        self,
-        previous_texts,
-        full_text,
-        delta_text,
-        previous_token_ids,
-        all_token_ids,
-        delta_token_ids,
-        model_status,
-    ):
-        """Return a simple object with reasoning_content to cover reasoning branch."""
-
-        class ReasoningDelta:
-            def __init__(self, content):
-                self.reasoning_content = content
-
-        return ReasoningDelta(delta_text)
-
-
-class MockToolParser:
-    """Mock tool parser to cover tool-related branches in both normal and streaming responses."""
-
-    def __init__(self, tokenizer):
-        self.tokenizer = tokenizer
-
-    class ToolDelta:
-        """Simple container representing detected tool calls."""
-
-        def __init__(self):
-            self.tool_calls = [{"name": "fake_tool"}]
-            self.tools_called = True
-            self.content = "tool_content"
-
-    def extract_tool_calls(self, full_text, response_dict):
-        """Used in process_response and process_response_obj_normal."""
-        return MockToolParser.ToolDelta()
-
-    def extract_tool_calls_streaming(
-        self,
-        previous_texts,
-        full_text,
-        delta_text,
-        previous_token_ids,
-        all_token_ids,
-        delta_token_ids,
-        response_dict,
-    ):
-        """Used in process_response_obj_streaming."""
-        return MockToolParser.ToolDelta()
-
-
-class TestErnie4_5Processor(unittest.TestCase):
-    """Unit tests for Ernie4_5Processor focusing on preprocessing and postprocessing logic."""
-
-    def setUp(self):
-        """Patch external dependencies: tokenizer, generation config, eos token resolution."""
-        self.gen_patcher = patch(f"{MODULE_PATH}.GenerationConfig.from_pretrained", return_value=MagicMock())
-        self.tokenizer_patcher = patch(
-            f"{MODULE_PATH}.Ernie4_5Tokenizer.from_pretrained", side_effect=lambda path: MockTokenizer()
-        )
-        self.eos_patcher = patch(
-            "paddleformers.cli.utils.llm_utils.get_eos_token_id",
-            side_effect=lambda tokenizer, cfg: [tokenizer.eos_token_id],
-        )
-
-        self.gen_patcher.start()
-        self.tokenizer_patcher.start()
-        self.eos_patcher.start()
-
-    def tearDown(self):
-        """Stop all patches after each test."""
-        self.gen_patcher.stop()
-        self.tokenizer_patcher.stop()
-        self.eos_patcher.stop()
-
-    def _make_processor(self, reasoning=False, tool=False):
-        """Helper for creating a processor with optional reasoning/tool parser support."""
-        reasoning_cls = ErnieX1ReasoningParser if reasoning else None
-        tool_cls = MockToolParser if tool else None
-        proc = Ernie4_5Processor("dummy-model", reasoning_parser_obj=reasoning_cls, tool_parser_obj=tool_cls)
-        proc._apply_default_parameters = lambda req: req
-        proc.model_status_dict = {"req-1": "think_start"}
-        return proc
-
-    def test_update_bad_words(self):
-        """Verify filtering, multi-token skipping, and OOV behavior in update_bad_words."""
-        proc = self._make_processor()
-
-        bad_words = ["bad", "multi", "oov"]
-        token_ids = proc.update_bad_words(bad_words, bad_words_token_ids=None)
-
-        self.assertEqual(token_ids, [5, 6, 1])
-
-    def test_process_request_dict_with_prompt_string(self):
-        """Test prompt-based tokenization, truncation, and temperature/top_p correction."""
-        proc = self._make_processor()
-        req = {
-            "request_id": "test_0",
-            "prompt": "hello",
-            "temperature": 0.0,
-            "top_p": 0.0,
-        }
-        req = Request.from_dict(req)
-
-        processed = proc.process_request_dict(req, max_model_len=10)
-
-        self.assertTrue(hasattr(processed, "eos_token_ids"))
-        self.assertEqual(processed.eos_token_ids, [proc.tokenizer.eos_token_id])
-
-        expected_ids = proc.tokenizer.convert_tokens_to_ids(proc.tokenizer.tokenize("hello"))
-        self.assertEqual(processed.prompt_token_ids, expected_ids)
-
-        self.assertEqual(processed.sampling_params.max_tokens, max(1, 10 - len(expected_ids)))
-        self.assertEqual(processed.sampling_params.temperature, 1)
-        self.assertEqual(processed.sampling_params.top_k, 1)
-        self.assertAlmostEqual(processed.sampling_params.top_p, _SAMPLING_EPS)
-        self.assertEqual(processed.prompt_tokens, "hello")
-
-    def test_pad_batch_data_right_and_left_and_empty(self):
-        """Test left/right padding and empty input behavior."""
-        proc = self._make_processor()
-
-        insts = [[1, 2], [3]]
-
-        padded, seq_len = proc.pad_batch_data(
-            insts, pad_id=0, return_seq_len=True, return_array=True, pad_style="right"
-        )
-        np.testing.assert_array_equal(padded, np.array([[1, 2], [3, 0]], dtype=np.int64))
-        np.testing.assert_array_equal(seq_len, np.array([[2], [1]], dtype=np.int64))
-
-        padded_left, seq_len_left = proc.pad_batch_data(
-            insts, pad_id=0, return_seq_len=True, return_array=True, pad_style="left"
-        )
-        np.testing.assert_array_equal(padded_left, np.array([[1, 2], [0, 3]], dtype=np.int64))
-        np.testing.assert_array_equal(seq_len_left, np.array([[2], [1]], dtype=np.int64))
-
-        padded_empty, seq_len_empty = proc.pad_batch_data(
-            [], pad_id=0, return_seq_len=True, return_array=True, pad_style="right"
-        )
-        np.testing.assert_array_equal(padded_empty, np.array([[]], dtype=np.int64))
-        np.testing.assert_array_equal(seq_len_empty, np.array([], dtype=np.int64))
-
-    def test_process_response_obj_streaming_with_reasoning_and_tool(self):
-        """Ensure streaming mode handles reasoning and tool-call parsing correctly."""
-        proc = self._make_processor(reasoning=True, tool=True)
-
-        response = {
-            "finished": True,
-            "request_id": "req-1",
-            "outputs": {"token_ids": [10, 11]},
-        }
-        response = RequestOutput.from_dict(response)
-
-        result = proc.process_response_obj_streaming(response, enable_thinking=False, include_stop_str_in_output=False)
-
-        outputs = result.outputs
-
-        self.assertTrue(hasattr(outputs, "completion_tokens"))
-        self.assertTrue(hasattr(outputs, "text"))
-        self.assertEqual(outputs.completion_tokens, outputs.reasoning_content)
-
-        self.assertTrue(hasattr(outputs, "reasoning_token_num"))
-        self.assertGreaterEqual(outputs.reasoning_token_num, 0)
-
-        self.assertTrue(hasattr(outputs, "delta_message"))
-        delta_msg = outputs.delta_message
-        self.assertTrue(hasattr(delta_msg, "tool_calls"))
-
-        self.assertNotIn("req-1", proc.decode_status)
-        self.assertNotIn("req-1", proc.tool_parser_dict)
-
-    def test_update_stop_seq(self):
-        """Test stop sequence tokenization and padding."""
-        proc = self._make_processor()
-
-        stop_seqs, stop_lens = proc.update_stop_seq("stop")
-        self.assertIsInstance(stop_seqs, list)
-        self.assertIsInstance(stop_lens, list)
-
-        stop_seqs2, stop_lens2 = proc.update_stop_seq(["stop", "hello"])
-        self.assertEqual(len(stop_seqs2), 2)
-        self.assertEqual(len(stop_lens2), 2)
-
-    def test_process_request_chat_template_kwargs(self):
-        """Test chat_template_kwargs application inside process_request_dict."""
-
-        proc = self._make_processor()
-
-        request = {
-            "request_id": "test_0",
-            "messages": [{"role": "user", "content": "hello"}],
-            "temperature": 0.5,
-            "top_p": 0.5,
-        }
-        request = Request.from_dict(request)
-
-        processed = proc.process_request_dict(request, max_model_len=20, chat_template_kwargs={"extra": "VALUE"})
-
-        self.assertEqual(processed.eos_token_ids, [proc.tokenizer.eos_token_id])
-
-        expected_ids = proc.tokenizer.convert_tokens_to_ids(proc.tokenizer.tokenize("hello"))
-        self.assertIsNotNone(processed.prompt_token_ids)
-        self.assertEqual(processed.prompt_token_ids, expected_ids)
-
-        self.assertTrue(hasattr(processed.sampling_params, "max_tokens"))
-        self.assertEqual(processed.sampling_params.max_tokens, max(1, 20 - len(expected_ids)))
-
-    def test_process_request_dict_chat_template_kwargs(self):
-        """Test chat_template_kwargs insertion in process_request_dict."""
-        proc = self._make_processor()
-
-        req = {
-            "request_id": "test_0",
-            "messages": [{"role": "user", "content": "hey"}],
-            "chat_template_kwargs": {"A": "B"},
-            "temperature": 0.5,
-            "top_p": 0.5,
-        }
-        req = Request.from_dict(req)
-        req.chat_template_kwargs = {"A": "B"}
-
-        result = proc.process_request_dict(req, max_model_len=30)
-
-        self.assertTrue(hasattr(result, "prompt_token_ids"))
-        self.assertEqual(getattr(result, "A"), "B")
-
-    def test_init_generation_config_exception(self):
-        """Test fallback behavior when GenerationConfig loading fails."""
-        with patch(f"{MODULE_PATH}.GenerationConfig.from_pretrained", side_effect=Exception("fail")):
-            proc = self._make_processor()
-            self.assertIsNone(proc.generation_config)
-
-    # def test_process_response_with_tool_parser(self):
-    #     """Verify tool_call extraction in process_response."""
-    #     proc = self._make_processor(tool=True)
-
-    #     class RespObj:
-    #         """Mock response carrying token_ids and index for testing."""
-
-    #         def __init__(self):
-    #             self.request_id = "reqx"
-    #             self.outputs = MagicMock()
-    #             self.outputs.token_ids = [9, proc.tokenizer.eos_token_id]
-    #             self.outputs.index = 0
-
-    #     resp = RespObj()
-    #     result = proc.process_response(resp)
-
-    #     self.assertTrue(hasattr(result.outputs, "tool_calls"))
-    #     self.assertEqual(result.outputs.tool_calls[0]["name"], "fake_tool")
-
-    def test_process_response_obj_normal_with_tool(self):
-        """Verify tool_call extraction in normal (non-streaming) response mode."""
-        proc = self._make_processor(tool=True)
-
-        resp = {
-            "finished": True,
-            "request_id": "task-99",
-            "outputs": {"token_ids": [10, 11], "text": ""},
-        }
-        resp = RequestOutput.from_dict(resp)
-
-        result = proc.process_response_obj_normal(resp, enable_thinking=False, include_stop_str_in_output=False)
-
-        self.assertTrue(hasattr(result.outputs, "tool_calls"))
-        self.assertEqual(result.outputs.tool_calls[0]["name"], "fake_tool")
-
-    def test_process_request_greedy_sets_top_k(self):
-        """process_request with temperature=0 should set top_k=1 for greedy decoding."""
-        proc = self._make_processor()
-        proc.messages2ids = MagicMock(return_value=[9])
-
-        request = Request.from_dict(
-            {
-                "request_id": "test_greedy",
-                "prompt": "hello",
-                "temperature": 0.0,
-                "top_p": 0.5,
-            }
-        )
-        result = proc.process_request(request, max_model_len=10)
-        self.assertEqual(result.get("temperature"), 1)
-        self.assertEqual(result.get("top_k"), 1)
-
-    def test_process_request(self):
-        """Test process_request method with various input types."""
-        proc = self._make_processor()
-
-        # Test with prompt string
-        request = Request.from_dict(
-            {
-                "request_id": "test_1",
-                "prompt": "hello",
-                "temperature": 0.5,
-                "top_p": 0.5,
-            }
-        )
-        result = proc.process_request(request, max_model_len=10)
-        self.assertEqual(result.prompt_token_ids, [9])
-
-        # Test with prompt token ids
-        request = Request.from_dict(
-            {
-                "request_id": "test_2",
-                "prompt_token_ids": [1, 2, 3],
-                "temperature": 0.5,
-                "top_p": 0.5,
-            }
-        )
-        result = proc.process_request(request, max_model_len=5)
-        self.assertEqual(result.prompt_token_ids, [1, 2, 3])
-
-        # Test with messages - mock the messages2ids method to avoid dict attribute error
-        proc.messages2ids = MagicMock(return_value=[9])
-        request = Request.from_dict(
-            {
-                "request_id": "test_3",
-                "messages": [{"role": "user", "content": "hello"}],
-                "temperature": 0.5,
-                "top_p": 0.5,
-            }
-        )
-        result = proc.process_request(request, max_model_len=10)
-        self.assertEqual(result.prompt_token_ids, [9])
-        proc.messages2ids.assert_called_once()
-
-    def test_process_response(self):
-        """Test process_response method with various scenarios."""
-        # Test without reasoning parser to avoid model_status_dict dependency
-        proc = self._make_processor(reasoning=False)
-
-        # Test basic response
-        response = RequestOutput.from_dict(
-            {"request_id": "test_4", "outputs": {"token_ids": [10, 11, proc.tokenizer.eos_token_id], "index": 0}}
-        )
-        result = proc.process_response(response)
-        self.assertEqual(result.outputs.text, "10 11")
-
-        # Test another response to ensure consistency
-        response = RequestOutput.from_dict(
-            {"request_id": "test_5", "outputs": {"token_ids": [20, 21, proc.tokenizer.eos_token_id], "index": 0}}
-        )
-        result = proc.process_response(response)
-        self.assertEqual(result.outputs.text, "20 21")
-
-        # Test response without eos_token at the end
-        response = RequestOutput.from_dict({"request_id": "test_6", "outputs": {"token_ids": [30, 31], "index": 0}})
-        result = proc.process_response(response)
-        self.assertEqual(result.outputs.text, "30 31")
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/input/v1/test_ernie_processor.py b/tests/input/v1/test_ernie_processor.py
deleted file mode 100644
index 437e4029a5d..00000000000
--- a/tests/input/v1/test_ernie_processor.py
+++ /dev/null
@@ -1,162 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from unittest.mock import MagicMock, patch
-
-from fastdeploy.engine.request import Request, RequestOutput
-from fastdeploy.input.v1.ernie4_5_processor import Ernie4_5Processor
-
-
-class MockReasoningParser:
-    def get_model_status(self, prompt_token_ids):
-        return "think_start"
-
-
-class TestErnie4_5ProcessorProcessResponseDictStreaming(unittest.TestCase):
-    def setUp(self):
-        # 创建 Ernie4_5Processor 实例的模拟对象
-        with patch.object(Ernie4_5Processor, "__init__", return_value=None) as mock_init:
-            self.processor = Ernie4_5Processor("model_path")
-            mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}")
-
-        # 设置必要的属性
-        self.processor.tokenizer = MagicMock()
-        self.processor.tokenizer.eos_token_id = 1
-        self.processor.decode_status = {"test": []}
-        self.processor.reasoning_end_dict = {}
-        self.processor.tool_parser_dict = {}
-        self.processor.generation_config = MagicMock()
-        self.processor.eos_token_ids = [1]
-        self.processor.reasoning_parser = MockReasoningParser()
-        self.processor.model_status_dict = {"request-id_0": "think_start", "test": "think_start"}
-
-        # 模拟 ids2tokens 方法
-        def mock_ids2tokens(token_ids, task_id):
-            self.processor.decode_status[task_id] = "mock_decode_status"
-            return "delta_text", [2, 3], "previous_texts"
-
-        self.processor.ids2tokens = mock_ids2tokens
-
-        def mock_messages2ids(request, **kwargs):
-            if "chat_template" in kwargs:
-                return [1]
-            else:
-                return [0]
-
-        def mock_apply_default_parameters(request):
-            return request
-
-        self.processor.messages2ids = mock_messages2ids
-        self.processor._apply_default_parameters = mock_apply_default_parameters
-
-        # 模拟推理解析器
-        self.mock_reasoning_parser = MagicMock()
-        self.mock_reasoning_parser.__class__.__name__ = "ErnieX1ReasoningParser"
-        # self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = ("reasoning", "text")
-        self.processor.reasoning_parser = self.mock_reasoning_parser
-
-        # 模拟工具解析器
-        self.mock_tool_parser = MagicMock()
-        self.mock_tool_parser.extract_tool_calls_streaming.return_value = None
-        self.mock_tool_parser_obj = MagicMock()
-        self.mock_tool_parser_obj.return_value = self.mock_tool_parser
-        self.processor.tool_parser_obj = self.mock_tool_parser_obj
-
-    def test_process_response_obj_streaming_normal_case(self):
-        """测试正常情况下的流式响应处理"""
-        # 准备输入
-        response_dict = {"finished": False, "request_id": "test", "outputs": {"token_ids": [4, 5]}}
-        kwargs = {"enable_thinking": True}
-        response = RequestOutput.from_dict(response_dict)
-
-        # 调用方法
-        result = self.processor.process_response_obj_streaming(response, **kwargs)
-
-        # 验证结果
-        self.assertEqual(result.outputs.completion_tokens, "delta_text")
-
-    def test_process_request_dict(self):
-        request_dict = {
-            "request_id": "123",
-            "messages": [{"role": "user", "content": "Hello!"}],
-            "chat_template_kwargs": {"chat_template": "Hello!"},
-            "eos_token_ids": [1],
-            "temperature": 1,
-            "top_p": 1,
-        }
-        request = Request.from_dict(request_dict)
-        request.chat_template_kwargs = {"chat_template": "Hello!"}
-        result = self.processor.process_request_dict(request, 100)
-        self.assertEqual(result.prompt_token_ids, [1])
-
-    def test_process_response_obj_normal(self):
-        mock_tokens = ["reasoning", "token", "list"]
-        self.processor.tokenizer.tokenize = MagicMock(return_value=mock_tokens)
-        self.processor.reasoning_parser.extract_reasoning_content = MagicMock(
-            return_value=("Mock reasoning content", "Mock final text")
-        )
-
-        self.processor.tool_parser_obj = None
-
-        response_dict = {
-            "request_id": "request-id_0",
-            "outputs": {"token_ids": [2, 3, 4, 5, 1], "text": "Initial text", "top_logprobs": []},
-            # "finish_reason": "stop",
-            "finished": True,
-        }
-        response = RequestOutput.from_dict(response_dict)
-        kwargs = {"enable_thinking": True}
-
-        with patch("fastdeploy.input.ernie4_5_processor.data_processor_logger"):
-            result = self.processor.process_response_obj_normal(response, **kwargs)
-
-        self.mock_reasoning_parser.extract_reasoning_content.assert_called_once()
-        self.assertEqual(result.outputs.reasoning_content, "Mock reasoning content")
-        self.assertEqual(result.outputs.reasoning_token_num, len(mock_tokens))
-        self.assertEqual(result.outputs.text, "Mock final text")
-        self.assertTrue(hasattr(result.outputs, "completion_tokens"))
-
-    def test_think_status(self):
-        """测试 思考机制"""
-        request = {
-            "prompt": "hello",
-            "request_id": "test_1",
-            "prompt_token_ids": [1, 2, 3],
-            "temperature": 0.7,
-            "top_p": 0.9,
-        }
-        request = Request.from_dict(request)
-        self.processor.reasoning_parser = MagicMock()
-        self.processor.reasoning_parser.get_model_status.return_value = "think_start"
-        self.processor.model_status_dict = {}
-        self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(request.enable_thinking, True)
-
-        request = {
-            "prompt": "hello",
-            "request_id": "test",
-            "prompt_token_ids": [1, 2, 3],
-            "temperature": 0.7,
-            "top_p": 0.9,
-        }
-        request = Request.from_dict(request)
-        self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(request.enable_thinking, True)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/input/v1/test_ernie_vl_processor.py b/tests/input/v1/test_ernie_vl_processor.py
deleted file mode 100644
index 132b423f3cb..00000000000
--- a/tests/input/v1/test_ernie_vl_processor.py
+++ /dev/null
@@ -1,1460 +0,0 @@
-﻿"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from types import SimpleNamespace
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.engine.request import CompletionOutput, Request, RequestOutput
-from fastdeploy.engine.sampling_params import SamplingParams
-from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.input.v1.ernie4_5_vl_processor import Ernie4_5_VLProcessor
-from fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import (
-    AdaptiveImageProcessor,
-)
-from fastdeploy.input.v1.ernie4_5_vl_processor.process import DataProcessor
-
-
-class MockReasoningParser:
-    def get_model_status(self, prompt_token_ids):
-        return "think_start"
-
-
-class TestErnie4_5VLProcessorProcessResponseDictStreaming(unittest.TestCase):
-    def setUp(self):
-        # Create mock object for Ernie4_5Processor instance
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None) as mock_init:
-            self.processor = Ernie4_5_VLProcessor("model_path")
-            mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}")
-
-        # Set necessary attributes
-        self.processor.tokenizer = MagicMock()
-        self.processor.tokenizer.eos_token_id = 1
-        self.processor.decode_status = {"test": []}
-        self.processor.reasoning_end_dict = {}
-        self.processor.tool_parser_dict = {}
-        self.processor.generation_config = MagicMock()
-        self.processor.eos_token_ids = [1]
-        self.processor.reasoning_parser = MockReasoningParser()
-        self.processor.model_status_dict = {"test": "think_start"}
-        self.processor.ernie4_5_processor = MagicMock()
-
-        # Mock ids2tokens method
-        def mock_ids2tokens(token_ids, task_id):
-            return "delta_text", [2, 3], "previous_texts"
-
-        self.processor.ids2tokens = mock_ids2tokens
-
-        def mock_request2ids(request, **kwargs):
-            return {"input_ids": np.array([1, 2, 3]), "prompt_token_ids": [0]}
-
-        def mock_check_mm_limits(item):
-            pass
-
-        def mock_apply_default_parameters(request):
-            return request
-
-        def mock_pack_outputs(outputs):
-            # Ensure input_ids is numpy array if it exists
-            result = outputs.copy() if isinstance(outputs, dict) else outputs
-            if isinstance(result, dict):
-                if "input_ids" in result and isinstance(result["input_ids"], list):
-                    result["input_ids"] = np.array(result["input_ids"])
-                if "token_type_ids" in result and isinstance(result["token_type_ids"], list):
-                    result["token_type_ids"] = np.array(result["token_type_ids"])
-                if "position_ids" in result and isinstance(result["position_ids"], list):
-                    result["position_ids"] = np.array(result["position_ids"])
-            return result
-
-        def mock_prompt_token_ids2outputs(request):
-            return {
-                "input_ids": np.array([1, 1, 1]),
-                "token_type_ids": np.array([0, 0, 0]),
-                "position_ids": np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]),
-                "images": [],
-                "grid_thw": [],
-                "image_type_ids": [],
-                "cur_position": 3,
-            }
-
-        self.processor._apply_default_parameters = mock_apply_default_parameters
-        self.processor._check_mm_limits = mock_check_mm_limits
-        self.processor.ernie4_5_processor.request2ids = mock_request2ids
-        self.processor.ernie4_5_processor.prompt_token_ids2outputs = mock_prompt_token_ids2outputs
-        self.processor.pack_outputs = mock_pack_outputs
-
-        # Mock reasoning parser
-        self.mock_reasoning_parser = MagicMock()
-        self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = None
-        self.processor.reasoning_parser = self.mock_reasoning_parser
-
-        # Mock tool parser
-        self.mock_tool_parser = MagicMock()
-        self.mock_tool_parser.extract_tool_calls_streaming.return_value = None
-        self.mock_tool_parser_obj = MagicMock()
-        self.mock_tool_parser_obj.return_value = self.mock_tool_parser
-        self.processor.tool_parser_obj = self.mock_tool_parser_obj
-
-    def test_think_status(self):
-        """测试 思考机制"""
-        request = {
-            "prompt": "hello",
-            "request_id": "test_1",
-            "prompt_token_ids": [1, 2, 3],
-        }
-        request = Request.from_dict(request)
-        self.processor.reasoning_parser = MagicMock()
-        self.processor.reasoning_parser.get_model_status.return_value = "think_start"
-        self.processor.model_status_dict = {}
-        self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(request.enable_thinking, True)
-
-        request = {
-            "prompt": "hello",
-            "request_id": "test",
-            "prompt_token_ids": [1, 2, 3],
-        }
-        request = Request.from_dict(request)
-        self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(request.enable_thinking, True)
-
-    def test_init(self):
-        """Test __init__ method"""
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.data_processor_logger"):
-            mock_dp = MagicMock()
-            mock_dp.image_patch_id = 1001
-            mock_dp.spatial_conv_size = 14
-            mock_dp.tokenizer = MagicMock()
-            mock_dp.tokenizer.pad_token_id = 0
-            mock_dp.eval = MagicMock()
-
-            with patch(
-                "fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.DataProcessor"
-            ) as mock_dp_class:
-                mock_dp_class.return_value = mock_dp
-                with patch(
-                    "fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.GenerationConfig"
-                ) as mock_gen_config:
-                    mock_gen_config.from_pretrained.return_value = MagicMock()
-                    with patch("paddleformers.cli.utils.llm_utils.get_eos_token_id") as mock_get_eos:
-                        mock_get_eos.return_value = [1, 2]
-
-                        # Test normal initialization
-                        mock_reasoning_parser_class = MagicMock()
-                        processor = Ernie4_5_VLProcessor(
-                            "model_path",
-                            limit_mm_per_prompt={"image": 2, "video": 1},
-                            mm_processor_kwargs={"spatial_conv_size": 14},
-                            reasoning_parser_obj=lambda tokenizer: mock_reasoning_parser_class,
-                            tool_parser_obj=MagicMock(),
-                            enable_processor_cache=True,
-                        )
-
-                        self.assertEqual(processor.image_patch_id, 1001)
-                        self.assertEqual(processor.spatial_conv_size, 14)
-                        self.assertIsNotNone(processor.tokenizer)
-                        self.assertIsNotNone(processor.generation_config)
-                        self.assertEqual(processor.eos_token_ids, [1, 2])
-                        self.assertEqual(processor.limit_mm_per_prompt["image"], 2)
-                        self.assertEqual(processor.limit_mm_per_prompt["video"], 1)
-                        mock_dp.eval.assert_called_once()
-
-                        # Test with generation config exception
-                        mock_gen_config.from_pretrained.side_effect = Exception("Config not found")
-                        processor2 = Ernie4_5_VLProcessor("model_path")
-                        self.assertIsNone(processor2.generation_config)
-
-                        # Test with reasoning_parser_obj
-                        mock_reasoning_parser = MagicMock()
-                        processor3 = Ernie4_5_VLProcessor(
-                            "model_path", reasoning_parser_obj=lambda tokenizer: mock_reasoning_parser
-                        )
-                        self.assertIsNotNone(processor3.reasoning_parser)
-
-    def test_parse_processor_kwargs(self):
-        """Test _parse_processor_kwargs with various inputs"""
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
-            processor = Ernie4_5_VLProcessor("model_path")
-            processor._parse_processor_kwargs = Ernie4_5_VLProcessor._parse_processor_kwargs.__get__(
-                processor, Ernie4_5_VLProcessor
-            )
-
-            # Test with valid kwargs
-            valid_kwargs = {
-                "spatial_conv_size": 14,
-                "temporal_conv_size": 2,
-                "image_min_pixels": 1000,
-                "image_max_pixels": 10000,
-            }
-            result = processor._parse_processor_kwargs(valid_kwargs)
-            self.assertEqual(result, valid_kwargs)
-
-            # Test with invalid type (implementation catches exception and returns empty dict)
-            invalid_kwargs = {"spatial_conv_size": "invalid"}  # Should be int
-            result = Ernie4_5_VLProcessor._parse_processor_kwargs(processor, invalid_kwargs)
-            self.assertEqual(result, {})
-
-            # Test with non-dict input (implementation catches exception and returns empty dict)
-            result = Ernie4_5_VLProcessor._parse_processor_kwargs(processor, "not a dict")
-            self.assertEqual(result, {})
-
-            # Test exception handling with None
-            with patch("fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.data_processor_logger"):
-                result = processor._parse_processor_kwargs(None)
-                self.assertEqual(result, {})
-
-    def test_parse_limits(self):
-        """Test _parse_limits with various inputs"""
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
-            processor = Ernie4_5_VLProcessor("model_path")
-            processor._parse_limits = Ernie4_5_VLProcessor._parse_limits.__get__(processor, Ernie4_5_VLProcessor)
-
-            # Test with valid limits
-            valid_limits = {"image": 5, "video": 3}
-            result = processor._parse_limits(valid_limits)
-            self.assertEqual(result["image"], 5)
-            self.assertEqual(result["video"], 3)
-            self.assertEqual(result["audio"], 1)  # Default value
-
-            # Test with empty input (None)
-            result = processor._parse_limits(None)
-            self.assertEqual(result["image"], 1)
-            self.assertEqual(result["video"], 1)
-            self.assertEqual(result["audio"], 1)
-
-            # Test with invalid type (implementation catches exception and returns default limits)
-            result = Ernie4_5_VLProcessor._parse_limits(processor, "not a dict")
-            self.assertEqual(result["image"], 1)
-            self.assertEqual(result["video"], 1)
-            self.assertEqual(result["audio"], 1)
-
-    def test_check_mm_limits(self):
-        """Test _check_mm_limits with various inputs"""
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
-            processor = Ernie4_5_VLProcessor("model_path")
-            processor._check_mm_limits = Ernie4_5_VLProcessor._check_mm_limits.__get__(processor, Ernie4_5_VLProcessor)
-
-            # Test with dict input (should not raise)
-            processor.limit_mm_per_prompt = {"image": 2, "video": 1}
-            mm_data = {"image": [1, 2], "video": [1]}
-            processor._check_mm_limits(mm_data)
-
-            # Test with messages input (should not raise)
-            messages = [
-                {"role": "user", "content": [{"type": "image", "data": "img1"}]},
-                {"role": "user", "content": [{"type": "video", "data": "vid1"}]},
-            ]
-            processor._check_mm_limits(messages)
-
-            # Test when limit is exceeded (should raise ValueError)
-            processor.limit_mm_per_prompt = {"image": 1, "video": 1}
-            mm_data = {"image": [1, 2, 3], "video": []}  # 3 images, limit is 1
-            with self.assertRaises(ValueError) as context:
-                processor._check_mm_limits(mm_data)
-            self.assertIn("Too many image items", str(context.exception))
-
-    def test_process_request_dict(self):
-        """Test process_request_dict method"""
-        # from fastdeploy.engine.request import Request
-
-        # Mock the process_request_dict method
-        self.processor.process_request_dict = MagicMock()
-
-        # Create a mock Request object
-        mock_request = MagicMock(spec=Request)
-        mock_request.to_dict.return_value = {"messages": [{"role": "user", "content": "Hello"}]}
-
-        # Mock Request.from_dict to return a mock request
-        with patch.object(Request, "from_dict") as mock_from_dict:
-            mock_result_request = MagicMock(spec=Request)
-            mock_from_dict.return_value = mock_result_request
-
-            self.processor.process_request_dict(mock_request, max_model_len=100, chat_template_kwargs={"key": "value"})
-
-            # Verify process_request_dict was called
-            self.processor.process_request_dict.assert_called_once()
-
-    def test_get_pad_id(self):
-        """Test get_pad_id method"""
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
-            processor = Ernie4_5_VLProcessor("model_path")
-            processor.tokenizer = MagicMock()
-            processor.tokenizer.pad_token_id = 100
-            processor.get_pad_id = Ernie4_5_VLProcessor.get_pad_id.__get__(processor, Ernie4_5_VLProcessor)
-
-            result = processor.get_pad_id()
-            self.assertEqual(result, 100)
-
-    def test_load_tokenizer(self):
-        """Test _load_tokenizer method"""
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
-            processor = Ernie4_5_VLProcessor("model_path")
-            mock_tokenizer = MagicMock()
-            processor.ernie4_5_processor = MagicMock()
-            processor.ernie4_5_processor.tokenizer = mock_tokenizer
-            processor._load_tokenizer = Ernie4_5_VLProcessor._load_tokenizer.__get__(processor, Ernie4_5_VLProcessor)
-
-            processor._load_tokenizer()
-            self.assertEqual(processor.tokenizer, mock_tokenizer)
-
-    def test_append_completion_tokens(self):
-        """Test append_completion_tokens method"""
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
-            processor = Ernie4_5_VLProcessor("model_path")
-            processor.append_completion_tokens = Ernie4_5_VLProcessor.append_completion_tokens.__get__(
-                processor, Ernie4_5_VLProcessor
-            )
-
-            multimodal_inputs = {
-                "input_ids": [1, 2, 3],
-                "token_type_ids": [0, 0, 0],
-                "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
-                "cur_position": 3,
-            }
-            completion_token_ids = [10, 11, 12]
-
-            processor.append_completion_tokens(multimodal_inputs, completion_token_ids)
-
-            self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 10, 11, 12])
-            self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0, 0])
-            self.assertEqual(len(multimodal_inputs["position_ids"]), 6)
-            self.assertEqual(multimodal_inputs["cur_position"], 6)
-
-    def test_pack_outputs(self):
-        """Test pack_outputs with and without images"""
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
-            processor = Ernie4_5_VLProcessor("model_path")
-            processor.image_patch_id = 1001
-            processor.ernie4_5_processor = SimpleNamespace(mm_num_tokens=lambda **kwargs: 123)
-            processor.pack_outputs = Ernie4_5_VLProcessor.pack_outputs.__get__(processor, Ernie4_5_VLProcessor)
-            # Test with images
-            outs_with_images = {
-                "input_ids": [1, 2, 3],
-                "token_type_ids": [0, 0, 0],
-                "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
-                "images": [np.array([[1, 2], [3, 4]])],
-                "grid_thw": [np.array([[1, 2, 2]])],
-                "image_type_ids": [0],
-            }
-
-            result = processor.pack_outputs(outs_with_images)
-            self.assertIsNotNone(result["images"])
-            self.assertIsNotNone(result["grid_thw"])
-            self.assertIsNotNone(result["image_type_ids"])
-            self.assertEqual(result["image_patch_id"], 1001)
-            self.assertIsInstance(result["input_ids"], np.ndarray)
-            self.assertIsInstance(result["token_type_ids"], np.ndarray)
-            self.assertIsInstance(result["position_ids"], np.ndarray)
-
-            # Test without images
-            outs_without_images = {
-                "input_ids": [1, 2, 3],
-                "token_type_ids": [0, 0, 0],
-                "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
-                "images": [],
-                "grid_thw": [],
-                "image_type_ids": [],
-            }
-
-            result = processor.pack_outputs(outs_without_images)
-            self.assertIsNone(result["images"])
-            self.assertIsNone(result["grid_thw"])
-            self.assertIsNone(result["image_type_ids"])
-
-    def test_process_response_dict(self):
-        """Test process_response_dict with different parameters"""
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
-            processor = Ernie4_5_VLProcessor("model_path")
-            processor.process_response_dict = Ernie4_5_VLProcessor.process_response_dict.__get__(
-                processor, Ernie4_5_VLProcessor
-            )
-
-            response = RequestOutput(
-                request_id="test_0",
-                outputs=CompletionOutput(text="response", index=0, send_idx=0, token_ids=[1, 2, 3]),
-            )
-            # Test with stream=True
-            processor.process_response_obj_streaming = MagicMock(return_value=response)
-            response_obj = RequestOutput(request_id="test_0")
-            result = processor.process_response_dict(response_obj, stream=True)
-            processor.process_response_obj_streaming.assert_called_once()
-            self.assertEqual(result, response)
-
-            # Test with stream=False
-            processor.process_response_obj_normal = MagicMock(return_value=response)
-            response_obj = RequestOutput(request_id="test_0")
-            result = processor.process_response_dict(response_obj, stream=False)
-            processor.process_response_obj_normal.assert_called_once()
-            self.assertEqual(result, response)
-
-    def test_apply_default_parameters(self):
-        """Test _apply_default_parameters with dict and object request"""
-        with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
-            processor = Ernie4_5_VLProcessor("model_path")
-            processor.generation_config = MagicMock()
-            processor.generation_config.top_p = 0.8
-            processor.generation_config.temperature = 0.9
-            processor._apply_default_parameters = Ernie4_5_VLProcessor._apply_default_parameters.__get__(
-                processor, Ernie4_5_VLProcessor
-            )
-
-            # Test with dict request
-            request = Request(request_id="test_0")
-            request.sampling_params = SamplingParams()
-            result = processor._apply_default_parameters(request)
-            self.assertEqual(result.sampling_params.top_p, 0.8)
-            self.assertEqual(result.sampling_params.temperature, 0.9)
-
-            # Test with object request
-            class MockRequest:
-                def __init__(self):
-                    self.sampling_params = SamplingParams()
-                    self.sampling_params.top_p = None
-                    self.sampling_params.temperature = None
-
-                def get(self, key):
-                    return getattr(self.sampling_params, key, None)
-
-                def set(self, key, value):
-                    setattr(self.sampling_params, key, value)
-
-            request = MockRequest()
-            result = processor._apply_default_parameters(request)
-            self.assertEqual(result.sampling_params.top_p, 0.8)
-
-
-class TestDataProcessorTargetMethods(unittest.TestCase):
-    def setUp(self):
-        self.mock_tokenizer = MagicMock(spec=Ernie4_5Tokenizer)
-        self.mock_tokenizer.ignored_index = -100
-        self.mock_tokenizer.convert_tokens_to_ids.side_effect = self._mock_convert_tokens_to_ids
-        self.mock_tokenizer.chat_template = "mock_template"
-        self.mock_tokenizer.apply_chat_template.return_value = "User: Hello<|image@placeholder|>"
-        # Mock encode method for _add_text
-        self.mock_tokenizer.encode = MagicMock(return_value={"input_ids": [1, 2, 3]})
-
-        def mock_load_tokenizer(dp_instance):
-            dp_instance.tokenizer = self.mock_tokenizer
-
-        with patch.object(DataProcessor, "_load_tokenizer", side_effect=mock_load_tokenizer, autospec=True):
-            with patch.object(AdaptiveImageProcessor, "from_pretrained") as mock_image_preprocessor:
-                mock_image_preprocessor.return_value = MagicMock()
-                self.data_processor = DataProcessor(
-                    tokenizer_name="mock_tokenizer",
-                    image_preprocessor_name="mock_image_preprocessor",
-                    enable_processor_cache=False,
-                )
-        self.data_processor.image_patch_id = 1001
-        self.data_processor.image_start_id = 1002
-        self.data_processor.image_end_id = 1003
-        self.data_processor.video_start_id = 1004
-        self.data_processor.video_end_id = 1005
-        self.data_processor.role_prefixes = {"user": "User: ", "assistant": "Assistant: "}
-        self.data_processor.enable_processor_cache = False
-        # Note: extract_mm_items is not mocked by default, only when needed
-        self.data_processor.extract_mm_items = MagicMock(return_value=([], [], [], [], None, [], []))
-
-    def _restore_real_extract_mm_items(self):
-        """Helper method to restore real extract_mm_items method for testing"""
-        from fastdeploy.input.v1.ernie4_5_vl_processor.process import DataProcessor
-
-        original_extract_mm_items = DataProcessor.extract_mm_items
-        self.data_processor.extract_mm_items = original_extract_mm_items.__get__(self.data_processor, DataProcessor)
-
-    def _mock_convert_tokens_to_ids(self, token):
-        token_id_map = {
-            "<|begin_of_sentence|>": 101,
-            "<|end_of_sentence|>": 102,
-            "</s>": 103,
-            "<|IMAGE_PLACEHOLDER|>": 1001,
-            "<|IMAGE_START|>": 1002,
-            "<|IMAGE_END|>": 1003,
-            "<|VIDEO_START|>": 1004,
-            "<|VIDEO_END|>": 1005,
-        }
-        return token_id_map.get(token, 999)
-
-    def test_prompt_token_ids2outputs_only_prompt_token_ids(self):
-        test_prompt_token_ids = [101, 999, 998, 997, 102]
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-        }
-        request = Request.from_dict(request)
-
-        outputs = self.data_processor.prompt_token_ids2outputs(request)
-
-        prompt_len = len(test_prompt_token_ids)
-
-        self.assertEqual(
-            outputs["input_ids"],
-            test_prompt_token_ids,
-            f"input_ids mismatch: actual {outputs['input_ids']}, expected {test_prompt_token_ids}",
-        )
-
-        self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len)
-
-        expected_position_ids = [[i] * 3 for i in range(prompt_len)]
-        self.assertEqual(outputs["position_ids"], expected_position_ids)
-
-        self.assertEqual(outputs["cur_position"], prompt_len)
-
-        self.assertEqual(len(outputs["images"]), 0)
-        self.assertEqual(len(outputs["grid_thw"]), 0)
-        self.assertEqual(len(outputs["mm_positions"]), 0)
-        self.assertEqual(len(outputs["mm_hashes"]), 0)
-        self.assertEqual(outputs["video_cnt"], 0)
-        self.assertEqual(outputs["num_input_image_tokens"], 0)
-        self.assertEqual(outputs["num_input_video_tokens"], 0)
-
-    def test_prompt_token_ids2outputs_with_messages_no_mm(self):
-        test_prompt_token_ids = [101, 999, 998, 997, 102]
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-            "messages": [{"role": "user", "content": "Hello World"}],
-        }
-        request = Request.from_dict(request)
-
-        self.data_processor.extract_mm_items.return_value = ([], [], [], [], None, [], [])
-
-        outputs = self.data_processor.prompt_token_ids2outputs(request)
-
-        prompt_len = len(test_prompt_token_ids)
-
-        self.assertEqual(outputs["input_ids"], test_prompt_token_ids)
-
-        self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len)
-
-        expected_position_ids = [[i] * 3 for i in range(prompt_len)]
-        self.assertEqual(outputs["position_ids"], expected_position_ids)
-
-        self.assertEqual(outputs["cur_position"], prompt_len)
-
-        self.assertEqual(len(outputs["images"]), 0)
-        self.assertEqual(outputs["video_cnt"], 0)
-        self.assertEqual(outputs["num_input_image_tokens"], 0)
-
-    def test_prompt_token_ids2outputs_add_image(self):
-        test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
-        mock_img = MagicMock()
-        mock_img.height = 224
-        mock_img.width = 224
-        mock_img.convert.return_value = mock_img
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-            "messages": [
-                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]}
-            ],
-        }
-        request = Request.from_dict(request)
-        self.data_processor.extract_mm_items.return_value = (
-            [mock_img],
-            [],
-            ["img_uuid"],
-            [],
-            None,
-            [],
-            [{"type": "image", "data": mock_img}],
-        )
-        mock_resize = (None, (2, 4))
-        self.data_processor.image_preprocessor.get_smarted_resize.return_value = mock_resize
-        mock_preprocess = {"pixel_values": np.random.randn(1, 16, 16, 3), "image_grid_thw": np.array([[2, 4]])}
-        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
-        # self.data_processor._compute_3d_positions = MagicMock(return_value=[[i]*3 for i in range(4)])
-        outputs = self.data_processor.prompt_token_ids2outputs(request)
-        self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102])
-        self.assertEqual(
-            outputs["token_type_ids"],
-            [
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["image"],
-                IDS_TYPE_FLAG["image"],
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["text"],
-            ],
-        )
-        self.assertEqual(len(outputs["position_ids"]), 6)
-        self.assertEqual(outputs["cur_position"], 6)
-        self.assertEqual(len(outputs["images"]), 1)
-        self.assertIsNotNone(outputs["images"][0])
-        self.assertEqual(outputs["num_input_image_tokens"], 2)
-        self.assertEqual(len(outputs["mm_positions"]), 1)
-        self.assertEqual(len(outputs["mm_hashes"]), 1)
-        self.assertEqual(len(outputs["grid_thw"]), 1)
-        self.assertEqual(len(outputs["image_type_ids"]), 1)
-
-    def test_prompt_token_ids2outputs_add_processed_image(self):
-        test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
-        mock_img_data = np.random.randn(8, 28, 28)
-        mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)})
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-            "messages": [
-                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]}
-            ],
-        }
-        request = Request.from_dict(request)
-        self.data_processor.extract_mm_items.return_value = (
-            [mock_img_cache],
-            [],
-            ["img_uuid"],
-            [],
-            None,
-            [],
-            [{"type": "image", "data": mock_img_cache}],
-        )
-        outputs = self.data_processor.prompt_token_ids2outputs(request)
-        self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102])
-        self.assertEqual(
-            outputs["token_type_ids"],
-            [
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["image"],
-                IDS_TYPE_FLAG["image"],
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["text"],
-            ],
-        )
-        self.assertEqual(len(outputs["position_ids"]), 20)
-        self.assertEqual(outputs["cur_position"], 8)
-        self.assertEqual(len(outputs["images"]), 1)
-        self.assertIsNotNone(outputs["images"][0])
-        self.assertEqual(len(outputs["mm_positions"]), 1)
-        self.assertEqual(outputs["mm_hashes"][0], "img_uuid")
-        self.assertEqual(len(outputs["grid_thw"]), 1)
-        self.assertEqual(len(outputs["image_type_ids"]), 1)
-
-    def test_prompt_token_ids2outputs_add_video(self):
-        test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]
-        mock_frame1 = MagicMock()
-        mock_frame1.height = 224
-        mock_frame1.width = 224
-        mock_frame1.convert.return_value = mock_frame1
-        mock_frame2 = MagicMock()
-        mock_frame2.height = 224
-        mock_frame2.width = 224
-        mock_frame2.convert.return_value = mock_frame2
-        frames = [mock_frame1, mock_frame2]
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-            "messages": [
-                {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]}
-            ],
-        }
-        request = Request.from_dict(request)
-        self.data_processor.extract_mm_items.return_value = (
-            [],
-            [frames],
-            [],
-            ["vid_uuid"],
-            None,
-            [],
-            [{"type": "video", "data": frames}],
-        )
-        self.data_processor._load_and_process_video = MagicMock(return_value=frames)
-        patches_h, patches_w = 4, 4
-        self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
-        mock_preprocess = {
-            "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3),
-            "video_grid_thw": np.array([[patches_h, patches_w]] * 2),
-        }
-        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
-        outputs = self.data_processor.prompt_token_ids2outputs(request)
-        self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102])
-        self.assertEqual(
-            outputs["token_type_ids"],
-            [
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["video"],
-                IDS_TYPE_FLAG["video"],
-                IDS_TYPE_FLAG["video"],
-                IDS_TYPE_FLAG["video"],
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["text"],
-            ],
-        )
-        self.assertEqual(len(outputs["position_ids"]), 8)
-        self.assertEqual(outputs["cur_position"], 6)
-        self.assertEqual(len(outputs["images"]), 1)
-        self.assertIsNotNone(outputs["images"][0])
-        self.assertEqual(len(outputs["mm_positions"]), 1)
-        self.assertEqual(outputs["mm_hashes"][0], "vid_uuid")
-        self.assertEqual(len(outputs["grid_thw"]), 1)
-        self.assertEqual(len(outputs["image_type_ids"]), 2)
-        self.assertEqual(outputs["num_input_video_tokens"], 4)
-
-    def test_prompt_token_ids2outputs_add_processed_video(self):
-        test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]
-        t, h, w = 2, 4, 4
-        spatial_conv_size = self.data_processor.spatial_conv_size
-        temporal_conv_size = self.data_processor.temporal_conv_size
-        token_per_frame = (h // spatial_conv_size) * (w // spatial_conv_size)
-        num_tokens = (t // temporal_conv_size) * token_per_frame
-        mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28)
-        mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)})
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-            "messages": [
-                {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]}
-            ],
-        }
-        request = Request.from_dict(request)
-        self.data_processor.extract_mm_items.return_value = (
-            [],
-            [mock_frames_cache],
-            [],
-            ["vid_uuid"],
-            None,
-            [],
-            [{"type": "video", "data": mock_frames_cache}],
-        )
-        outputs = self.data_processor.prompt_token_ids2outputs(request)
-        self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102])
-        self.assertEqual(
-            outputs["token_type_ids"],
-            [
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["video"],
-                IDS_TYPE_FLAG["video"],
-                IDS_TYPE_FLAG["video"],
-                IDS_TYPE_FLAG["video"],
-                IDS_TYPE_FLAG["text"],
-                IDS_TYPE_FLAG["text"],
-            ],
-        )
-        self.assertEqual(len(outputs["position_ids"]), 8)
-        self.assertEqual(outputs["cur_position"], 6)
-        self.assertEqual(len(outputs["images"]), 1)
-        self.assertIsNotNone(outputs["images"][0])
-        self.assertEqual(len(outputs["mm_positions"]), 1)
-        self.assertEqual(outputs["mm_hashes"][0], "vid_uuid")
-        self.assertEqual(len(outputs["grid_thw"]), 1)
-        self.assertEqual(len(outputs["image_type_ids"]), 2)
-
-    def test_prompt_token_ids2outputs_add_image_token_len_mismatch(self):
-        test_prompt_token_ids = [101, 1002, 1001, 1001, 1001, 1003, 102]
-        mock_img = MagicMock()
-        mock_img.height = 224
-        mock_img.width = 224
-        mock_img.convert.return_value = mock_img
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-            "messages": [
-                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]}
-            ],
-        }
-        request = Request.from_dict(request)
-        self.data_processor.extract_mm_items.return_value = (
-            [mock_img],
-            [],
-            ["img_uuid"],
-            [],
-            None,
-            [],
-            [{"type": "image", "data": mock_img}],
-        )
-        patches_h, patches_w = 8, 8
-        self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
-        mock_preprocess = {
-            "pixel_values": np.random.randn(1, patches_h, patches_w, 3),
-            "image_grid_thw": np.array([[patches_h, patches_w]]),
-        }
-        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
-        with self.assertRaises(ValueError) as ctx:
-            self.data_processor.prompt_token_ids2outputs(request)
-        self.assertIn("image tokens num not match the size", str(ctx.exception))
-
-    def test_prompt_token_ids2outputs_add_processed_image_token_len_mismatch(self):
-        test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
-        spatial_conv_size = self.data_processor.spatial_conv_size
-        num_tokens = 4
-        mock_img_data = np.random.randn(num_tokens * (spatial_conv_size**2), 28, 28)
-        mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)})
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-            "messages": [
-                {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]}
-            ],
-        }
-        request = Request.from_dict(request)
-        self.data_processor.extract_mm_items.return_value = (
-            [mock_img_cache],
-            [],
-            ["img_uuid"],
-            [],
-            None,
-            [],
-            [{"type": "image", "data": mock_img_cache}],
-        )
-        with self.assertRaises(ValueError) as ctx:
-            self.data_processor.prompt_token_ids2outputs(request)
-        self.assertIn("image tokens num not match the size", str(ctx.exception))
-
-    def test_prompt_token_ids2outputs_add_video_token_len_mismatch(self):
-        test_prompt_token_ids = [101, 1004, 1001, 1001, 1005, 102]
-        mock_frame1 = MagicMock()
-        mock_frame1.height = 224
-        mock_frame1.width = 224
-        mock_frame1.convert.return_value = mock_frame1
-        mock_frame2 = MagicMock()
-        mock_frame2.height = 224
-        mock_frame2.width = 224
-        mock_frame2.convert.return_value = mock_frame2
-        frames = [mock_frame1, mock_frame2]
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-            "messages": [
-                {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]}
-            ],
-        }
-        request = Request.from_dict(request)
-        self.data_processor.extract_mm_items.return_value = (
-            [],
-            [frames],
-            [],
-            ["vid_uuid"],
-            None,
-            [],
-            [{"type": "video", "data": frames}],
-        )
-        self.data_processor._load_and_process_video = MagicMock(return_value=frames)
-        patches_h, patches_w = 8, 8
-        self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
-        mock_preprocess = {
-            "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3),
-            "video_grid_thw": np.array([[patches_h, patches_w]] * 2),
-        }
-        self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
-        with self.assertRaises(ValueError) as ctx:
-            self.data_processor.prompt_token_ids2outputs(request)
-        self.assertIn("video tokens num not match the size", str(ctx.exception))
-
-    def test_prompt_token_ids2outputs_add_processed_video_token_len_mismatch(self):
-        test_prompt_token_ids = [101, 1004, 1001, 1005, 102]
-        t, h, w = 2, 8, 8
-        spatial_conv_size = self.data_processor.spatial_conv_size
-        temporal_conv_size = self.data_processor.temporal_conv_size
-
-        num_tokens = 4
-        mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28)
-        mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)})
-        request = {
-            "request_id": "test_0",
-            "prompt_token_ids": test_prompt_token_ids,
-            "messages": [
-                {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]}
-            ],
-        }
-        request = Request.from_dict(request)
-        self.data_processor.extract_mm_items.return_value = (
-            [],
-            [mock_frames_cache],
-            [],
-            ["vid_uuid"],
-            None,
-            [],
-            [{"type": "video", "data": mock_frames_cache}],
-        )
-        with self.assertRaises(ValueError) as ctx:
-            self.data_processor.prompt_token_ids2outputs(request)
-        self.assertIn("video tokens num not match the size", str(ctx.exception))
-
-    def test_extract_mm_items(self):
-        """Test extract_mm_items with various scenarios: basic items, video, and missing data error"""
-        self._restore_real_extract_mm_items()
-
-        # Test basic multimodal items (image + video)
-        request = {
-            "request_id": "test_0",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "Hello"},
-                        {"type": "image", "data": Image.new("RGB", (224, 224)), "uuid": "img1"},
-                        {"type": "video", "data": [Image.new("RGB", (224, 224))], "uuid": "vid1"},
-                    ],
-                }
-            ],
-        }
-        request = Request.from_dict(request)
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
-            mock_parse.return_value = request.messages
-            images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = (
-                self.data_processor.extract_mm_items(request)
-            )
-            self.assertEqual(len(images), 1)
-            self.assertEqual(len(videos), 1)
-            self.assertEqual(image_uuid[0], "img1")
-            self.assertEqual(video_uuid[0], "vid1")
-            self.assertEqual(len(mm_items), 2)
-
-        # Test missing data error when cache is disabled
-        self.data_processor.enable_processor_cache = False
-        request = {
-            "request_id": "test_0",
-            "messages": [{"role": "user", "content": [{"type": "image", "uuid": "img1"}]}],
-        }
-        request = Request.from_dict(request)
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
-            mock_parse.return_value = request.messages
-            with self.assertRaises(ValueError) as ctx:
-                self.data_processor.extract_mm_items(request)
-            self.assertIn("Missing items cannot be retrieved", str(ctx.exception))
-
-
-class TestDataProcessor(unittest.TestCase):
-    def setUp(self):
-        """Set up test environment"""
-        self.mock_tokenizer = MagicMock()
-
-        def mock_convert_tokens_to_ids(x):
-            if isinstance(x, list):
-                return [hash(str(token)) % 10000 for token in x]
-            return hash(str(x)) % 10000
-
-        self.mock_tokenizer.convert_tokens_to_ids = MagicMock(side_effect=mock_convert_tokens_to_ids)
-        self.mock_tokenizer.encode = MagicMock(return_value={"input_ids": [1, 2, 3]})
-        self.mock_tokenizer.decode = MagicMock(return_value="decoded_text")
-        self.mock_tokenizer.tokenize = MagicMock(return_value=["token1", "token2"])
-        self.mock_tokenizer.ignored_index = -100
-        self.mock_tokenizer.chat_template = MagicMock()
-        self.mock_tokenizer.apply_chat_template = MagicMock(return_value="formatted_prompt")
-
-        self.mock_image_preprocessor = MagicMock()
-        self.mock_image_preprocessor.get_smarted_resize = MagicMock(return_value=((224, 224), (16, 16)))
-        self.mock_image_preprocessor.preprocess = MagicMock(
-            return_value={
-                "pixel_values": np.random.rand(256, 3 * 14 * 14).astype(np.float32),
-                "image_grid_thw": np.array([[1, 16, 16]]),
-            }
-        )
-        self.mock_image_preprocessor.from_pretrained = MagicMock(return_value=self.mock_image_preprocessor)
-
-        with patch(
-            "fastdeploy.input.v1.ernie4_5_vl_processor.process.AdaptiveImageProcessor",
-            self.mock_image_preprocessor,
-        ):
-            with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.Ernie4_5Tokenizer") as mock_tokenizer_class:
-                mock_tokenizer_class.from_pretrained = MagicMock(return_value=self.mock_tokenizer)
-                mock_tokenizer_class.resource_files_names = {"vocab_file": "tokenizer.model"}
-                with patch("os.path.exists", return_value=True):
-                    self.processor = DataProcessor(
-                        tokenizer_name="test_model",
-                        image_preprocessor_name="test_model",
-                    )
-
-    def _create_outputs(self):
-        """Helper to create outputs dict"""
-        return {
-            "input_ids": [],
-            "token_type_ids": [],
-            "position_ids": [],
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "mm_positions": [],
-            "mm_hashes": [],
-            "cur_position": 0,
-            "num_input_image_tokens": 0,
-            "num_input_video_tokens": 0,
-        }
-
-    def _mock_video_processing(self, mock_frames=None):
-        """Helper to mock video processing"""
-        if mock_frames is None:
-            mock_frames = [Image.new("RGB", (224, 224)) for _ in range(4)]
-        mock_read = patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.read_video_decord")
-        mock_frames_read = patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.read_frames_decord")
-        mock_render = patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.render_frame_timestamp")
-        return mock_read, mock_frames_read, mock_render, mock_frames
-
-    def _setup_video_mocks(self, mock_read, mock_frames_read, mock_render, mock_frames):
-        """Setup video processing mocks"""
-        mock_read.return_value = (None, {"duration": 2.0}, "test_path")
-        mock_frames_read.return_value = (
-            [np.array(f) for f in mock_frames],
-            None,
-            [0.0, 0.5, 1.0, 1.5] if len(mock_frames) == 4 else [float(i) * 0.5 for i in range(len(mock_frames))],
-        )
-        mock_render.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img)
-        self.mock_image_preprocessor.preprocess.return_value = {
-            "pixel_values_videos": np.random.rand(len(mock_frames), 256, 3 * 14 * 14).astype(np.float32),
-            "video_grid_thw": np.array([[len(mock_frames), 16, 16]]),
-        }
-
-    def test_train_and_eval(self):
-        """Test training and evaluation mode switching"""
-        self.assertTrue(self.processor.is_training)
-        self.processor.eval()
-        self.assertFalse(self.processor.is_training)
-        self.processor.train()
-        self.assertTrue(self.processor.is_training)
-
-    def test_build_token_type_mapping(self):
-        """Test token type mapping construction"""
-        mapping = self.processor._build_token_type_mapping()
-        for token in [
-            self.processor.IMG_START,
-            self.processor.IMG_END,
-            self.processor.VID_START,
-            self.processor.VID_END,
-        ]:
-            self.assertEqual(mapping[token], IDS_TYPE_FLAG["image"])
-        self.assertEqual(mapping[self.processor.image_patch_id], IDS_TYPE_FLAG["image"])
-
-    def test_add_text_and_special_token(self):
-        """Test adding text and special tokens"""
-        outputs = self._create_outputs()
-        self.processor._add_text("hello", outputs)
-        self.assertEqual(len(outputs["input_ids"]), 3)
-        self.assertEqual(outputs["cur_position"], 3)
-
-        outputs2 = self._create_outputs()
-        self.processor._add_text([1, 2, 3, 4, 5], outputs2)
-        self.assertEqual(len(outputs2["input_ids"]), 5)
-
-        outputs3 = self._create_outputs()
-        self.processor._add_special_token("<|begin_of_sentence|>", outputs3)
-        self.processor._add_special_token(12345, outputs3)
-        self.assertEqual(len(outputs3["input_ids"]), 2)
-
-    def test_compute_3d_positions(self):
-        """Test 3D position computation"""
-        pos_ids = self.processor._compute_3d_positions(t=2, h=16, w=16, start_idx=10)
-        self.assertIsInstance(pos_ids, list)
-        self.assertGreater(len(pos_ids), 0)
-        self.assertEqual(len(pos_ids[0]), 3)
-
-        pos_ids2 = self.processor._compute_3d_positions(t=1, h=16, w=16, start_idx=0)
-        expected_len = 1 * (16 // self.processor.spatial_conv_size) ** 2
-        self.assertEqual(len(pos_ids2), expected_len)
-
-    def test_set_video_frame_args_comprehensive(self):
-        """Test _set_video_frame_args with various scenarios"""
-        # Valid cases
-        result = self.processor._set_video_frame_args(
-            {
-                "target_frames": 32,
-                "fps": -1,
-                "min_frames": 16,
-                "max_frames": 64,
-                "frames_sample": "leading",
-            },
-            {"duration": 10.0},
-        )
-        self.assertEqual(result["target_frames"], 32)
-
-        result = self.processor._set_video_frame_args(
-            {
-                "target_frames": -1,
-                "fps": 2,
-                "min_frames": 16,
-                "max_frames": 64,
-                "frames_sample": "leading",
-            },
-            {"duration": 10.0},
-        )
-        self.assertIsNotNone(result)
-
-        # Error cases
-        with self.assertRaises(ValueError):
-            self.processor._set_video_frame_args(
-                {
-                    "target_frames": -1,
-                    "fps": -1,
-                    "min_frames": 16,
-                    "max_frames": 64,
-                    "frames_sample": "leading",
-                },
-                {"duration": 10.0},
-            )
-        with self.assertRaises(ValueError):
-            self.processor._set_video_frame_args(
-                {
-                    "target_frames": 10,
-                    "fps": 2,
-                    "min_frames": 1,
-                    "max_frames": 100,
-                    "frames_sample": "leading",
-                },
-                {"duration": 10.0},
-            )
-        with self.assertRaises(ValueError):
-            self.processor._set_video_frame_args(
-                {
-                    "target_frames": 5,
-                    "fps": -1,
-                    "min_frames": 10,
-                    "max_frames": 100,
-                    "frames_sample": "leading",
-                },
-                {"duration": 10.0},
-            )
-        with self.assertRaises(ValueError):
-            self.processor._set_video_frame_args(
-                {
-                    "target_frames": 200,
-                    "fps": -1,
-                    "min_frames": 1,
-                    "max_frames": 100,
-                    "frames_sample": "leading",
-                },
-                {"duration": 10.0},
-            )
-        with self.assertRaises(ValueError):
-            self.processor._set_video_frame_args(
-                {
-                    "target_frames": -1,
-                    "fps": 2,
-                    "min_frames": 100,
-                    "max_frames": 10,
-                    "frames_sample": "leading",
-                },
-                {"duration": 10.0},
-            )
-
-        # Adjustment cases
-        result = self.processor._set_video_frame_args(
-            {
-                "target_frames": -1,
-                "fps": 1,
-                "min_frames": 10,
-                "max_frames": 100,
-                "frames_sample": "leading",
-            },
-            {"duration": 1.0},
-        )
-        self.assertEqual(result["target_frames"], 10)
-        self.assertEqual(result["fps"], -1)
-
-        result = self.processor._set_video_frame_args(
-            {
-                "target_frames": -1,
-                "fps": 10,
-                "min_frames": 1,
-                "max_frames": 100,
-                "frames_sample": "leading",
-            },
-            {"duration": 100.0},
-        )
-        self.assertEqual(result["target_frames"], 100)
-        self.assertEqual(result["fps"], -1)
-
-    def test_text2ids_comprehensive(self):
-        """Test text2ids with various scenarios"""
-        # Text only
-        outputs = self.processor.text2ids("Hello world")
-        self.assertIn("input_ids", outputs)
-        self.assertEqual(len(outputs["images"]), 0)
-
-        # Empty text
-        outputs = self.processor.text2ids("")
-        self.assertEqual(len(outputs["input_ids"]), 0)
-
-        # With image placeholder
-        mock_image = Image.new("RGB", (224, 224))
-        outputs = self.processor.text2ids("Hello <|image@placeholder|> world", images=[mock_image])
-        self.assertGreater(len(outputs["input_ids"]), 0)
-        self.assertGreater(len(outputs["images"]), 0)
-
-        # With cached image
-        cached_image = (
-            np.random.rand(256, 3 * 14 * 14).astype(np.float32),
-            {"thw": (1, 16, 16)},
-        )
-        outputs = self.processor.text2ids(
-            "Hello <|image@placeholder|> world",
-            images=[cached_image],
-            image_uuid=["uuid"],
-        )
-        self.assertGreater(len(outputs["input_ids"]), 0)
-
-        # Multiple images
-        outputs = self.processor.text2ids(
-            "Hello <|image@placeholder|> world <|image@placeholder|> end",
-            images=[mock_image, mock_image],
-        )
-        self.assertEqual(len(outputs["images"]), 2)
-
-        # With video placeholder
-        mock_read, mock_frames_read, mock_render, mock_frames = self._mock_video_processing()
-        with mock_read as mr, mock_frames_read as mfr, mock_render as mren:
-            mr.return_value = (None, {"duration": 2.0}, "test_path")
-            mfr.return_value = (
-                [np.array(f) for f in mock_frames],
-                None,
-                [0.0, 0.5, 1.0, 1.5],
-            )
-            mren.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img)
-            self.mock_image_preprocessor.preprocess.return_value = {
-                "pixel_values_videos": np.random.rand(4, 256, 3 * 14 * 14).astype(np.float32),
-                "video_grid_thw": np.array([[4, 16, 16]]),
-            }
-            outputs = self.processor.text2ids("Hello <|video@placeholder|> world", videos=["test_video.mp4"])
-            self.assertGreater(len(outputs["input_ids"]), 0)
-
-        # Cached video
-        cached_video = (
-            np.random.rand(256, 3 * 14 * 14).astype(np.float32),
-            {"thw": (4, 16, 16)},
-        )
-        outputs = self.processor.text2ids(
-            "Hello <|video@placeholder|> world",
-            videos=[cached_video],
-            video_uuid=["uuid"],
-        )
-        self.assertGreater(len(outputs["input_ids"]), 0)
-
-        # Video dict format
-        mock_read, mock_frames_read, mock_render, mock_frames = self._mock_video_processing()
-        with mock_read as mr, mock_frames_read as mfr, mock_render as mren:
-            mr.return_value = (None, {"duration": 2.0}, "test_path")
-            mfr.return_value = (
-                [np.array(f) for f in mock_frames],
-                None,
-                [0.0, 0.5, 1.0, 1.5],
-            )
-            mren.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img)
-            self.mock_image_preprocessor.preprocess.return_value = {
-                "pixel_values_videos": np.random.rand(4, 256, 3 * 14 * 14).astype(np.float32),
-                "video_grid_thw": np.array([[4, 16, 16]]),
-            }
-            outputs = self.processor.text2ids(
-                "Hello <|video@placeholder|> world",
-                videos=[{"video": "test.mp4", "fps": 2}],
-            )
-            self.assertGreater(len(outputs["input_ids"]), 0)
-
-        # Image and video together
-        mock_read, mock_frames_read, mock_render, mock_frames = self._mock_video_processing()
-        with mock_read as mr, mock_frames_read as mfr, mock_render as mren:
-            mr.return_value = (None, {"duration": 2.0}, "test_path")
-            mfr.return_value = (
-                [np.array(f) for f in mock_frames],
-                None,
-                [0.0, 0.5, 1.0, 1.5],
-            )
-            mren.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img)
-            self.mock_image_preprocessor.preprocess.side_effect = [
-                {
-                    "pixel_values": np.random.rand(256, 3 * 14 * 14).astype(np.float32),
-                    "image_grid_thw": np.array([[1, 16, 16]]),
-                },
-                {
-                    "pixel_values_videos": np.random.rand(4, 256, 3 * 14 * 14).astype(np.float32),
-                    "video_grid_thw": np.array([[4, 16, 16]]),
-                },
-            ]
-            outputs = self.processor.text2ids(
-                "Hello <|image@placeholder|> world <|video@placeholder|> end",
-                images=[mock_image],
-                videos=["test_video.mp4"],
-            )
-            self.assertGreater(len(outputs["input_ids"]), 0)
-            self.mock_image_preprocessor.preprocess.side_effect = None
-
-    def test_request2ids_comprehensive(self):
-        """Test request2ids with various scenarios"""
-        self.processor.is_training = False
-
-        # Basic request with multimodal content - covers both text and image branches in one call
-        mock_image = Image.new("RGB", (224, 224))
-        request = {
-            "request_id": "test_0",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "What's in this image?"},
-                        {"type": "image", "data": mock_image, "uuid": "img1"},
-                    ],
-                }
-            ],
-            "add_generation_prompt": True,
-        }
-        request = Request.from_dict(request)
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
-            mock_parse.return_value = request.messages
-            outputs = self.processor.request2ids(request)
-            self.assertIn("input_ids", outputs)
-
-        # Error case: missing chat_template
-        self.processor.tokenizer.chat_template = None
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
-            mock_parse.return_value = [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}]
-            with self.assertRaises(ValueError):
-                self.processor.request2ids(request)
-        self.processor.tokenizer.chat_template = MagicMock()
-
-        # Error case: unsupported role
-        request = {
-            "request_id": "test_0",
-            "messages": [{"role": "invalid_role", "content": "Hello"}],
-            "add_generation_prompt": True,
-        }
-        request = Request.from_dict(request)
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
-            mock_parse.return_value = [{"role": "invalid_role", "content": [{"type": "text", "text": "Hello"}]}]
-            with self.assertRaises(AssertionError):
-                self.processor.request2ids(request)
-
-        # Error case: missing cache when cache is disabled
-        self.processor.enable_processor_cache = False
-        request = {
-            "request_id": "test_0",
-            "messages": [{"role": "user", "content": [{"type": "image", "uuid": "img1"}]}],
-        }
-        request = Request.from_dict(request)
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
-            mock_parse.return_value = request.messages
-            with self.assertRaises(ValueError):
-                self.processor.request2ids(request)
-
-    def test_extract_labels(self):
-        """Test label extraction"""
-        outputs = {"input_ids": [1, 2, 3, self.processor.sep_token_id, 4, 5], "labels": []}
-        self.processor.is_training = True
-        self.processor._extract_labels(outputs, ["target text"])
-        self.assertEqual(len(outputs["labels"]), len(outputs["input_ids"]))
-
-        # Multiple targets
-        outputs2 = {
-            "input_ids": [1, 2, 3, self.processor.sep_token_id, 4, 5, self.processor.sep_token_id, 6, 7],
-            "labels": [],
-        }
-        self.processor._extract_labels(outputs2, ["target1", "target2"])
-        self.assertEqual(len(outputs2["labels"]), len(outputs2["input_ids"]))
-
-        # Error case
-        outputs3 = {"input_ids": [1, 2, 3, self.processor.sep_token_id], "labels": []}
-        with self.assertRaises(AssertionError):
-            self.processor._extract_labels(outputs3, ["target1", "target2"])
-
-    def test_fancy_print(self):
-        """Test fancy_print function"""
-        from fastdeploy.input.v1.ernie4_5_vl_processor.process import fancy_print
-
-        test_cases = [
-            ([1, 2, 3, self.processor.image_patch_id, 4, 5], self.processor.image_patch_id, None),
-            (
-                [
-                    1,
-                    2,
-                    self.processor.image_patch_id,
-                    self.processor.image_patch_id,
-                    self.processor.image_patch_id,
-                    4,
-                    5,
-                ],
-                self.processor.image_patch_id,
-                "<|IMAGE@",
-            ),
-            ([1, 2, 3, 4, 5], self.processor.image_patch_id, None),
-        ]
-        for input_ids, image_patch_id, expected_contains in test_cases:
-            result = fancy_print(input_ids, self.mock_tokenizer, image_patch_id)
-            self.assertIsInstance(result, str)
-            if expected_contains:
-                self.assertIn(expected_contains, result)
-
-    def test_processor_cache_operations(self):
-        """Test processor cache get/update and request2ids with cache"""
-        # Test get_processor_cache
-        mock_socket = MagicMock()
-        mock_socket.recv_multipart = MagicMock(return_value=(b"", b"pickled_data"))
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.pickle") as mock_pickle:
-            mock_pickle.loads = MagicMock(return_value=[{"data": "cached_item"}])
-            result = self.processor.get_processor_cache(mock_socket, ["hash1", "hash2"])
-            self.assertEqual(len(result), 1)
-
-        # Test update_processor_cache
-        mock_socket2 = MagicMock()
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.pickle"):
-            self.processor.update_processor_cache(
-                mock_socket2,
-                ["hash1"],
-                [(np.array([1, 2, 3]), {"meta": "data"})],
-            )
-            mock_socket2.send_multipart.assert_called_once()
-
-        # Test request2ids with processor cache update
-        self.processor.is_training = False
-        self.processor.enable_processor_cache = True
-        mock_image = Image.new("RGB", (224, 224))
-        request = {
-            "request_id": "test_0",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "Hello"},
-                        {"type": "image", "data": mock_image, "uuid": "img1"},
-                    ],
-                }
-            ],
-            "add_generation_prompt": True,
-        }
-        request = Request.from_dict(request)
-        with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.zmq") as mock_zmq:
-            mock_context = MagicMock()
-            mock_socket = MagicMock()
-            mock_socket.recv_multipart = MagicMock(return_value=(b"", b"pickled_data"))
-            mock_context.socket.return_value = mock_socket
-            mock_zmq.Context.return_value = mock_context
-            with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
-                mock_parse.return_value = request.messages
-                with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.pickle") as mock_pickle:
-                    mock_pickle.loads = MagicMock(return_value=[])
-                    with patch.object(self.processor, "text2ids") as mock_text2ids:
-                        mock_text2ids.return_value = {
-                            "input_ids": [1, 2, 3],
-                            "token_type_ids": [0] * 3,
-                            "position_ids": [[i] * 3 for i in range(3)],
-                            "images": [np.random.rand(256, 3 * 14 * 14).astype(np.float32)],
-                            "grid_thw": [np.array([[1, 16, 16]])],
-                            "image_type_ids": [0],
-                            "cur_position": 3,
-                            "video_cnt": 0,
-                            "num_input_image_tokens": 0,
-                            "num_input_video_tokens": 0,
-                            "mm_positions": [],
-                            "mm_hashes": ["hash1"],
-                        }
-                        with patch.object(self.processor, "update_processor_cache") as mock_update:
-                            self.processor.request2ids(request)
-                            mock_update.assert_called_once()
-        self.processor.enable_processor_cache = False
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/input/v1/test_image_preprocessor_adaptive.py b/tests/input/v1/test_image_preprocessor_adaptive.py
deleted file mode 100644
index 5a15244d1fd..00000000000
--- a/tests/input/v1/test_image_preprocessor_adaptive.py
+++ /dev/null
@@ -1,499 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from unittest.mock import patch
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import (
-    AdaptiveImageProcessor,
-    ceil_by_factor,
-    floor_by_factor,
-    is_scaled_image,
-    make_batched_images,
-    make_batched_videos,
-    round_by_factor,
-    smart_resize,
-)
-
-
-class TestImagePreprocessorAdaptive(unittest.TestCase):
-    def setUp(self):
-        """Set up test environment"""
-        self.processor = AdaptiveImageProcessor(
-            min_pixels=56 * 56,
-            max_pixels=28 * 28 * 1280,
-            patch_size=14,
-            temporal_conv_size=2,
-            merge_size=2,
-        )
-
-    def test_init(self):
-        """Test initialization"""
-        self.assertEqual(self.processor.min_pixels, 56 * 56)
-        self.assertEqual(self.processor.max_pixels, 28 * 28 * 1280)
-        self.assertEqual(self.processor.patch_size, 14)
-        self.assertEqual(self.processor.temporal_conv_size, 2)
-        self.assertEqual(self.processor.merge_size, 2)
-
-    def test_set_pixels(self):
-        """Test setting pixels with valid and invalid values (lines 205-214)"""
-        # Test setting only min_pixels
-        self.processor.set_pixels(min_pixels=100, msg="test")
-        self.assertEqual(self.processor.min_pixels, 100)
-        self.assertEqual(self.processor.size["min_pixels"], 100)
-
-        # Test setting only max_pixels
-        self.processor.set_pixels(max_pixels=200, msg="test")
-        self.assertEqual(self.processor.max_pixels, 200)
-        self.assertEqual(self.processor.size["max_pixels"], 200)
-
-        # Test setting both
-        self.processor.set_pixels(min_pixels=150, max_pixels=250, msg="test")
-        self.assertEqual(self.processor.min_pixels, 150)
-        self.assertEqual(self.processor.max_pixels, 250)
-        self.assertEqual(self.processor.size["min_pixels"], 150)
-        self.assertEqual(self.processor.size["max_pixels"], 250)
-
-        # Invalid cases
-        with self.assertRaises(AssertionError):
-            self.processor.set_pixels(min_pixels=-1)
-        with self.assertRaises(AssertionError):
-            self.processor.set_pixels(max_pixels=0)
-
-    def test_get_smarted_resize(self):
-        """Test get_smarted_resize with default and custom pixels"""
-        height, width = 224, 224
-        # Test with default pixels
-        (resized_h, resized_w), (patches_h, patches_w) = self.processor.get_smarted_resize(height, width)
-        self.assertIsInstance(resized_h, int)
-        self.assertIsInstance(resized_w, int)
-        self.assertIsInstance(patches_h, int)
-        self.assertIsInstance(patches_w, int)
-        # Test with custom pixels
-        (resized_h, resized_w), (_, _) = self.processor.get_smarted_resize(
-            height, width, min_pixels=100, max_pixels=10000
-        )
-        self.assertIsInstance(resized_h, int)
-        self.assertIsInstance(resized_w, int)
-
-    def test_round_by_factor(self):
-        """Test round_by_factor with various cases"""
-        self.assertEqual(round_by_factor(100, 28), 112)  # 100/28 ≈ 3.57, round(3.57) = 4, 4*28 = 112
-        self.assertEqual(round_by_factor(50, 10), 50)
-        self.assertEqual(round_by_factor(55, 10), 60)
-        # Edge cases
-        self.assertEqual(round_by_factor(0, 14), 0)
-        self.assertEqual(round_by_factor(14, 14), 14)
-        self.assertEqual(round_by_factor(13, 14), 14)  # Round up
-        self.assertEqual(round_by_factor(15, 14), 14)  # Round down
-
-    def test_ceil_by_factor(self):
-        """Test ceil_by_factor with various cases"""
-        self.assertEqual(ceil_by_factor(100, 28), 112)  # ceil(100/28)*28 = ceil(3.57)*28 = 4*28 = 112
-        self.assertEqual(ceil_by_factor(50, 10), 50)
-        self.assertEqual(ceil_by_factor(55, 10), 60)
-        # Edge cases
-        self.assertEqual(ceil_by_factor(0, 14), 0)
-        self.assertEqual(ceil_by_factor(14, 14), 14)
-        self.assertEqual(ceil_by_factor(13, 14), 14)  # Ceil up
-        self.assertEqual(ceil_by_factor(15, 14), 28)  # Ceil up to next multiple
-
-    def test_floor_by_factor(self):
-        """Test floor_by_factor with various cases"""
-        self.assertEqual(floor_by_factor(100, 28), 84)  # floor(100/28)*28 = floor(3.57)*28 = 3*28 = 84
-        self.assertEqual(floor_by_factor(50, 10), 50)
-        self.assertEqual(floor_by_factor(55, 10), 50)
-        # Edge cases
-        self.assertEqual(floor_by_factor(0, 14), 0)
-        self.assertEqual(floor_by_factor(14, 14), 14)
-        self.assertEqual(floor_by_factor(13, 14), 0)  # Floor down
-        self.assertEqual(floor_by_factor(15, 14), 14)  # Floor down to multiple
-        self.assertEqual(floor_by_factor(28, 14), 28)  # Exact multiple
-
-    def test_smart_resize(self):
-        """Test smart_resize with various scenarios (lines 557-587)"""
-        # Basic functionality
-        height, width = 224, 224
-        new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
-        self.assertIsInstance(new_h, int)
-        self.assertIsInstance(new_w, int)
-        self.assertEqual(new_h % 28, 0)
-        self.assertEqual(new_w % 28, 0)
-
-        # High aspect ratio (height > width) - tests lines 557-563
-        height, width = 10000, 10  # aspect ratio > 200
-        new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
-        self.assertIsInstance(new_h, int)
-        self.assertIsInstance(new_w, int)
-        self.assertLessEqual(max(new_h, new_w) / min(new_h, new_w), 200)
-
-        # High aspect ratio (width > height) - tests lines 562-563
-        height, width = 10, 10000
-        new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
-        self.assertIsInstance(new_h, int)
-        self.assertIsInstance(new_w, int)
-        self.assertLessEqual(max(new_h, new_w) / min(new_h, new_w), 200)
-
-        # Too large - tests lines 575-578
-        height, width = 10000, 10000
-        new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
-        self.assertLessEqual(new_h * new_w, 28 * 28 * 1280)
-
-        # Too small - tests lines 579-582
-        height, width = 10, 10
-        new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
-        self.assertGreaterEqual(new_h * new_w, 56 * 56)
-
-        # Exceeds max_pixels with custom parameters
-        height, width = 10000, 10000
-        max_pixels = 10000
-        min_pixels = 1000
-        new_h, new_w = smart_resize(height, width, factor=14, min_pixels=min_pixels, max_pixels=max_pixels)
-        self.assertLessEqual(new_h * new_w, max_pixels)
-        self.assertGreaterEqual(new_h * new_w, min_pixels)
-
-        # Below min_pixels with custom parameters
-        height, width = 10, 10
-        min_pixels = 10000
-        max_pixels = 100000
-        new_h, new_w = smart_resize(height, width, factor=14, min_pixels=min_pixels, max_pixels=max_pixels)
-        self.assertGreaterEqual(new_h * new_w, min_pixels)
-        self.assertLessEqual(new_h * new_w, max_pixels)
-
-        # Invalid result (extreme parameters) - tests lines 584-585
-        with self.assertRaises(ValueError):
-            smart_resize(1, 1, factor=100000, min_pixels=100, max_pixels=1000)
-
-    def test_is_scaled_image(self):
-        """Test is_scaled_image with various image types"""
-        # uint8 image
-        image = np.array([[0, 255], [128, 200]], dtype=np.uint8)
-        self.assertFalse(is_scaled_image(image))
-        image = np.random.rand(224, 224, 3).astype(np.uint8) * 255
-        self.assertFalse(is_scaled_image(image))
-
-        # Scaled float image (values in [0, 1])
-        image = np.array([[0.0, 0.5], [0.3, 1.0]], dtype=np.float32)
-        self.assertTrue(is_scaled_image(image))
-        image = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
-        self.assertTrue(is_scaled_image(image))
-
-        # Unscaled float image (values > 1)
-        image = np.array([[0.0, 255.0], [128.0, 300.0]], dtype=np.float32)
-        self.assertFalse(is_scaled_image(image))
-        image = np.random.rand(224, 224, 3).astype(np.float32) * 255
-        self.assertFalse(is_scaled_image(image))
-
-        # Edge cases
-        image = np.array([[0.0, 1.0]], dtype=np.float32)
-        self.assertTrue(is_scaled_image(image))
-        image = np.array([[0.0, 1.1]], dtype=np.float32)
-        self.assertFalse(is_scaled_image(image))
-        image = np.array([[-0.1, 1.0]], dtype=np.float32)
-        self.assertFalse(is_scaled_image(image))
-
-    def test_make_batched_images(self):
-        """Test make_batched_images with various input types"""
-        # Single image
-        img = Image.new("RGB", (224, 224))
-        result = make_batched_images(img)
-        self.assertEqual(len(result), 1)
-        self.assertEqual(result[0], img)
-
-        # List of images
-        imgs = [Image.new("RGB", (224, 224)) for _ in range(3)]
-        result = make_batched_images(imgs)
-        self.assertEqual(len(result), 3)
-        self.assertEqual(result, imgs)
-
-        # Nested list
-        imgs = [[Image.new("RGB", (224, 224)) for _ in range(2)] for _ in range(2)]
-        result = make_batched_images(imgs)
-        self.assertEqual(len(result), 4)  # 2*2 = 4
-
-        # Invalid inputs
-        with self.assertRaises(ValueError) as context:
-            make_batched_images("invalid")
-        self.assertIn("Could not make batched images", str(context.exception))
-        with self.assertRaises(ValueError) as context:
-            make_batched_images([[1, 2, 3], [4, 5, 6]])
-        self.assertIn("Could not make batched images", str(context.exception))
-
-    def test_make_batched_videos(self):
-        """Test make_batched_videos with various input types"""
-        # List of images
-        imgs = [Image.new("RGB", (224, 224)) for _ in range(3)]
-        result = make_batched_videos(imgs)
-        self.assertEqual(len(result), 1)
-        self.assertEqual(len(result[0]), 3)
-
-        # Single image in list
-        img = Image.new("RGB", (224, 224))
-        result = make_batched_videos([img])
-        self.assertEqual(len(result), 1)
-        self.assertEqual(len(result[0]), 1)
-
-        # Nested list
-        imgs = [[Image.new("RGB", (224, 224)) for _ in range(2)] for _ in range(2)]
-        result = make_batched_videos(imgs)
-        self.assertEqual(len(result), 2)
-        self.assertEqual(len(result[0]), 2)
-
-        # 4D array (single)
-        video = np.random.rand(3, 224, 224, 3).astype(np.uint8)
-        result = make_batched_videos(video)
-        self.assertEqual(len(result), 1)
-        self.assertIsInstance(result[0], list)
-
-        # 4D array in list (lines 119-120)
-        videos = [np.random.rand(3, 224, 224, 3).astype(np.uint8)]
-        result = make_batched_videos(videos)
-        self.assertEqual(len(result), 1)
-        self.assertIsInstance(result[0], list)
-
-        # Invalid input
-        with self.assertRaises(ValueError) as context:
-            make_batched_videos("invalid")
-        self.assertIn("Could not make batched video", str(context.exception))
-
-    def test_preprocess_images(self):
-        """Test preprocess handling images"""
-        img = Image.new("RGB", (224, 224))
-        result = self.processor.preprocess(images=img)
-        self.assertIn("pixel_values", result)
-        self.assertIn("image_grid_thw", result)
-        # Verify pixel_values shape
-        pixel_values = result["pixel_values"]
-        self.assertIsInstance(pixel_values, np.ndarray)
-
-    def test_preprocess_videos(self):
-        """Test preprocess handling videos"""
-        frames = [Image.new("RGB", (224, 224)) for _ in range(4)]
-        result = self.processor.preprocess(images=None, videos=frames)
-        self.assertIn("pixel_values_videos", result)
-        self.assertIn("video_grid_thw", result)
-
-    def test_preprocess_invalid_images(self):
-        """Test preprocess handling invalid image"""
-        with self.assertRaises(ValueError):
-            self.processor.preprocess(images="invalid")
-
-    def test_preprocess_with_predetermined_grid_thw(self):
-        """Test preprocess using predetermined_grid_thw"""
-        img = Image.new("RGB", (224, 224))
-        # predetermined_grid_thw should be (h, w) format, not [1, h, w]
-        predetermined_grid_thw = [(16, 16)]  # For single image, should be (h, w) tuple
-        result = self.processor.preprocess(images=img, predetermined_grid_thw=predetermined_grid_thw)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_flags(self):
-        """Test preprocess with various flags disabled"""
-        img = Image.new("RGB", (224, 224))
-        # Test without resize
-        result = self.processor.preprocess(images=img, do_resize=False)
-        self.assertIn("pixel_values", result)
-        # Test without rescale
-        result = self.processor.preprocess(images=img, do_rescale=False)
-        self.assertIn("pixel_values", result)
-        # Test without normalize
-        result = self.processor.preprocess(images=img, do_normalize=False)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_custom_mean_std(self):
-        """Test preprocess using custom mean and std"""
-        img = Image.new("RGB", (224, 224))
-        # Test with simple custom mean/std
-        result = self.processor.preprocess(images=img, image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
-        self.assertIn("pixel_values", result)
-        # Test with ImageNet-style mean/std
-        result = self.processor.preprocess(
-            images=img, image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225]
-        )
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_do_convert_rgb(self):
-        """Test preprocess with do_convert_rgb=True (line 289)"""
-        img = Image.new("L", (224, 224))  # Grayscale image
-        result = self.processor.preprocess(images=img, do_convert_rgb=True)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_scaled_image_warning(self):
-        """Test warning for scaled image in preprocess (lines 294-298)"""
-        # Create a scaled image (values between 0-1)
-        img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
-        # Use patch to capture warning
-        with patch(
-            "fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger"
-        ) as mock_logger:
-            # Directly call _preprocess, pass scaled image
-            self.processor._preprocess(
-                [img_array],  # Pass scaled numpy array
-                do_rescale=True,
-                do_convert_rgb=False,
-            )
-            # Verify warning is called when is_scaled_image returns True and do_rescale is True
-            mock_logger.warning.assert_called()
-
-    def test_preprocess_invalid_images_check(self):
-        """Test invalid image check in preprocess (line 464)"""
-        # Test invalid image type - need to ensure valid_images returns False
-        # Use patch to make valid_images return False, but make_batched_images succeeds
-        with patch(
-            "fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images"
-        ) as mock_valid:
-            mock_valid.return_value = False
-            valid_images_list = [Image.new("RGB", (224, 224))]  # Valid image, but valid_images returns False
-            with self.assertRaises(ValueError) as context:
-                self.processor.preprocess(images=valid_images_list)
-            self.assertIn("Invalid image type", str(context.exception))
-
-    def test_preprocess_predetermined_grid_thw_multiple_images(self):
-        """Test preprocess with predetermined_grid_thw for multiple images (lines 307-310)"""
-        imgs = [Image.new("RGB", (224, 224)) for _ in range(2)]
-        predetermined_grid_thw = [(16, 16), (20, 20)]
-        result = self.processor.preprocess(images=imgs, predetermined_grid_thw=predetermined_grid_thw)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_predetermined_grid_thw_length_mismatch(self):
-        """Test preprocess with predetermined_grid_thw length mismatch (lines 307-310, 470)"""
-        imgs = [Image.new("RGB", (224, 224)) for _ in range(2)]
-        predetermined_grid_thw = [(16, 16)]  # Length mismatch - only 1 element for 2 images
-        # The function raises IndexError when accessing predetermined_grid_thw[img_idx] with img_idx=1
-        with self.assertRaises(IndexError):
-            self.processor.preprocess(images=imgs, predetermined_grid_thw=predetermined_grid_thw)
-
-    def test_preprocess_with_input_data_format(self):
-        """Test preprocess with input_data_format parameter (lines 299-301)"""
-        img = Image.new("RGB", (224, 224))
-        from paddleformers.transformers.image_utils import ChannelDimension
-
-        # Test with FIRST
-        result = self.processor.preprocess(images=img, input_data_format=ChannelDimension.FIRST)
-        self.assertIn("pixel_values", result)
-        # Test with None
-        result = self.processor.preprocess(images=img, input_data_format=None)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_do_resize_with_predetermined_grid_thw(self):
-        """Test preprocess with do_resize=True and predetermined_grid_thw (lines 314-317)"""
-        img = Image.new("RGB", (224, 224))
-        predetermined_grid_thw = [(16, 16)]
-        result = self.processor.preprocess(images=img, predetermined_grid_thw=predetermined_grid_thw, do_resize=True)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_videos_with_predetermined_grid_thw(self):
-        """Test preprocess videos with predetermined_grid_thw (lines 511)"""
-        frames = [Image.new("RGB", (224, 224)) for _ in range(4)]
-        predetermined_grid_thw = [(16, 16)] * 4
-        result = self.processor.preprocess(images=None, videos=frames, predetermined_grid_thw=predetermined_grid_thw)
-        self.assertIn("pixel_values_videos", result)
-
-    def test_preprocess_return_tensors(self):
-        """Test preprocess with return_tensors parameter (lines 396, 523)"""
-        img = Image.new("RGB", (224, 224))
-        # Use string instead of TensorType enum which may not be available
-        result = self.processor.preprocess(images=img, return_tensors="np")
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_do_rescale_false_with_scaled_image(self):
-        """Test preprocess with do_rescale=False and scaled image (line 335)"""
-        # Create a scaled image
-        img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5  # Values in [0, 0.5]
-        img = Image.fromarray((img_array * 255).astype(np.uint8))
-        result = self.processor.preprocess(images=img, do_rescale=False)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_custom_resample(self):
-        """Test preprocess with custom resample parameter (line 332)"""
-        img = Image.new("RGB", (224, 224))
-        from PIL import Image as PILImage
-
-        result = self.processor.preprocess(images=img, resample=PILImage.BILINEAR)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_custom_rescale_factor(self):
-        """Test preprocess with custom rescale_factor (line 336)"""
-        img = Image.new("RGB", (224, 224))
-        result = self.processor.preprocess(images=img, rescale_factor=1.0 / 128.0)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_data_format(self):
-        """Test preprocess with different data_format values"""
-        img = Image.new("RGB", (224, 224))
-        from paddleformers.transformers.image_utils import ChannelDimension
-
-        # Test with FIRST
-        result = self.processor.preprocess(images=img, data_format=ChannelDimension.FIRST)
-        self.assertIn("pixel_values", result)
-        # Test with LAST
-        result = self.processor.preprocess(images=img, data_format=ChannelDimension.LAST)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_multiple_images_loop(self):
-        """Test preprocess loop with multiple images (lines 312-348, 468-488)"""
-        images = [Image.new("RGB", (224, 224)) for _ in range(3)]
-        result = self.processor.preprocess(images=images)
-        self.assertIn("pixel_values", result)
-        self.assertIn("image_grid_thw", result)
-        pixel_values = result["pixel_values"]
-        self.assertIsInstance(pixel_values, np.ndarray)
-        self.assertEqual(len(pixel_values.shape), 2)  # Should be [grid_t * grid_h * grid_w, C * psz * psz]
-
-    def test_preprocess_videos_loop(self):
-        """Test preprocess with videos in loop (lines 496-521)"""
-        # Test with multiple videos
-        videos = [
-            [Image.new("RGB", (224, 224)) for _ in range(4)],
-            [Image.new("RGB", (224, 224)) for _ in range(4)],
-        ]
-        result = self.processor.preprocess(images=None, videos=videos)
-        self.assertIn("pixel_values_videos", result)
-        self.assertIn("video_grid_thw", result)
-        self.assertIsInstance(result["pixel_values_videos"], np.ndarray)
-        # Test with nested list format
-        videos = [[Image.new("RGB", (224, 224)) for _ in range(4)] for _ in range(2)]
-        result = self.processor.preprocess(images=None, videos=videos)
-        self.assertIn("pixel_values_videos", result)
-        self.assertIn("video_grid_thw", result)
-        self.assertIsInstance(result["pixel_values_videos"], np.ndarray)
-
-    def test_preprocess_both_images_and_videos(self):
-        """Test preprocess with both images and videos (lines 458-523)"""
-        images = [Image.new("RGB", (224, 224))]
-        videos = [[Image.new("RGB", (224, 224)) for _ in range(4)]]
-        result = self.processor.preprocess(images=images, videos=videos)
-        # Due to implementation, only video results are returned when both are provided
-        self.assertIn("pixel_values_videos", result)
-        self.assertIn("video_grid_thw", result)
-
-    def test_preprocess_invalid_images_check_list_input(self):
-        """Test preprocess with invalid images check (line 464)
-
-        Note: The error is raised by make_batched_images before valid_images check,
-        so the error message is different.
-        """
-        invalid_images = ["not an image", "also not an image"]
-
-        with self.assertRaises(ValueError) as context:
-            self.processor.preprocess(images=invalid_images)
-        self.assertIn("Could not make batched images", str(context.exception))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/input/v1/test_paddleocr_vl_processor.py b/tests/input/v1/test_paddleocr_vl_processor.py
deleted file mode 100644
index 3c1e83b42ff..00000000000
--- a/tests/input/v1/test_paddleocr_vl_processor.py
+++ /dev/null
@@ -1,1182 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import pickle
-import unittest
-from unittest.mock import ANY, MagicMock, patch
-
-import numpy as np
-import zmq
-from PIL import Image
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.paddleocr_vl_processor.image_processor import (
-    ImageProcessor,
-    smart_resize,
-)
-from fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor import (
-    PaddleOCRVLProcessor,
-)
-from fastdeploy.input.v1.paddleocr_vl_processor.process import DataProcessor
-from fastdeploy.input.v1.paddleocr_vl_processor.process_video import sample_frames
-
-MODULE_PATH = "fastdeploy.input.v1.paddleocr_vl_processor.process"
-
-
-class TestProcessVideo(unittest.TestCase):
-    def setUp(self):
-        self.metadata = {"num_of_frame": 100, "fps": 25}
-        self.frame_factor = 4
-        self.min_frames = 8
-        self.max_frames = 32
-
-    def test_sample_with_num_frames(self):
-        """测试使用num_frames参数采样（来自用户的原始测试）"""
-        num_frames = 16
-        indices = sample_frames(
-            frame_factor=self.frame_factor,
-            min_frames=self.min_frames,
-            max_frames=self.max_frames,
-            num_frames=num_frames,
-            fps=0,  # 确保 fps 不>0
-            metadata=self.metadata,
-        )
-        self.assertEqual(len(indices), 16)
-        self.assertEqual(indices[0], 0)
-        self.assertEqual(indices[-1], 93)
-        np.testing.assert_array_equal(indices, np.arange(0, 100, 100 / 16).astype(np.int32))
-
-    def test_error_num_frames_exceeds_total(self):
-        """测试 num_frames 超过总帧数的异常（来自用户的原始测试）"""
-        with self.assertRaises(ValueError) as context:
-            sample_frames(
-                frame_factor=self.frame_factor,
-                min_frames=self.min_frames,
-                max_frames=self.max_frames,
-                num_frames=200,  # 超过总帧数100
-                fps=0,
-                metadata=self.metadata,
-            )
-        self.assertIn("exceeds", str(context.exception))
-
-    def test_error_mutual_exclusion(self):
-        """新增：测试 num_frames 和 fps 互斥"""
-        with self.assertRaises(ValueError) as context:
-            sample_frames(
-                frame_factor=self.frame_factor,
-                min_frames=self.min_frames,
-                max_frames=self.max_frames,
-                num_frames=16,  # > 0
-                fps=10,  # > 0
-                metadata=self.metadata,
-            )
-        self.assertIn("mutually exclusive", str(context.exception))
-
-    def test_error_fps_without_metadata(self):
-        """新增：测试 fps > 0 但 metadata 为 None"""
-        with self.assertRaises(TypeError) as context:
-            sample_frames(
-                frame_factor=self.frame_factor,
-                min_frames=self.min_frames,
-                max_frames=self.max_frames,
-                num_frames=0,
-                fps=10,
-                metadata=None,  # 缺失
-            )
-        # 验证是预期的 TypeError
-        self.assertIn("'NoneType' object is not subscriptable", str(context.exception))
-
-    def test_num_frames_rounding(self):
-        """新增：测试 num_frames 向 frame_factor 舍入"""
-        num_frames = 17  # 不是 4 的倍数
-        # 逻辑: round(17 / 4) * 4 = round(4.25) * 4 = 4 * 4 = 16
-        indices = sample_frames(
-            frame_factor=self.frame_factor,
-            min_frames=self.min_frames,
-            max_frames=self.max_frames,
-            num_frames=num_frames,
-            fps=0,
-            metadata=self.metadata,
-        )
-        # 应舍入到 16
-        self.assertEqual(len(indices), 16)
-
-    def test_sample_with_fps_basic(self):
-        """新增：测试使用 fps 采样（基本路径，被 max_frames 限制）"""
-        # 逻辑: num_frames_calc = 100 / 25 * 10 = 40
-        #      num_frames_clamped = min(max(40, 8), 32) = 32
-        #      num_frames_factored = floor(32 / 4) * 4 = 32
-        indices = sample_frames(
-            frame_factor=self.frame_factor,
-            min_frames=self.min_frames,
-            max_frames=self.max_frames,
-            num_frames=0,
-            fps=10,
-            metadata=self.metadata,
-        )
-        # 应被 max_frames=32 限制
-        self.assertEqual(len(indices), 32)
-        self.assertEqual(indices[-1], 96)
-
-    def test_sample_with_fps_hits_min_frames(self):
-        """新增：测试使用 fps 采样（被 min_frames 限制）"""
-        # 逻辑: num_frames_calc = 100 / 25 * 1 = 4
-        #      num_frames_clamped = min(max(4, 8), 32) = 8
-        #      num_frames_factored = floor(8 / 4) * 4 = 8
-        indices = sample_frames(
-            frame_factor=self.frame_factor,
-            min_frames=self.min_frames,
-            max_frames=self.max_frames,
-            num_frames=0,
-            fps=1,
-            metadata=self.metadata,
-        )
-        # 应被 min_frames=8 限制
-        self.assertEqual(len(indices), 8)
-        self.assertEqual(indices[-1], 87)
-
-    def test_sample_with_fps_hits_total_frames(self):
-        """新增：测试使用 fps 采样（被 total_num_frames 限制）"""
-        local_max_frames = 200
-
-        # 逻辑: num_frames_calc = 100 / 25 * 50 = 200
-        #      num_frames_clamped = min(min(max(200, 8), 200), 100) = 100
-        #      num_frames_factored = floor(100 / 4) * 4 = 100
-        indices = sample_frames(
-            frame_factor=self.frame_factor,
-            min_frames=self.min_frames,
-            max_frames=local_max_frames,
-            num_frames=0,
-            fps=50,
-            metadata=self.metadata,
-        )
-        # 应被 total_num_frames=100 限制
-        self.assertEqual(len(indices), 100)
-        self.assertEqual(indices[-1], 99)  # 采样所有帧
-
-    def test_no_sampling(self):
-        """新增：测试不采样（fps=0, num_frames=0）"""
-        indices = sample_frames(
-            frame_factor=self.frame_factor,
-            min_frames=self.min_frames,
-            max_frames=self.max_frames,
-            num_frames=0,
-            fps=0,
-            metadata=self.metadata,
-        )
-        # 应返回所有帧
-        self.assertEqual(len(indices), self.metadata["num_of_frame"])
-        self.assertEqual(len(indices), 100)
-        self.assertEqual(indices[-1], 99)
-        np.testing.assert_array_equal(indices, np.arange(0, 100).astype(np.int32))
-
-
-class Test_DataProcessor(unittest.TestCase):
-    """
-    针对 process.py 中 DataProcessor 类的单元测试。
-    """
-
-    def setUp(self):
-
-        # 1. 手动启动 Patcher
-        patcher1 = patch(f"{MODULE_PATH}.AutoTokenizer.from_pretrained")
-        patcher2 = patch(f"{MODULE_PATH}.ImageProcessor.from_pretrained")
-        patcher_zmq_context = patch(f"{MODULE_PATH}.zmq.Context")
-
-        self.mock_auto_tokenizer_constructor = patcher1.start()
-        self.mock_image_processor_constructor = patcher2.start()
-        self.mock_zmq_context_constructor = patcher_zmq_context.start()
-
-        self.addCleanup(patcher1.stop)
-        self.addCleanup(patcher2.stop)
-        self.addCleanup(patcher_zmq_context.stop)
-
-        # 2. 创建模拟对象
-        self.mock_tokenizer = MagicMock()
-        self.mock_image_processor = MagicMock()
-        self.mock_zmq_context = MagicMock()
-        self.mock_zmq_socket = MagicMock()
-
-        # 3. 配置 from_pretrained 和 zmq
-        self.mock_auto_tokenizer_constructor.return_value = self.mock_tokenizer
-        self.mock_image_processor_constructor.return_value = self.mock_image_processor
-        self.mock_zmq_context_constructor.return_value = self.mock_zmq_context
-        self.mock_zmq_context.socket.return_value = self.mock_zmq_socket
-
-        # 4. 配置模拟对象的属性和方法
-        self._configure_mocks()
-
-        # 5. 实例化 DataProcessor (默认不启用 cache)
-        self.processor = DataProcessor(model_path="dummy_model_path")
-        self._configure_processor_ids()
-
-        # 6. 准备测试用的虚拟数据
-        self.dummy_image = Image.fromarray(np.uint8(np.random.rand(224, 224, 3) * 255))
-        self.dummy_video_frames = np.uint8(np.random.rand(16, 224, 224, 3) * 255)
-        self.dummy_video_data = "path/to/dummy_video.mp4"
-        self.dummy_processed_image_cache = (
-            np.random.rand(64, 3, 14, 14).astype(np.float32),
-            {"thw": (1, 8, 8), "fps": 0},
-        )
-        self.dummy_processed_video_cache = (
-            np.random.rand(256, 3, 14, 14).astype(np.float32),
-            {"thw": (4, 8, 8), "fps": 30},
-        )
-
-    def _configure_mocks(self):
-        def mock_convert_tokens_to_ids(tokens):
-            if tokens == "<|IMAGE_PLACEHOLDER|>":
-                return 100
-            if tokens == "<|video_pad|>":
-                return 101
-            if tokens == "<|IMAGE_START|>":
-                return 102
-            if isinstance(tokens, list):
-                if tokens == ["Hello", "world"]:
-                    return [983, 984]
-                if tokens == ["Prompt", "text"]:
-                    return [606, 511]
-                if tokens == ["Prompt", "", "text"]:
-                    return [606, 511]  # 模拟 "Prompt  text".split()
-                return [hash(t) % 1000 for t in tokens]
-            return hash(tokens) % 1000
-
-        self.mock_tokenizer.convert_tokens_to_ids.side_effect = mock_convert_tokens_to_ids
-        self.mock_tokenizer.tokenize.side_effect = lambda s: s.split()
-        self.mock_tokenizer.ignored_index = -100
-        self.mock_tokenizer.chat_template = "dummy_template_string"
-
-        self.mock_image_processor.merge_size = 2
-        self.mock_image_processor.temporal_patch_size = 1
-
-    def _configure_processor_ids(self):
-        self.processor.image_token_id = 100
-        self.processor.video_token_id = 101
-        self.processor.image_patch_id = 100
-        self.processor.vision_start_id = 102
-
-    def _get_init_outputs(self):
-        return {
-            "input_ids": [],
-            "token_type_ids": [],
-            "position_ids": [],
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "labels": [],
-            "cur_position": 0,
-            "video_cnt": 0,
-            "num_input_image_tokens": 0,
-            "num_input_video_tokens": 0,
-            "fps": [],
-            "mm_positions": [],
-            "mm_hashes": [],
-            "vit_seqlen": [],
-            "vit_position_ids": [],
-        }
-
-    def test_init(self):
-        """测试 DataProcessor 的初始化"""
-        self.mock_auto_tokenizer_constructor.assert_called_with("dummy_model_path", padding_side="left", use_fast=True)
-        self.mock_image_processor_constructor.assert_called_with("dummy_model_path")
-        self.assertEqual(self.processor.image_token, "<|IMAGE_PLACEHOLDER|>")
-        self.assertEqual(self.processor.video_token_id, 101)
-
-    def test_compute_text_positions(self):
-        """测试 _compute_text_positions 纯函数"""
-        pos_ids = self.processor._compute_text_positions(start_pos=5, num_tokens=3)
-        expected = np.array([[5, 6, 7], [5, 6, 7], [5, 6, 7]])
-        np.testing.assert_array_equal(pos_ids, expected)
-
-    def test_compute_vision_positions(self):
-        """测试 _compute_vision_positions 纯函数"""
-        pos_ids = self.processor._compute_vision_positions(start_pos=10, t=2, h=4, w=4, second_per_grid_t=1.0)
-        self.assertEqual(pos_ids.shape, (3, 8))
-        expected_t = np.array([0, 0, 0, 0, 2, 2, 2, 2])
-        expected_h = np.array([0, 0, 1, 1, 0, 0, 1, 1])
-        expected_w = np.array([0, 1, 0, 1, 0, 1, 0, 1])
-        expected = np.stack([expected_t, expected_h, expected_w]) + 10
-        np.testing.assert_array_equal(pos_ids, expected)
-
-    @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
-    def test_add_text(self):
-        """测试 _add_text 辅助函数"""
-        outputs = self._get_init_outputs()
-        self.mock_tokenizer.tokenize.return_value = ["Hello", "world"]
-        self.mock_tokenizer.convert_tokens_to_ids.side_effect = None
-        self.mock_tokenizer.convert_tokens_to_ids.return_value = [10, 11]
-
-        self.processor._add_text("Hello world", outputs)
-
-        self.assertEqual(outputs["input_ids"], [10, 11])
-        self.assertEqual(outputs["token_type_ids"], [0, 0])
-        self.assertEqual(outputs["cur_position"], 2)
-
-    @patch(f"{MODULE_PATH}.MultimodalHasher.hash_features", return_value="dummy_hash_123")
-    @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
-    def test_add_image_autohash(self, mock_hasher):
-        """测试 _add_image 辅助函数 (自动哈希)"""
-        outputs = self._get_init_outputs()
-        outputs["cur_position"] = 5
-
-        num_patches_hw = 8 * 8
-        num_tokens = 16
-        mock_preprocess_return = {
-            "pixel_values": np.random.rand(num_patches_hw, 3, 14, 14),
-            "grid_thw": np.array([1, 8, 8]),
-        }
-        self.mock_image_processor.preprocess.return_value = mock_preprocess_return
-
-        self.processor._add_image(self.dummy_image, outputs, uuid=None)
-
-        self.assertEqual(len(outputs["input_ids"]), num_tokens)
-        self.assertEqual(outputs["num_input_image_tokens"], num_tokens)
-        mock_hasher.assert_called_once_with(mock_preprocess_return["pixel_values"])
-        self.assertEqual(outputs["mm_hashes"][0], "dummy_hash_123")
-        self.assertEqual(outputs["cur_position"], 9)
-
-    @patch(f"{MODULE_PATH}.MultimodalHasher.hash_features")
-    @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
-    def test_add_video_with_uuid(self, mock_hasher):
-        """测试 _add_video 辅助函数 (使用 uuid)"""
-        outputs = self._get_init_outputs()
-        outputs["cur_position"] = 10
-        meta = {"fps": 30}
-
-        num_patches_total = 256
-        num_tokens = 64
-
-        mock_preprocess_return = {
-            "pixel_values": np.random.rand(num_patches_total, 3, 14, 14),
-            "image_grid_thw": np.array([4, 8, 8]),
-        }
-        self.mock_image_processor.preprocess.return_value = mock_preprocess_return
-
-        self.processor._add_video(self.dummy_video_frames, meta, outputs, uuid="custom_vid_uuid")
-
-        self.assertEqual(len(outputs["input_ids"]), num_tokens)
-        self.assertEqual(outputs["token_type_ids"], [2] * num_tokens)
-        mock_hasher.assert_not_called()
-        self.assertEqual(outputs["mm_hashes"][0], "custom_vid_uuid")
-        self.assertEqual(outputs["image_type_ids"], [1, 1, 1, 1])
-
-    @patch.object(DataProcessor, "_add_text", MagicMock())
-    @patch.object(DataProcessor, "_add_image", MagicMock())
-    @patch.object(DataProcessor, "_add_video", MagicMock())
-    @patch.object(DataProcessor, "_load_and_process_video")
-    def test_text2ids_parsing(self, mock_load_video):
-        """测试 text2ids 的解析和分支逻辑"""
-        mock_load_video.return_value = (self.dummy_video_frames, {"fps": 30})
-        text = "Text1 <|IMAGE_PLACEHOLDER|> Text2 <|video_pad|> Text3"
-        images = [self.dummy_image]
-        videos = [self.dummy_video_data]
-        image_uuid = ["img_uuid_1"]
-        video_uuid = ["vid_uuid_1"]
-
-        outputs = self.processor.text2ids(text, images, videos, image_uuid, video_uuid)
-
-        self.processor._add_text.assert_any_call("Text1 ", outputs)
-        self.processor._add_image.assert_called_once_with(self.dummy_image, outputs, "img_uuid_1")
-        self.processor._add_video.assert_called_once_with(self.dummy_video_frames, {"fps": 30}, outputs, "vid_uuid_1")
-
-    @patch(f"{MODULE_PATH}.parse_chat_messages")
-    @patch.object(DataProcessor, "text2ids", return_value="final_output")
-    def test_request2ids(self, mock_text2ids, mock_parse_chat):
-        """测试 request2ids 的 chat 模板逻辑"""
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "Hello"},
-                    {"type": "image", "data": self.dummy_image, "uuid": "img1"},
-                ],
-            }
-        ]
-        request = {"request_id": "test_0", "messages": messages, "add_generation_prompt": True}
-        request = Request.from_dict(request)
-        mock_parse_chat.return_value = messages
-        parsed_prompt = "User: Hello <|IMAGE_PLACEHOLDER|> Assistant:"
-        self.mock_tokenizer.apply_chat_template.return_value = parsed_prompt
-
-        result = self.processor.request2ids(request)
-
-        self.mock_tokenizer.apply_chat_template.assert_called_once()
-        mock_text2ids.assert_called_once_with(parsed_prompt, [self.dummy_image], [], ["img1"], [])
-        self.assertEqual(result, "final_output")
-
-    @patch(f"{MODULE_PATH}.sample_frames")
-    @patch(f"{MODULE_PATH}.read_video_decord")
-    def test_load_and_process_video(self, mock_read_video, mock_sample_frames):
-        """测试 _load_and_process_video 的帧采样逻辑"""
-        mock_reader = MagicMock()
-        mock_reader.__getitem__.return_value.asnumpy.return_value = np.random.randint(
-            0, 255, (100, 100, 3), dtype=np.uint8
-        )
-        mock_meta = {"num_of_frame": 100, "duration": 10.0, "fps": 10.0}
-        mock_read_video.return_value = (mock_reader, mock_meta, None)
-        mock_sample_frames.return_value = [0, 10, 20, 30, 40]
-        self.processor.fps = 1
-
-        frames, meta = self.processor._load_and_process_video("dummy_url", {"min_frames": 2, "max_frames": 10})
-
-        mock_sample_frames.assert_called_once_with(
-            frame_factor=ANY,
-            min_frames=2,
-            max_frames=10,
-            metadata=mock_meta,
-            fps=self.processor.fps,
-            num_frames=self.processor.target_frames,
-        )
-        self.assertEqual(frames.shape, (5, 100, 100, 3))
-        self.assertEqual(meta["fps"], 1)
-
-    def test_init_with_external_tokenizer(self):
-        """新增：测试使用外部传入的 tokenizer 初始化"""
-        self.mock_auto_tokenizer_constructor.reset_mock()
-
-        external_tokenizer = MagicMock()
-        processor = DataProcessor(model_path="dummy", tokenizer=external_tokenizer)
-
-        self.mock_auto_tokenizer_constructor.assert_not_called()
-        self.assertIs(processor.tokenizer, external_tokenizer)
-
-    def test_add_text_empty(self):
-        """新增：测试 _add_text 传入空字符串"""
-        outputs = self._get_init_outputs()
-        self.processor._add_text("", outputs)
-        self.assertEqual(outputs["input_ids"], [])
-        self.assertEqual(outputs["cur_position"], 0)
-
-    @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0})
-    def test_add_text_pre_tokenized(self):
-        """新增：测试 _add_text 传入已 tokenized 的 IDs"""
-        outputs = self._get_init_outputs()
-        token_ids = [10, 11, 12]
-        self.processor._add_text(token_ids, outputs)
-
-        self.mock_tokenizer.tokenize.assert_not_called()
-        self.assertEqual(outputs["input_ids"], [10, 11, 12])
-        self.assertEqual(outputs["token_type_ids"], [0, 0, 0])
-        self.assertEqual(outputs["cur_position"], 3)
-
-    @patch(f"{MODULE_PATH}.MultimodalHasher.hash_features", return_value="dummy_hash_456")
-    @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
-    def test_add_video_no_uuid(self, mock_hasher):
-        """新增：测试 _add_video 在 uuid 为 None 时自动哈希"""
-        outputs = self._get_init_outputs()
-        meta = {"fps": 30}
-        mock_preprocess_return = {
-            "pixel_values": np.random.rand(256, 3, 14, 14),
-            "image_grid_thw": np.array([4, 8, 8]),
-        }
-        self.mock_image_processor.preprocess.return_value = mock_preprocess_return
-
-        self.processor._add_video(self.dummy_video_frames, meta, outputs, uuid=None)
-
-        mock_hasher.assert_called_once_with(mock_preprocess_return["pixel_values"])
-        self.assertEqual(outputs["mm_hashes"][0], "dummy_hash_456")
-
-    @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
-    def test_add_processed_image(self):
-        """新增：测试 _add_processed_image 处理缓存数据"""
-        outputs = self._get_init_outputs()
-        outputs["cur_position"] = 3
-
-        self.processor._add_processed_image(self.dummy_processed_image_cache, outputs, "cached_img_uuid")
-
-        num_tokens = 16
-        self.assertEqual(len(outputs["input_ids"]), num_tokens)
-        self.assertEqual(outputs["input_ids"][0], self.processor.image_patch_id)
-
-        np.testing.assert_array_equal(outputs["images"][0], self.dummy_processed_image_cache[0])
-
-        self.assertEqual(outputs["mm_hashes"][0], "cached_img_uuid")
-        self.assertEqual(outputs["cur_position"], 7)
-
-    @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
-    def test_add_processed_video(self):
-        """新增：测试 _add_processed_video 处理缓存数据"""
-        outputs = self._get_init_outputs()
-        outputs["cur_position"] = 5
-
-        self.processor._add_processed_video(self.dummy_processed_video_cache, outputs, "cached_vid_uuid")
-
-        num_tokens = 64
-        t, h, w = self.dummy_processed_video_cache[1]["thw"]
-
-        self.assertEqual(len(outputs["input_ids"]), num_tokens)
-        self.assertEqual(outputs["token_type_ids"], [2] * num_tokens)
-
-        np.testing.assert_array_equal(outputs["images"][0], self.dummy_processed_video_cache[0])
-
-        self.assertEqual(outputs["mm_hashes"][0], "cached_vid_uuid")
-        self.assertEqual(outputs["image_type_ids"], [1] * t)
-        self.assertGreater(outputs["cur_position"], 5)
-
-    def test_text2ids_with_processed_data(self):
-        """新增：测试 text2ids 调用 _add_processed_image 和 _add_processed_video"""
-        with (
-            patch.object(self.processor, "_add_processed_image") as mock_add_proc_img,
-            patch.object(self.processor, "_add_processed_video") as mock_add_proc_vid,
-        ):
-
-            text = "<|IMAGE_PLACEHOLDER|><|video_pad|>"
-            images = [self.dummy_processed_image_cache]
-            videos = [self.dummy_processed_video_cache]
-            image_uuid = ["img1"]
-            video_uuid = ["vid1"]
-
-            self.processor.text2ids(text, images, videos, image_uuid, video_uuid)
-
-            mock_add_proc_img.assert_called_once_with(self.dummy_processed_image_cache, ANY, "img1")
-            mock_add_proc_vid.assert_called_once_with(self.dummy_processed_video_cache, ANY, "vid1")
-
-    @patch(f"{MODULE_PATH}.sample_frames")
-    @patch(f"{MODULE_PATH}.read_video_decord")
-    def test_load_and_process_video_no_sampling(self, mock_read_video, mock_sample_frames):
-        """新增：测试 _load_and_process_video 不采样（fps=-1）"""
-        mock_reader = MagicMock()
-        mock_reader.__getitem__.return_value.asnumpy.return_value = np.random.randint(
-            0, 255, (100, 100, 3), dtype=np.uint8
-        )
-        mock_meta = {"num_of_frame": 10, "duration": 1.0, "fps": 10.0}
-        mock_read_video.return_value = (mock_reader, mock_meta, None)
-
-        self.processor.fps = -1
-        self.processor.target_frames = -1
-
-        frames, meta = self.processor._load_and_process_video("dummy_url", {})
-
-        mock_sample_frames.assert_not_called()
-        self.assertEqual(frames.shape, (10, 100, 100, 3))
-        self.assertEqual(meta["num_of_frame"], 10)
-
-    def test_get_processor_cache(self):
-        """新增：测试 get_processor_cache (zmq)"""
-        hashes = ["hash1", "hash2"]
-        expected_items = ["item1", "item2"]
-        mock_resp = pickle.dumps(expected_items)
-        self.mock_zmq_socket.recv_multipart.return_value = (b"", mock_resp)
-
-        items = self.processor.get_processor_cache(self.mock_zmq_socket, hashes)
-
-        self.mock_zmq_socket.send_multipart.assert_called_once_with([b"", pickle.dumps(hashes)])
-        self.assertEqual(items, expected_items)
-
-    def test_update_processor_cache(self):
-        """新增：测试 update_processor_cache (zmq)"""
-        hashes = ["hash1"]
-        items = ["item1"]
-
-        self.processor.update_processor_cache(self.mock_zmq_socket, hashes, items)
-
-        expected_req = pickle.dumps((hashes, items))
-        self.mock_zmq_socket.send_multipart.assert_called_once_with([b"", expected_req])
-
-    def test_apply_chat_template(self):
-        """新增：测试 apply_chat_template 核心逻辑"""
-        request = {"messages": ["msg1"], "add_generation_prompt": True, "request_id": "req123"}
-        self.mock_tokenizer.apply_chat_template.return_value = "Prompt <|IMAGE_PLACEHOLDER|> text"
-        self.mock_tokenizer.tokenize.return_value = ["Prompt", "text"]
-
-        self.mock_tokenizer.convert_tokens_to_ids.side_effect = None
-        self.mock_tokenizer.convert_tokens_to_ids.return_value = [10, 11]
-
-        token_ids = self.processor.apply_chat_template(request)
-
-        self.assertEqual(token_ids, [10, 11])
-        self.assertEqual(request["text_after_process"], "Prompt <|IMAGE_PLACEHOLDER|> text")
-
-        self.mock_tokenizer.tokenize.assert_called_with("Prompt  text")
-
-    def test_apply_chat_template_raises_error(self):
-        """新增：测试 apply_chat_template 在模板不存在时引发 ValueError"""
-        self.mock_tokenizer.chat_template = None
-        with self.assertRaises(ValueError) as context:
-            self.processor.apply_chat_template({"messages": []})
-        self.assertIn("does not support chat_template", str(context.exception))
-
-    @patch(f"{MODULE_PATH}.parse_chat_messages")
-    def test_request2ids_cache_miss_raises_error(self, mock_parse_chat):
-        """新增：测试 request2ids 在缓存关闭时缺少数据引发 ValueError"""
-        messages = [{"role": "user", "content": [{"type": "image", "uuid": "img1"}]}]
-        request = {"request_id": "test_0", "messages": messages}
-        request = Request.from_dict(request)
-
-        mock_parse_chat.return_value = messages
-
-        with self.assertRaises(ValueError) as context:
-            self.processor.request2ids(request)
-
-        self.assertIn("Missing items cannot be retrieved without processor cache.", str(context.exception))
-
-    @patch(f"{MODULE_PATH}.DataProcessor.get_processor_cache")
-    @patch(f"{MODULE_PATH}.DataProcessor.update_processor_cache")
-    @patch(f"{MODULE_PATH}.DataProcessor.text2ids")
-    @patch(f"{MODULE_PATH}.parse_chat_messages")
-    def test_request2ids_cache_hit_and_update(self, mock_parse_chat, mock_text2ids, mock_update_cache, mock_get_cache):
-        """新增：测试 request2ids 缓存命中和缓存更新"""
-        self.processor = DataProcessor(model_path="dummy_model_path", enable_processor_cache=True)
-        self._configure_processor_ids()
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "uuid": "img_cache_hit"},
-                    {"type": "image", "data": self.dummy_image, "uuid": "img_to_update"},
-                ],
-            }
-        ]
-        request = {"request_id": "test_0", "messages": messages}
-        request = Request.from_dict(request)
-
-        mock_parse_chat.return_value = messages
-        mock_get_cache.return_value = [self.dummy_processed_image_cache]
-
-        mock_text2ids_output = {
-            "grid_thw": [(1, 8, 8), (1, 8, 8)],
-            "fps": [0, 0],
-            "mm_hashes": ["img_cache_hit", "img_to_update"],
-            "images": [self.dummy_processed_image_cache[0], self.dummy_processed_image_cache[0]],
-        }
-        mock_text2ids.return_value = mock_text2ids_output
-        self.mock_tokenizer.apply_chat_template.return_value = "<|IMAGE_PLACEHOLDER|><|IMAGE_PLACEHOLDER|>"
-
-        self.processor.request2ids(request)
-
-        self.mock_zmq_context.socket.assert_called_with(zmq.DEALER)
-        mock_get_cache.assert_called_once_with(self.mock_zmq_socket, ["img_cache_hit"])
-
-        parsed_images = mock_text2ids.call_args[0][1]
-        self.assertIs(parsed_images[0], self.dummy_processed_image_cache)
-        self.assertIs(parsed_images[1], self.dummy_image)
-
-        expected_hash_to_cache = ["img_to_update"]
-        expected_item_to_cache = (self.dummy_processed_image_cache[0], {"thw": (1, 8, 8), "fps": 0})
-        mock_update_cache.assert_called_once()
-        self.assertEqual(mock_update_cache.call_args[0][1], expected_hash_to_cache)
-        self.assertEqual(mock_update_cache.call_args[0][2][0][1], expected_item_to_cache[1])
-        np.testing.assert_array_equal(mock_update_cache.call_args[0][2][0][0], expected_item_to_cache[0])
-
-    @patch(f"{MODULE_PATH}.DataProcessor.text2ids")
-    @patch(f"{MODULE_PATH}.parse_chat_messages")
-    def test_request2ids_unsupported_type(self, mock_parse_chat, mock_text2ids):
-        """新增：测试 request2ids 静默忽略不支持的类型"""
-        messages = [
-            {
-                "role": "user",
-                "content": [{"type": "text", "text": "Hello"}, {"type": "audio", "data": "...", "uuid": "audio1"}],
-            }
-        ]
-        request = {"request_id": "test_0", "messages": messages}
-        request = Request.from_dict(request)
-
-        mock_parse_chat.return_value = messages
-        self.mock_tokenizer.apply_chat_template.return_value = "User: Hello "
-
-        self.processor.request2ids(request)
-
-        mock_text2ids.assert_called_once()
-        call_args = mock_text2ids.call_args[0]
-        self.assertEqual(call_args[1], [])  # images
-        self.assertEqual(call_args[2], [])  # videos
-        self.assertEqual(call_args[3], [])  # image_uuid
-        self.assertEqual(call_args[4], [])  # video_uuid
-
-
-class TestPaddleOCR_VL_ImageProcessor(unittest.TestCase):
-    def setUp(self):
-        # 初始化默认参数
-        self.default_params = {
-            "do_resize": True,
-            "resample": 3,
-            "do_rescale": True,
-            "rescale_factor": 1 / 255,
-            "do_normalize": True,
-            "image_mean": [0.48145466, 0.4578275, 0.40821073],
-            "image_std": [0.26862954, 0.26130258, 0.27577711],
-            "do_convert_rgb": True,
-            "min_pixels": 28 * 28 * 130,
-            "max_pixels": 28 * 28 * 1280,
-            "patch_size": 14,
-            "temporal_patch_size": 1,
-            "merge_size": 2,
-        }
-
-        # 创建测试图像
-        self.test_image = Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8))
-
-    def test_initialization(self):
-        """测试初始化参数是否正确设置"""
-        processor = ImageProcessor(**self.default_params)
-
-        for param, value in self.default_params.items():
-            self.assertEqual(getattr(processor, param), value)
-
-    def test_smart_resize(self):
-        """测试智能调整图像大小功能"""
-        # 测试正常尺寸调整
-        h, w = smart_resize(224, 224, factor=28)
-        self.assertEqual(h % 28, 0)
-        self.assertEqual(w % 28, 0)
-
-        # 测试小尺寸调整
-        h, w = smart_resize(20, 20, factor=28)
-        self.assertGreaterEqual(h, 28)
-        self.assertGreaterEqual(w, 28)
-
-        # 测试超大尺寸调整
-        h, w = smart_resize(2000, 2000, factor=28)
-        self.assertLess(h * w, 28 * 28 * 1280)
-
-    def test_preprocess_single_image(self):
-        """测试单张图像预处理流程"""
-        processor = ImageProcessor(**self.default_params)
-
-        # 测试正常预处理
-        result = processor.preprocess(self.test_image)
-        self.assertIn("pixel_values", result)
-        self.assertIn("grid_thw", result)
-        self.assertEqual(result["pixel_values"].ndim, 4)  # [N, C, H, W]
-
-        # 测试关闭某些预处理步骤
-        result = processor.preprocess(self.test_image, do_resize=False, do_normalize=False)
-        self.assertIn("pixel_values", result)
-
-    def test_preprocess_batch_images(self):
-        """测试批量图像预处理"""
-        processor = ImageProcessor(**self.default_params)
-        batch_images = [self.test_image, self.test_image]
-
-        result = processor.preprocess(batch_images)
-        expected_shape = 1152
-        self.assertEqual(result["pixel_values"].shape[0], expected_shape)
-
-    def test_invalid_input(self):
-        """测试无效输入处理"""
-        processor = ImageProcessor(**self.default_params)
-
-        # 测试无效图像
-        with self.assertRaises(ValueError):
-            processor.preprocess("invalid_image")
-
-        # 测试视频输入(暂不支持)
-        with self.assertRaises(NotImplementedError):
-            processor.preprocess(self.test_image, videos=["video"])
-
-    def test_from_pretrained(self):
-        """测试从预训练模型加载配置"""
-        with patch("builtins.open", unittest.mock.mock_open(read_data='{"do_resize": false}')) as mock_file:
-            processor = ImageProcessor.from_pretrained("dummy_path")
-            self.assertFalse(processor.do_resize)
-            mock_file.assert_called_once()
-
-
-class TestPaddleOCRVLProcessor(unittest.TestCase):
-    def setUp(self):
-        # 创建 PaddleOCRVLProcessor 实例的模拟对象
-        with patch.object(PaddleOCRVLProcessor, "__init__", return_value=None):
-            self.processor = PaddleOCRVLProcessor("model_path")
-
-        # 设置必要的属性
-        self.processor.tokenizer = MagicMock()
-        self.processor.tokenizer.eos_token_id = 1
-        self.processor.processor = MagicMock()
-        self.processor.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
-        self.processor.eos_token_ids = [1]
-        self.processor.reasoning_parser = None
-        self.processor.model_status_dict = {}
-
-        # 模拟 _apply_default_parameters
-        def mock_apply_default_parameters(request_or_dict):
-            if isinstance(request_or_dict, dict):
-                if "top_p" not in request_or_dict:
-                    request_or_dict["top_p"] = 0.9
-                return request_or_dict
-
-            if not hasattr(request_or_dict, "top_p"):
-                request_or_dict.top_p = 0.9
-            return request_or_dict
-
-        self.processor._apply_default_parameters = mock_apply_default_parameters
-
-        # 模拟 pack_outputs
-        def mock_pack_outputs(outputs):
-            # 简化 position_ids 的处理
-            position_ids_list = outputs["position_ids"]
-            if not position_ids_list:
-                position_ids = np.array([], dtype=np.int64)
-            elif isinstance(position_ids_list[0], list):
-                position_ids = np.array(position_ids_list, dtype=np.int64)
-            else:
-                position_ids = np.concatenate(position_ids_list, axis=1, dtype=np.int64)
-
-            if position_ids.ndim == 1:
-                position_ids = position_ids.reshape(1, -1)
-
-            # 源码的 pack_outputs 会 transpose
-            position_ids = position_ids.transpose(1, 0)
-
-            return {
-                "input_ids": np.array(outputs["input_ids"], dtype=np.int64),
-                "token_type_ids": np.array(outputs["token_type_ids"], dtype=np.int64),
-                "position_ids": position_ids,
-                "images": np.vstack(outputs["images"]) if outputs.get("images") else None,
-                "grid_thw": np.vstack(outputs["grid_thw"]) if outputs.get("grid_thw") else None,
-                "image_type_ids": np.array(outputs["image_type_ids"]) if outputs.get("image_type_ids") else None,
-            }
-
-        self.processor.pack_outputs = mock_pack_outputs
-        self.processor.np = np
-
-        # 模拟 _SAMPLING_EPS 常量
-        self.processor._SAMPLING_EPS = 1e-5
-
-        # 模拟 processor 返回 (position_ids 必须是 2D array 的 list)
-        self.processor.processor.text2ids.return_value = {
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)],  # 修正
-            "images": ["image_feature"],
-            "grid_thw": ["grid_feature"],
-            "image_type_ids": [0],
-            "cur_position": 3,
-        }
-
-        self.processor.processor.request2ids.return_value = {
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)],  # 修正
-            "images": ["image_feature"],
-            "grid_thw": ["grid_feature"],
-            "image_type_ids": [0],
-            "cur_position": 3,
-        }
-
-        # 模拟 _compute_text_positions 方法 (返回 2D array)
-        self.processor.processor._compute_text_positions = lambda pos, num: np.array(
-            [list(range(pos, pos + num))], dtype=np.int64
-        )
-
-        # 模拟 update_stop_seq
-        self.processor.update_stop_seq = MagicMock(return_value=([[99, 98]], [2]))
-
-        # 模拟 pack_outputs 需要的属性
-        self.processor.processor.image_token_id = 100
-        self.processor.processor.video_token_id = 101
-
-    def test_process_request_dict_basic(self):
-        """测试基本请求处理功能"""
-        request = {
-            "request_id": "test_request",
-            "prompt": "test prompt",
-            "multimodal_data": {"image": ["image1"]},
-            "metadata": {"generated_token_ids": []},
-        }
-        request = Request.from_dict(request)
-
-        result = self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(result.prompt_token_ids, [1, 2, 3])
-        self.assertEqual(result.prompt_token_ids_len, 3)
-        self.assertTrue(hasattr(result, "multimodal_inputs"))
-
-    def test_process_request_dict_with_messages(self):
-        """测试 messages 格式的请求处理"""
-        request = {
-            "request_id": "test_0",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [{"type": "text", "text": "Hello"}, {"type": "image_url", "url": "image1"}],
-                }
-            ],
-            "metadata": {"generated_token_ids": []},
-        }
-        request = Request.from_dict(request)
-
-        result = self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(result.prompt_token_ids, [1, 2, 3])
-        self.assertTrue(hasattr(result, "multimodal_inputs"))
-
-    def test_process_request_dict_with_max_len(self):
-        """测试最大长度限制功能"""
-        request = {
-            "request_id": "test_0",
-            "prompt": "test prompt",
-            "multimodal_data": {"image": ["image1"]},
-            "metadata": {"generated_token_ids": []},
-        }
-        request = Request.from_dict(request)
-
-        # 模拟 processor 返回长序列
-        self.processor.processor.text2ids.return_value = {
-            "input_ids": list(range(100)),
-            "token_type_ids": [0] * 100,
-            "position_ids": [np.array([list(range(100))], dtype=np.int64)],
-            "images": ["image_feature"],
-            "grid_thw": ["grid_feature"],
-            "image_type_ids": [0],
-            "cur_position": 100,
-        }
-
-        max_model_len = 50
-        result = self.processor.process_request_dict(request, max_model_len)
-        # 验证是否截断到 max_model_len - 1
-        self.assertEqual(len(result.prompt_token_ids), max_model_len - 1)
-        self.assertEqual(result.prompt_token_ids, list(range(49)))
-        # 验证原始输入长度确实超过了限制
-        self.assertGreater(len(self.processor.processor.text2ids.return_value["input_ids"]), max_model_len)
-
-    def test_parse_processor_kwargs(self):
-        """测试处理器参数解析"""
-        valid_kwargs = {"video_max_frames": 10, "video_min_frames": 1}
-        result = self.processor._parse_processor_kwargs(valid_kwargs)
-        self.assertEqual(result, valid_kwargs)
-
-        # 测试无效参数
-        invalid_kwargs = {"video_max_frames": "invalid"}
-        with patch(
-            "fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor.data_processor_logger"
-        ) as mock_logger:
-            result = self.processor._parse_processor_kwargs(invalid_kwargs)
-            self.assertEqual(result, {})
-            # 确认警告已被记录
-            mock_logger.warning.assert_called()
-
-    def test_parse_limits(self):
-        """测试输入限制解析"""
-        custom_limits = {"image": 2, "video": 3}
-        result = self.processor._parse_limits(custom_limits)
-        self.assertEqual(result["image"], 2)
-        self.assertEqual(result["video"], 3)
-        self.assertEqual(result["audio"], 1)  # 默认值
-
-    def test_check_mm_limits(self):
-        """测试多模态输入限制检查 (dict path)"""
-        # 测试不超限
-        item = {"image": ["image1"], "video": ["video1"]}
-        self.processor._check_mm_limits(item)
-
-        # 测试超限
-        item_exceeded = {"image": ["image1", "image2"], "video": ["video1"]}
-        with self.assertRaises(ValueError):
-            self.processor._check_mm_limits(item_exceeded)
-
-    def test_parse_processor_kwargs_invalid_type(self):
-        """测试 _parse_processor_kwargs 传入非字典类型"""
-        invalid_input = ["video_max_frames", 10]
-        with patch(
-            "fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor.data_processor_logger"
-        ) as mock_logger:
-            result = self.processor._parse_processor_kwargs(invalid_input)
-            self.assertEqual(result, {})  # 触发
-            mock_logger.warning.assert_called()
-
-    def test_parse_limits_invalid_type(self):
-        """测试 _parse_limits 传入非字典类型"""
-        invalid_input = ["image", 2]
-        with patch(
-            "fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor.data_processor_logger"
-        ) as mock_logger:
-            result = self.processor._parse_limits(invalid_input)
-            # 应返回默认值
-            self.assertEqual(result, {"image": 1, "video": 1, "audio": 1})
-            mock_logger.warning.assert_called()
-
-    def test_check_mm_limits_messages_path(self):
-        """测试 _check_mm_limits (messages path)"""
-        messages = [
-            {"role": "user", "content": [{"type": "text", "text": "Hello"}, {"type": "image_url", "url": "image1"}]}
-        ]
-        self.processor._check_mm_limits(messages)  # 不应抛出异常
-
-    def test_check_mm_limits_messages_exceeded(self):
-        """测试 _check_mm_limits (messages path) 超限"""
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "Hello"},
-                    {"type": "image_url", "url": "image1"},
-                    {"type": "image_url", "url": "image2"},  # 超过限制 1
-                ],
-            }
-        ]
-        with self.assertRaises(ValueError):
-            self.processor._check_mm_limits(messages)
-
-    def test_process_request_dict_no_prompt_or_messages(self):
-        """测试当请求既没有 prompt 也没有 messages 时抛出异常"""
-        request = {"request_id": "test_0", "metadata": {"generated_token_ids": []}}
-        request = Request.from_dict(request)
-        with self.assertRaises(ValueError):
-            self.processor.process_request_dict(request, max_model_len=512)
-
-    def test_process_request_dict_with_continuation(self):
-        """测试续写逻辑 (metadata 包含 generated_token_ids)"""
-        request = {
-            "request_id": "test_0",
-            "prompt": "test prompt",
-            "multimodal_data": {"image": ["image1"]},
-            "metadata": {"generated_token_ids": [10, 11, 12]},  # 已生成的 token
-        }
-        request = Request.from_dict(request)
-        setattr(request, "metadata", {"generated_token_ids": [10, 11, 12]})
-
-        result = self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(result.prompt_token_ids, [1, 2, 3, 10, 11, 12])
-        self.assertEqual(result.prompt_token_ids_len, 6)
-
-    def test_process_request_dict_with_stop_sequences(self):
-        """测试 stop_sequences 处理"""
-        request = {
-            "request_id": "test_0",
-            "prompt": "test prompt",
-            "stop": ["stop1", "stop2"],
-            "metadata": {"generated_token_ids": []},
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, max_model_len=512)
-
-        # 验证 update_stop_seq 被调用
-        self.processor.update_stop_seq.assert_called_with(["stop1", "stop2"])
-        # 验证结果被设置到 request 中
-        self.assertEqual(result.sampling_params.stop_token_ids, [[99, 98]])
-        self.assertEqual(result.sampling_params.stop_seqs_len, [2])
-
-    def test_process_request_dictefault_max_tokens(self):
-        """测试默认 max_tokens 计算"""
-        request = {
-            "request_id": "test_0",
-            "prompt": "test prompt",
-            "metadata": {"generated_token_ids": []},
-        }  # 长度为 3
-        request = Request.from_dict(request)
-        max_model_len = 10
-        result = self.processor.process_request_dict(request, max_model_len)
-
-        self.assertEqual(result.sampling_params.max_tokens, 7)
-
-    def test_process_request_dict_top_p_clamping(self):
-        """测试 top_p 值被修正 (clamping)"""
-        request = {
-            "request_id": "test_0",
-            "prompt": "test prompt",
-            "top_p": 0.0,  # 低于 _SAMPLING_EPS
-            "metadata": {"generated_token_ids": []},
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(result.sampling_params.top_p, self.processor._SAMPLING_EPS)
-
-    def test_append_generated_tokens(self):
-        """直接测试 append_generated_tokens 辅助函数"""
-        # : position_ids 必须是 [2D array]
-        multimodal_inputs = {
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)],
-            "cur_position": 3,
-        }
-        generated_token_ids = [10, 11]
-
-        # 调用 append_generated_tokens (它是 PaddleOCRVLProcessor 的方法)
-        PaddleOCRVLProcessor.append_generated_tokens(self.processor, multimodal_inputs, generated_token_ids)
-
-        self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 10, 11])
-        self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0])
-        # : 检查 position_ids 是否为 [np.array(...), np.array(...)]
-        self.assertEqual(len(multimodal_inputs["position_ids"]), 2)
-        self.assertTrue(np.array_equal(multimodal_inputs["position_ids"][0], np.array([[0, 1, 2]], dtype=np.int64)))
-        self.assertTrue(np.array_equal(multimodal_inputs["position_ids"][1], np.array([[3, 4]], dtype=np.int64)))
-        self.assertEqual(multimodal_inputs["cur_position"], 5)
-
-    def test_pack_outputs_real_no_images(self):
-        """测试真实的 pack_outputs 方法 (无图像)"""
-        outputs = {
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            # : position_ids 必须是 [2D array]
-            "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)],
-            "images": [],  # 空列表
-            "grid_thw": [],
-            "image_type_ids": [],
-            "cur_position": 3,
-        }
-
-        # 调用真实的类方法，而不是 setUp 中 mock 的实例方法
-        result = PaddleOCRVLProcessor.pack_outputs(self.processor, outputs)
-
-        self.assertIsNone(result["images"])
-        self.assertIsNone(result["grid_thw"])
-        self.assertIsNone(result["image_type_ids"])
-        self.assertTrue(np.array_equal(result["input_ids"], np.array([1, 2, 3], dtype=np.int64)))
-        # 验证 position_ids 被 concatenate 和 transpose
-        # input: [array([[0, 1, 2]])] -> concat: array([[0, 1, 2]]) (shape 1,3) -> transpose: array([[0], [1], [2]]) (shape 3,1)
-        self.assertTrue(np.array_equal(result["position_ids"], np.array([[0], [1], [2]], dtype=np.int64)))
-        self.assertEqual(result["image_patch_id"], 100)
-        self.assertEqual(result["video_patch_id"], 101)
-
-    def test_pack_outputs_real_with_images(self):
-        """测试真实的 pack_outputs 方法 (有图像)"""
-        image_feature = np.array([[0.1, 0.2]])
-        grid_feature = np.array([[1, 2, 3]])
-
-        outputs = {
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            # : position_ids 必须是 [2D array]
-            "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)],
-            "images": [image_feature],
-            "grid_thw": [grid_feature],
-            "image_type_ids": [0],
-            "cur_position": 3,
-        }
-
-        result = PaddleOCRVLProcessor.pack_outputs(self.processor, outputs)
-
-        self.assertTrue(np.array_equal(result["images"], image_feature))
-        self.assertTrue(np.array_equal(result["grid_thw"], grid_feature))
-        self.assertTrue(np.array_equal(result["image_type_ids"], np.array([0])))
-        self.assertTrue(np.array_equal(result["position_ids"], np.array([[0], [1], [2]], dtype=np.int64)))
-
-    def test_think_status(self):
-        """测试 思考机制"""
-        request = {
-            "prompt": "hello",
-            "request_id": "test_1",
-            "prompt_token_ids": [1, 2, 3],
-        }
-        request = Request.from_dict(request)
-        self.processor.reasoning_parser = MagicMock()
-        self.processor.reasoning_parser.get_model_status.return_value = "think_start"
-        self.processor.model_status_dict = {}
-        self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(request.enable_thinking, True)
-
-        request = {
-            "prompt": "hello",
-            "request_id": "test_2",
-            "prompt_token_ids": [1, 2, 3],
-        }
-        request = Request.from_dict(request)
-        self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(request.enable_thinking, True)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/input/v1/test_process_video.py b/tests/input/v1/test_process_video.py
deleted file mode 100644
index b8777cb0a76..00000000000
--- a/tests/input/v1/test_process_video.py
+++ /dev/null
@@ -1,386 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import io
-import math
-import os
-import tempfile
-import unittest
-from unittest.mock import patch
-
-import numpy as np
-from PIL import Image as PILImage
-
-import fastdeploy.input.v1.ernie4_5_vl_processor.process_video as process_video_module
-from fastdeploy.input.v1.ernie4_5_vl_processor.process_video import (
-    get_frame_indices,
-    read_frames_decord,
-    read_video_decord,
-)
-
-
-class _MockFrame:
-    """Lightweight frame wrapper that mimics the real frame object."""
-
-    def __init__(self, arr):
-        self._arr = arr
-
-    def asnumpy(self):
-        """Return the underlying numpy array."""
-        return self._arr
-
-
-class MockVideoReaderWrapper:
-    """
-    Simple mock implementation of a video reader:
-
-    - __len__ returns the total number of frames
-    - __getitem__ returns a _MockFrame(arr)
-    - get_avg_fps() returns fps
-    - Specific indices can be configured to raise errors in __getitem__
-    """
-
-    def __init__(
-        self,
-        src,
-        num_threads=1,
-        vlen=12,
-        fps=6,
-        fail_indices=None,
-        h=4,
-        w=5,
-        c=3,
-    ):
-        self.src = src
-        self._vlen = vlen
-        self._fps = fps
-        self._fail = set(fail_indices or [])
-        self._h, self._w, self._c = h, w, c
-
-    def __len__(self):
-        return self._vlen
-
-    def get_avg_fps(self):
-        return self._fps
-
-    def __getitem__(self, idx):
-        if idx < 0 or idx >= self._vlen:
-            raise IndexError("index out of range")
-        if idx in self._fail:
-            raise ValueError(f"forced fail at {idx}")
-        # Create a frame whose pixel value encodes the index (for easy debugging)
-        arr = np.zeros((self._h, self._w, self._c), dtype=np.uint8)
-        arr[:] = idx % 255
-        return _MockFrame(arr)
-
-
-class TestReadVideoDecord(unittest.TestCase):
-    def test_read_video_decord_with_wrapper(self):
-        """Test passing an existing VideoReaderWrapper instance directly."""
-        # Patch VideoReaderWrapper in the target module so isinstance checks use our mock class
-        with patch.object(process_video_module, "VideoReaderWrapper", MockVideoReaderWrapper):
-            mock_reader = MockVideoReaderWrapper("dummy", vlen=10, fps=5)
-            reader, meta, path = read_video_decord(mock_reader, save_to_disk=False)
-
-        self.assertIs(reader, mock_reader)
-        self.assertEqual(meta["fps"], 5)
-        self.assertEqual(meta["num_of_frame"], 10)
-        self.assertTrue(math.isclose(meta["duration"], 10 / 5, rel_tol=1e-6))
-        # The original reader object should be returned unchanged
-        self.assertIs(path, mock_reader)
-
-    def test_read_video_decord_with_bytes(self):
-        """Test that bytes input is wrapped into BytesIO and passed to VideoReaderWrapper."""
-        with patch.object(process_video_module, "VideoReaderWrapper", MockVideoReaderWrapper):
-            data = b"\x00\x01\x02\x03"
-            reader, meta, path = read_video_decord(data, save_to_disk=False)
-
-        self.assertIsInstance(reader, MockVideoReaderWrapper)
-        self.assertEqual(meta["fps"], 6)
-        self.assertEqual(meta["num_of_frame"], 12)
-        self.assertTrue(math.isclose(meta["duration"], 12 / 6, rel_tol=1e-6))
-        self.assertIsInstance(path, io.BytesIO)
-
-
-class TestGetFrameIndices(unittest.TestCase):
-    def test_by_target_frames_middle(self):
-        """Test target_frames mode with 'middle' sampling strategy."""
-        vlen = 12
-        out = get_frame_indices(
-            vlen=vlen,
-            target_frames=4,
-            target_fps=-1,
-            frames_sample="middle",
-            input_fps=-1,
-        )
-        # 12 frames split into 4 segments -> midpoints [1, 4, 7, 10]
-        self.assertEqual(out, [1, 4, 7, 10])
-
-    def test_by_target_frames_leading(self):
-        """Test target_frames mode with 'leading' sampling strategy."""
-        vlen = 10
-        out = get_frame_indices(
-            vlen=vlen,
-            target_frames=5,
-            target_fps=-1,
-            frames_sample="leading",
-            input_fps=-1,
-        )
-        # 10 frames split into 5 segments -> segment starts [0, 2, 4, 6, 8]
-        self.assertEqual(out, [0, 2, 4, 6, 8])
-
-    def test_by_target_frames_rand(self):
-        """Test target_frames mode with 'rand' sampling strategy."""
-        vlen = 10
-        out = get_frame_indices(
-            vlen=vlen,
-            target_frames=4,
-            target_fps=-1,
-            frames_sample="rand",
-            input_fps=-1,
-        )
-        self.assertEqual(len(out), 4)
-        self.assertTrue(all(0 <= i < vlen for i in out))
-
-    def test_by_target_frames_fix_start(self):
-        """Test target_frames mode with a fixed start offset."""
-        vlen = 10
-        out = get_frame_indices(
-            vlen=vlen,
-            target_frames=5,
-            target_fps=-1,
-            frames_sample="middle",  # overridden by fix_start
-            fix_start=1,
-            input_fps=-1,
-        )
-        # Segment starts [0, 2, 4, 6, 8] -> +1 => [1, 3, 5, 7, 9]
-        self.assertEqual(out, [1, 3, 5, 7, 9])
-
-    def test_target_frames_greater_than_vlen(self):
-        """Test that target_frames > vlen falls back to using vlen samples."""
-        vlen = 5
-        out = get_frame_indices(
-            vlen=vlen,
-            target_frames=10,
-            target_fps=-1,
-            frames_sample="middle",
-            input_fps=-1,
-        )
-        self.assertEqual(len(out), vlen)
-        self.assertTrue(all(0 <= i < vlen for i in out))
-
-    def test_by_target_fps_middle(self):
-        """Test target_fps mode with 'middle' sampling strategy."""
-        vlen, in_fps = 12, 6
-        out = get_frame_indices(
-            vlen=vlen,
-            target_frames=-1,
-            target_fps=2,
-            frames_sample="middle",
-            input_fps=in_fps,
-        )
-        # Roughly 4 frames expected
-        self.assertTrue(3 <= len(out) <= 5)
-        self.assertTrue(all(0 <= i < vlen for i in out))
-
-    def test_by_target_fps_leading(self):
-        """Test target_fps mode with 'leading' sampling strategy."""
-        vlen, in_fps = 12, 6
-        out = get_frame_indices(
-            vlen=vlen,
-            target_frames=-1,
-            target_fps=2,
-            frames_sample="leading",
-            input_fps=in_fps,
-        )
-        self.assertTrue(3 <= len(out) <= 5)
-        self.assertTrue(all(0 <= i < vlen for i in out))
-
-    def test_by_target_fps_rand(self):
-        """Test target_fps mode with 'rand' sampling strategy."""
-        vlen, in_fps = 12, 6
-        out = get_frame_indices(
-            vlen=vlen,
-            target_frames=-1,
-            target_fps=2,
-            frames_sample="rand",
-            input_fps=in_fps,
-        )
-        self.assertTrue(3 <= len(out) <= 5)
-        self.assertTrue(all(0 <= i < vlen for i in out))
-
-    def test_invalid_both_negative(self):
-        """Test that both target_frames and target_fps being negative raises ValueError."""
-        with self.assertRaises(ValueError):
-            get_frame_indices(
-                vlen=10,
-                target_frames=-1,
-                target_fps=-1,
-                frames_sample="middle",
-            )
-
-    def test_invalid_both_specified(self):
-        """Test that specifying both target_frames and target_fps raises AssertionError."""
-        with self.assertRaises(AssertionError):
-            get_frame_indices(
-                vlen=10,
-                target_frames=4,
-                target_fps=2,
-                frames_sample="middle",
-                input_fps=6,
-            )
-
-    def test_invalid_target_fps_missing_input(self):
-        """Test that target_fps > 0 with invalid input_fps raises AssertionError."""
-        with self.assertRaises(AssertionError):
-            get_frame_indices(
-                vlen=10,
-                target_frames=-1,
-                target_fps=2,
-                frames_sample="middle",
-                input_fps=-1,
-            )
-
-
-class TestReadFramesDecord(unittest.TestCase):
-    def test_basic_read_no_save(self):
-        """Test normal frame reading without saving to disk."""
-        reader = MockVideoReaderWrapper("dummy", vlen=8, fps=4)
-        meta = {"fps": 4, "duration": 8 / 4, "num_of_frame": 8}
-
-        ret, idxs, ts = read_frames_decord(
-            video_path="dummy",
-            video_reader=reader,
-            video_meta=meta,
-            target_frames=4,
-            frames_sample="middle",
-            save_to_disk=False,
-        )
-
-        # Should return 4 PIL.Image instances
-        self.assertEqual(len(ret), 4)
-        for img in ret:
-            self.assertIsInstance(img, PILImage.Image)
-
-        self.assertEqual(idxs, [0, 2, 4, 6])
-        dur = meta["duration"]
-        n = meta["num_of_frame"]
-        for i, t in zip(idxs, ts):
-            self.assertTrue(math.isclose(t, i * dur / n, rel_tol=1e-6))
-
-    def test_read_and_save_to_disk(self):
-        """Test reading frames and saving them as PNG files on disk."""
-        reader = MockVideoReaderWrapper("dummy", vlen=4, fps=2)
-        meta = {"fps": 2, "duration": 4 / 2, "num_of_frame": 4}
-
-        with (
-            tempfile.TemporaryDirectory() as tmpdir,
-            patch.object(
-                process_video_module,
-                "get_filename",
-                return_value="det_id",
-            ),
-        ):
-            ret, idxs, ts = read_frames_decord(
-                video_path="dummy",
-                video_reader=reader,
-                video_meta=meta,
-                target_frames=2,
-                frames_sample="leading",
-                save_to_disk=True,
-                cache_dir=tmpdir,
-            )
-
-            self.assertEqual(len(ret), 2)
-            for i, pth in enumerate(ret):
-                self.assertIsInstance(pth, str)
-                self.assertTrue(os.path.exists(pth))
-                self.assertEqual(os.path.basename(pth), f"{i}.png")
-
-    def test_fallback_previous_success(self):
-        """Test that a failed frame read falls back to a previous valid frame when possible."""
-        reader = MockVideoReaderWrapper("dummy", vlen=10, fps=5, fail_indices={3})
-        meta = {"fps": 5, "duration": 10 / 5, "num_of_frame": 10}
-        idxs = [1, 2, 3, 6]
-
-        ret, new_idxs, ts = read_frames_decord(
-            video_path="dummy",
-            video_reader=reader,
-            video_meta=meta,
-            frame_indices=idxs.copy(),
-            save_to_disk=False,
-            tol=5,
-        )
-
-        # Index 3 fails and should be replaced by 2 or 4 (previous/next search)
-        self.assertIn(new_idxs[2], (2, 4))
-        self.assertEqual(len(ret), 4)
-
-    def test_fallback_next_when_prev_fails(self):
-        """Test that when current and previous frames fail, a later frame is used as fallback."""
-        reader = MockVideoReaderWrapper("dummy", vlen=10, fps=5, fail_indices={2, 3})
-        meta = {"fps": 5, "duration": 10 / 5, "num_of_frame": 10}
-        idxs = [1, 2, 3, 6]
-
-        ret, new_idxs, ts = read_frames_decord(
-            video_path="dummy",
-            video_reader=reader,
-            video_meta=meta,
-            frame_indices=idxs.copy(),
-            save_to_disk=False,
-            tol=5,
-        )
-
-        # Frame 3 should eventually be replaced by 4
-        self.assertEqual(new_idxs[2], 4)
-        self.assertEqual(len(ret), 4)
-
-    def test_len_assert_when_no_fallback(self):
-        """Test that assertion is triggered when no valid fallback frame can be found."""
-
-        class FailAllAroundReader(MockVideoReaderWrapper):
-            """Reader that fails on index 1 and has too small length to find fallback."""
-
-            def __init__(self, *a, **kw):
-                super().__init__(*a, **kw)
-                self._vlen = 2
-                self._fps = 2
-                self._fail = {1}
-
-            def __getitem__(self, idx):
-                if idx in self._fail:
-                    raise ValueError("fail hard")
-                return super().__getitem__(idx)
-
-        reader = FailAllAroundReader("dummy")
-        meta = {"fps": 2, "duration": 2 / 2, "num_of_frame": 2}
-
-        # Request 2 frames: index 0 succeeds, index 1 always fails,
-        # and tol=0 disallows searching neighbors -> stack and length assertion should fail
-        with self.assertRaises(AssertionError):
-            read_frames_decord(
-                video_path="dummy",
-                video_reader=reader,
-                video_meta=meta,
-                target_frames=2,
-                frames_sample="leading",
-                save_to_disk=False,
-                tol=0,
-            )
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/input/v1/test_qwen3_vl_processor.py b/tests/input/v1/test_qwen3_vl_processor.py
deleted file mode 100644
index c858f1e51db..00000000000
--- a/tests/input/v1/test_qwen3_vl_processor.py
+++ /dev/null
@@ -1,1172 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import copy
-import unittest
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.qwen3_vl_processor import Qwen3VLProcessor
-from fastdeploy.input.v1.qwen3_vl_processor.process import sample_frames
-
-
-def mock_pil_image(height, width):
-    """
-    Generate mock random RGB image
-
-    Args:
-        height: Image height in pixels
-        width: Image width in pixels
-
-    Returns:
-        PIL.Image object with random RGB data
-    """
-    rgb_image = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
-    return Image.fromarray(rgb_image)
-
-
-def mock_read_frames(height: int, width: int, nums_frame: int, fps: int):
-    """
-    Generate mock video frames with metadata for testing purposes
-
-    Creates synthetic video data by generating random RGB frames and constructing
-    corresponding metadata to simulate real video processing.
-
-    Args:
-        height (int): Height of video frames in pixels
-        width (int): Width of video frames in pixels
-        nums_frame (int): Number of frames to generate
-        fps (int): Frames per second for the mock video
-
-    Returns:
-        tuple: A tuple containing:
-            frames (numpy.ndarray): Array of shape (nums_frame, height, width, 3)
-                containing randomly generated RGB frames
-            meta (dict): Dictionary with video metadata:
-                - fps (int): Frames per second (same as input)
-                - duration (float): Calculated duration in seconds (nums_frame/fps)
-                - num_of_frame (int): Number of frames (same as nums_frame input)
-    """
-    frames = []
-    for _ in range(nums_frame):
-        frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
-        frames.append(frame)
-    frames = np.stack(frames, axis=0)
-
-    meta = {
-        "fps": fps,
-        "duration": nums_frame / fps,
-        "num_of_frame": nums_frame,
-    }
-    return frames, meta
-
-
-class TestQwen3VLProcessor(unittest.TestCase):
-    """
-    Unit tests for Qwen Vision-Language Processor functionality
-    """
-
-    def setUp(self):
-        """
-        Initialize test case with:
-        - Mock configuration
-        - Patched message parsing and video processing methods
-        - QwenVLProcessor instance with test parameters
-        """
-        config = MagicMock()
-        config.vision_config.tokens_per_second = 2
-
-        self.patcher_parse_image = patch(
-            "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_image", return_value=mock_pil_image(480, 640)
-        )
-        self.patcher_parse_image.start()
-
-        self.patcher_parse_video = patch(
-            "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_video", return_value=b"123"
-        )
-        self.patcher_parse_video.start()
-
-        self.patcher_read_frames = patch(
-            "fastdeploy.input.v1.qwen3_vl_processor.process.DataProcessor._load_and_process_video",
-            return_value=mock_read_frames(480, 640, 5, 2),
-        )
-        self.patcher_read_frames.start()
-
-        mm_processor_kwargs = {"video_max_frames": 10, "video_min_frames": 1}
-        limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
-
-        self.model_name_or_path = "/ModelData/Qwen3-VL-4B-Instruct"
-        self.processor = Qwen3VLProcessor(
-            config=config,
-            model_name_or_path=self.model_name_or_path,
-            limit_mm_per_prompt=limit_mm_per_prompt,
-            mm_processor_kwargs=mm_processor_kwargs,
-            reasoning_parser_obj=None,
-            tool_parser_obj=None,
-        )
-
-    def tearDown(self) -> None:
-        """Clean up test case by stopping all mock patches"""
-        self.patcher_read_frames.stop()
-        self.patcher_parse_image.stop()
-        self.patcher_parse_video.stop()
-
-    def test_process_request_dict(self):
-        """
-        Test processing of dictionary-format request with multimodal input
-
-        Validates:
-        1. Token ID lengths match position_ids and token_type_ids shapes
-        2. Image processing produces expected output dimensions
-        3. Video processing produces expected output dimensions
-        4. Correct counts for images (1) and videos (1)
-        """
-        num_completion_token_ids = 10
-        request = {
-            "request_id": "12345",
-            "completion_token_ids": [1] * num_completion_token_ids,
-            "stop": ["stop", "eof"],
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                        {"type": "text", "text": "Describe image and video."},
-                    ],
-                }
-            ],
-        }
-        request = Request.from_dict(request)
-
-        result = self.processor.process_request_dict(request, 1024 * 100)
-
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
-        self.assertEqual(
-            result.multimodal_inputs["images"].shape[0],
-            sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
-        )
-        self.assertEqual(
-            result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
-        )
-
-    def test_prompt(self):
-        """
-        Test processing of prompt with image and video placeholders
-
-        Validates:
-        1. Token ID lengths match position_ids and token_type_ids shapes
-        2. Image processing produces expected output dimensions
-        3. Video processing produces expected output dimensions
-        4. Correct counts for images (1) and videos (1)
-        """
-        IMAGE_PLACEHOLDER = "<|image_pad|>"
-        VIDEO_PLACEHOLDER = "<|video_pad|>"
-        prompt = {
-            "request_id": "12345",
-            "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
-            "multimodal_data": {
-                "image": [mock_pil_image(10, 2100)],
-                "video": [{"video": b"123", "fps": 5}],
-            },
-        }
-
-        request = Request.from_dict(prompt)
-        result = self.processor.process_request_dict(request, 1024 * 100)
-
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
-        self.assertEqual(
-            result.multimodal_inputs["images"].shape[0],
-            sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
-        )
-        self.assertEqual(
-            result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
-        )
-
-    def test_message_and_prompt(self):
-        """
-        Test consistency between message-based and prompt-based processing
-
-        Validates that processing a request through:
-        1. The message format (with image/video URLs)
-        2. The prompt format (with direct image/video data)
-        produces identical tokenization and multimodal input results.
-
-        Checks:
-        1. Prompt token IDs match between both processing methods
-        2. Grid dimensions (THW) match between both methods
-        3. Position IDs match between both methods
-        """
-        # Create test request in message format
-        request = {
-            "request_id": "12345",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                        {"type": "text", "text": "Describe image and video."},
-                    ],
-                }
-            ],
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024 * 100)
-
-        # Create equivalent request in prompt format
-        prompt = {
-            "request_id": "12345",
-            "prompt": request.prompt_tokens,
-            "multimodal_data": {
-                "image": [mock_pil_image(480, 640)],
-                "video": [{"video": b"123"}],
-            },
-        }
-        request2 = Request.from_dict(prompt)
-        result2 = self.processor.process_request_dict(request2, 1024 * 100)
-
-        # Verify both processing methods produce identical results
-        self.assertEqual(result.prompt_token_ids, result2.prompt_token_ids)
-        self.assertTrue(np.equal(result.multimodal_inputs["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
-        self.assertTrue(
-            np.equal(result.multimodal_inputs["position_ids"], result2.multimodal_inputs["position_ids"]).all()
-        )
-
-    def test_apply_chat_template(self):
-        """
-        Test the consistency between:
-        1. Directly applying chat template using HuggingFace tokenizer
-        2. Applying chat template through the processor's request processing
-
-        This test verifies that:
-        - The processor correctly handles multimodal messages (image, video, text)
-        - The prompt_tokens field matches the output from direct tokenizer application
-        - The chat template application preserves the message structure and content
-
-        Test Steps:
-        1. Create sample multimodal messages with image, video and text content
-        2. Apply chat template directly using the tokenizer
-        3. Process the same messages through the processor
-        4. Compare the outputs to ensure consistency
-        """
-        from transformers import AutoTokenizer
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
-
-        # Sample multimodal messages containing image, video and text
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                    {"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
-                    {"type": "text", "text": "Describe image and video."},
-                ],
-            }
-        ]
-
-        # Apply chat template directly using the tokenizer
-        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-        # Create equivalent request dictionary
-        request = {
-            "request_id": "12345",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                        {"type": "text", "text": "Describe image and video."},
-                    ],
-                }
-            ],
-        }
-        request = Request.from_dict(request)
-
-        # Process request through the processor
-        self.processor.process_request_dict(request, 1024 * 100)
-        prompt2 = request.prompt_tokens
-
-        # Verify both methods produce identical prompt strings
-        self.assertEqual(prompt, prompt2)
-
-    def test_add_processed_image(self):
-        """
-        Test DataProcessor._add_processed_image via Qwen3VLProcessor
-        """
-        merge_size = self.processor.processor.image_processor.merge_size
-
-        # shape[0] must be divisible by merge_size^2
-        num_tokens = 4
-        img = np.zeros(
-            (num_tokens * merge_size * merge_size, 3, 3),
-            dtype=np.float32,
-        )
-        meta = {
-            "thw": (1, 8, 8),
-        }
-        uuid = "test-image-uuid"
-
-        img_cache = (img, meta)
-
-        outputs = {
-            "mm_positions": [],
-            "input_ids": [],
-            "token_type_ids": [],
-            "position_ids": [],
-            "cur_position": 5,
-            "images": [],
-            "mm_hashes": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "fps": [],
-        }
-
-        # -----------------------
-        # mock vision position computation
-        # -----------------------
-        dp = self.processor.processor
-        dp._compute_vision_positions = MagicMock(return_value=np.array([[10, 11, 12]], dtype=np.int64))
-
-        dp._add_processed_image(img_cache, outputs, uuid)
-
-        # ---- input_ids / token_type_ids ----
-        self.assertEqual(len(outputs["input_ids"]), num_tokens)
-        self.assertEqual(
-            outputs["input_ids"],
-            [dp.image_token_id] * num_tokens,
-        )
-
-        # ---- mm_positions ----
-        self.assertEqual(len(outputs["mm_positions"]), 1)
-        mm_pos = outputs["mm_positions"][0]
-        self.assertEqual(mm_pos.length, num_tokens)
-
-        # ---- vision positions ----
-        dp._compute_vision_positions.assert_called_once_with(5, 1, 8, 8, 0)
-        np.testing.assert_array_equal(
-            outputs["position_ids"][0],
-            np.array([[10, 11, 12]], dtype=np.int64),
-        )
-        self.assertEqual(outputs["cur_position"], 13)
-
-        # ---- image payload ----
-        self.assertEqual(len(outputs["images"]), 1)
-        np.testing.assert_array_equal(outputs["images"][0], img)
-
-        self.assertEqual(outputs["mm_hashes"], [uuid])
-        np.testing.assert_array_equal(
-            outputs["grid_thw"][0],
-            np.array([[1, 8, 8]]),
-        )
-        self.assertEqual(outputs["image_type_ids"], [0])
-        self.assertEqual(outputs["fps"], [0])
-
-    def test_multimodal_token_len_validation(self):
-        """Test token_len validation for raw and processed multimodal paths"""
-        dp = self.processor.processor
-        merge_size = dp.image_processor.merge_size
-
-        def build_outputs(image=False, video=False):
-            outputs = {
-                "mm_positions": [],
-                "input_ids": [],
-                "token_type_ids": [],
-                "position_ids": [],
-                "cur_position": 0,
-                "images": [],
-                "mm_hashes": [],
-                "grid_thw": [],
-                "image_type_ids": [],
-                "fps": [],
-            }
-            if image:
-                outputs["num_input_image_tokens"] = 0
-            if video:
-                outputs["num_input_video_tokens"] = 0
-            return outputs
-
-        processed_image = (
-            np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
-            {"thw": (1, 8, 8)},
-        )
-        processed_video = (
-            np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
-            {"thw": (2, 8, 8), "fps": 5},
-        )
-
-        with self.subTest("add_image"):
-            with patch.object(
-                dp.image_processor,
-                "preprocess",
-                return_value={
-                    "grid_thw": np.array([1, merge_size * 2, merge_size * 2]),
-                    "pixel_values": np.zeros((1, 3, 3), dtype=np.float32),
-                },
-            ):
-                with self.assertRaisesRegex(ValueError, "image tokens num not match the size"):
-                    dp._add_image(mock_pil_image(32, 32), build_outputs(image=True), None, token_len=3)
-
-        with self.subTest("add_processed_image"):
-            with self.assertRaisesRegex(ValueError, "image tokens num not match the size"):
-                dp._add_processed_image(processed_image, build_outputs(), "uuid", token_len=2)
-
-        with self.subTest("add_video"):
-            with patch.object(
-                dp.image_processor,
-                "preprocess",
-                return_value={
-                    "grid_thw": np.array([1, merge_size * 2, merge_size * 2]),
-                    "pixel_values": np.zeros((1, 3, 3), dtype=np.float32),
-                },
-            ):
-                with self.assertRaisesRegex(ValueError, "video tokens num not match the size"):
-                    dp._add_video(
-                        np.zeros((2, 4, 4, 3), dtype=np.uint8),
-                        {"fps": 4},
-                        build_outputs(video=True),
-                        None,
-                        token_len=3,
-                    )
-
-        with self.subTest("add_processed_video"):
-            with self.assertRaisesRegex(ValueError, "video tokens num not match the size"):
-                dp._add_processed_video(processed_video, build_outputs(), "uuid", token_len=2)
-
-    def test_prompt_token_ids2outputs_error_branches(self):
-        """Test prompt_token_ids2outputs error branches with minimal fixtures"""
-        dp = self.processor.processor
-        request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "prompt_token_ids": [dp.image_token_id],
-                "messages": [{"role": "user", "content": [{"type": "image_url", "uuid": "missing-image"}]}],
-            }
-        )
-        parsed_messages = [{"role": "user", "content": {"type": "image", "data": None, "uuid": "missing-image"}}]
-
-        with self.subTest("missing_without_cache"):
-            with patch(
-                "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
-            ):
-                with self.assertRaisesRegex(ValueError, "Missing items cannot be retrieved without processor cache."):
-                    dp.prompt_token_ids2outputs(request)
-
-        with self.subTest("missing_cache_item_not_found"):
-            old_enable_processor_cache = dp.enable_processor_cache
-            dp.enable_processor_cache = True
-            fake_context = MagicMock()
-            fake_context.socket.return_value = MagicMock()
-            try:
-                with patch(
-                    "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
-                ):
-                    with patch(
-                        "fastdeploy.input.v1.qwen3_vl_processor.process.zmq.Context", return_value=fake_context
-                    ):
-                        with patch.object(dp, "get_processor_cache", return_value=[None]):
-                            with self.assertRaisesRegex(ValueError, "Missing item 0 not found in processor cache"):
-                                dp.prompt_token_ids2outputs(request)
-            finally:
-                dp.enable_processor_cache = old_enable_processor_cache
-
-        with self.subTest("unexpected_multimodal_type"):
-
-            class FlakyTypeItem:
-                def __init__(self):
-                    self.calls = 0
-
-                def get(self, key, default=None):
-                    if key == "type":
-                        self.calls += 1
-                        return "image" if self.calls == 1 else "audio"
-                    if key == "data":
-                        return "bad-data"
-                    if key == "uuid":
-                        return "bad-uuid"
-                    return default
-
-            parsed_messages = [{"role": "user", "content": FlakyTypeItem()}]
-            with patch(
-                "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
-            ):
-                with self.assertRaisesRegex(ValueError, "Unsupported multimodal type: audio"):
-                    dp.prompt_token_ids2outputs(request)
-
-    def test_prompt_token_ids2outputs_cache_update_paths(self):
-        """Test prompt_token_ids2outputs cache update for missing, 1D and 2D grid_thw paths"""
-        dp = self.processor.processor
-        merge_size = dp.image_processor.merge_size
-        old_enable_processor_cache = dp.enable_processor_cache
-        dp.enable_processor_cache = True
-
-        missing_image = (
-            np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
-            {"thw": (1, 8, 8)},
-        )
-        processed_video = (
-            np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
-            {"thw": (2, 8, 8), "fps": 6},
-        )
-        parsed_messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "data": None, "uuid": "missing-image"},
-                    {"type": "video", "data": {"video": "raw-video", "fps": 4}, "uuid": "raw-video"},
-                    {"type": "video", "data": processed_video, "uuid": "processed-video"},
-                ],
-            }
-        ]
-        request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "prompt_token_ids": [dp.image_token_id, 99, dp.image_token_id, 98, dp.image_token_id],
-                "messages": [{"role": "user", "content": [{"type": "text", "text": "unused"}]}],
-            }
-        )
-        fake_socket = MagicMock()
-        fake_context = MagicMock()
-        fake_context.socket.return_value = fake_socket
-
-        try:
-            with patch(
-                "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
-            ):
-                with patch("fastdeploy.input.v1.qwen3_vl_processor.process.zmq.Context", return_value=fake_context):
-                    with patch.object(dp, "_compute_vision_positions", return_value=np.array([[0]], dtype=np.int64)):
-                        with patch.object(
-                            dp.image_processor,
-                            "preprocess",
-                            return_value={
-                                "grid_thw": np.array([1, merge_size, merge_size]),
-                                "pixel_values": np.zeros((1, 3, 3), dtype=np.float32),
-                            },
-                        ):
-                            with patch.object(
-                                dp, "_load_and_process_video", return_value=mock_read_frames(4, 4, 2, 4)
-                            ):
-                                with patch.object(
-                                    dp, "get_processor_cache", return_value=[missing_image]
-                                ) as cache_get:
-                                    with patch.object(dp, "update_processor_cache") as cache_update:
-                                        outputs = dp.prompt_token_ids2outputs(request)
-
-            cache_get.assert_called_once_with(fake_socket, ["missing-image"])
-            cache_update.assert_called_once()
-            _, cached_hashes, cached_items = cache_update.call_args.args
-            self.assertEqual(cached_hashes, ["raw-video", "processed-video"])
-            self.assertEqual(cached_items[0][1]["thw"], (1, merge_size, merge_size))
-            self.assertEqual(cached_items[1][1]["thw"], (2, 8, 8))
-            self.assertEqual(outputs["mm_hashes"], ["missing-image", "raw-video", "processed-video"])
-            self.assertEqual(outputs["input_ids"][-1], dp.image_token_id)
-        finally:
-            dp.enable_processor_cache = old_enable_processor_cache
-
-    def test_request2ids_cache_update_paths(self):
-        """Test request2ids cache update for missing, 1D and 2D grid_thw paths"""
-        dp = self.processor.processor
-        merge_size = dp.image_processor.merge_size
-        old_enable_processor_cache = dp.enable_processor_cache
-        dp.enable_processor_cache = True
-
-        missing_image = (
-            np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
-            {"thw": (1, 8, 8)},
-        )
-        processed_image = (
-            np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
-            {"thw": (1, 8, 8)},
-        )
-        parsed_messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "data": None, "uuid": "missing-image"},
-                    {"type": "image", "data": processed_image, "uuid": "processed-image"},
-                    {"type": "video", "data": {"video": "raw-video", "fps": 4}, "uuid": "raw-video"},
-                ],
-            }
-        ]
-        request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "messages": [{"role": "user", "content": [{"type": "text", "text": "unused"}]}],
-                "add_generation_prompt": True,
-            }
-        )
-        fake_socket = MagicMock()
-        fake_context = MagicMock()
-        fake_context.socket.return_value = fake_socket
-
-        try:
-            with patch(
-                "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
-            ):
-                with patch("fastdeploy.input.v1.qwen3_vl_processor.process.zmq.Context", return_value=fake_context):
-                    with patch.object(dp, "_compute_vision_positions", return_value=np.array([[0]], dtype=np.int64)):
-                        with patch.object(
-                            dp.image_processor,
-                            "preprocess",
-                            return_value={
-                                "grid_thw": np.array([1, merge_size, merge_size]),
-                                "pixel_values": np.zeros((1, 3, 3), dtype=np.float32),
-                            },
-                        ):
-                            with patch.object(
-                                dp, "_load_and_process_video", return_value=mock_read_frames(4, 4, 2, 4)
-                            ):
-                                with patch.object(
-                                    dp, "get_processor_cache", return_value=[missing_image]
-                                ) as cache_get:
-                                    with patch.object(dp, "update_processor_cache") as cache_update:
-                                        with patch.object(
-                                            self.processor.tokenizer,
-                                            "apply_chat_template",
-                                            return_value="<|image_pad|>a<|image_pad|>b<|video_pad|>",
-                                        ):
-                                            outputs = dp.request2ids(request)
-
-            cache_get.assert_called_once_with(fake_socket, ["missing-image"])
-            cache_update.assert_called_once()
-            _, cached_hashes, cached_items = cache_update.call_args.args
-            self.assertEqual(cached_hashes, ["processed-image", "raw-video"])
-            self.assertEqual(cached_items[0][1]["thw"], (1, 8, 8))
-            self.assertEqual(cached_items[1][1]["thw"], (1, merge_size, merge_size))
-            self.assertEqual(outputs["mm_hashes"], ["missing-image", "processed-image", "raw-video"])
-        finally:
-            dp.enable_processor_cache = old_enable_processor_cache
-
-    def test_parse_processor_kwargs_valid(self):
-        """Test _parse_processor_kwargs with valid input"""
-        valid_kwargs = {"video_max_frames": 10, "video_min_frames": 1}
-        result = self.processor._parse_processor_kwargs(valid_kwargs)
-        self.assertEqual(result, valid_kwargs)
-
-    def test_parse_processor_kwargs_empty(self):
-        """Test _parse_processor_kwargs with empty input"""
-        result = self.processor._parse_processor_kwargs(None)
-        self.assertEqual(result, {})
-
-    def test_parse_processor_kwargs_invalid_type(self):
-        """Test _parse_processor_kwargs with invalid type"""
-        result = self.processor._parse_processor_kwargs("invalid")
-        self.assertEqual(result, {})
-
-    def test_parse_processor_kwargs_invalid_value_type(self):
-        """Test _parse_processor_kwargs with invalid value type"""
-        invalid_kwargs = {"video_max_frames": "10"}  # Should be int
-        result = self.processor._parse_processor_kwargs(invalid_kwargs)
-        self.assertEqual(result, {})
-
-    def test_parse_processor_kwargs_mixed_valid_invalid(self):
-        """Test _parse_processor_kwargs with mixed valid and invalid types"""
-        mixed_kwargs = {"video_max_frames": 10, "video_min_frames": "invalid"}
-        result = self.processor._parse_processor_kwargs(mixed_kwargs)
-        self.assertEqual(result, {})
-
-    def test_parse_limits_valid(self):
-        """Test _parse_limits with valid limits"""
-        limits = {"image": 2, "video": 3}
-        result = self.processor._parse_limits(limits)
-        expected = {"image": 2, "video": 3, "audio": 1}
-        self.assertEqual(result, expected)
-
-    def test_parse_limits_empty(self):
-        """Test _parse_limits with empty input"""
-        result = self.processor._parse_limits(None)
-        expected = {"image": 1, "video": 1, "audio": 1}
-        self.assertEqual(result, expected)
-
-    def test_parse_limits_invalid_type(self):
-        """Test _parse_limits with invalid type"""
-        result = self.processor._parse_limits("invalid")
-        expected = {"image": 1, "video": 1, "audio": 1}
-        self.assertEqual(result, expected)
-
-    def test_parse_limits_partial(self):
-        """Test _parse_limits with partial limits"""
-        limits = {"image": 5}
-        result = self.processor._parse_limits(limits)
-        expected = {"image": 5, "video": 1, "audio": 1}
-        self.assertEqual(result, expected)
-
-    def test_check_mm_limits_dict_valid(self):
-        """Test _check_mm_limits with valid dict input"""
-        mm_data = {"image": [mock_pil_image(10, 10)], "video": [{"video": b"123"}]}
-        # Should not raise exception
-        self.processor._check_mm_limits(mm_data)
-
-    def test_check_mm_limits_dict_exceed_limit(self):
-        """Test _check_mm_limits when dict input exceeds limit"""
-        mm_data = {"image": [mock_pil_image(10, 10), mock_pil_image(10, 10)]}
-        with self.assertRaises(ValueError) as context:
-            self.processor._check_mm_limits(mm_data)
-        self.assertIn("Too many image items", str(context.exception))
-
-    def test_check_mm_limits_messages_valid(self):
-        """Test _check_mm_limits with valid messages input"""
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                    {"type": "text", "text": "Describe this image."},
-                ],
-            }
-        ]
-        # Should not raise exception
-        self.processor._check_mm_limits(messages)
-
-    def test_check_mm_limits_messages_exceed_limit(self):
-        """Test _check_mm_limits when messages input exceeds limit"""
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": "file://demo1.jpeg"}},
-                    {"type": "image_url", "image_url": {"url": "file://demo2.jpeg"}},
-                ],
-            }
-        ]
-        with self.assertRaises(ValueError) as context:
-            self.processor._check_mm_limits(messages)
-        self.assertIn("Too many image items", str(context.exception))
-
-    def test_check_mm_limits_video_exceed(self):
-        """Test _check_mm_limits when video exceeds limit"""
-        mm_data = {"video": [{"video": b"123"}, {"video": b"456"}]}
-        with self.assertRaises(ValueError) as context:
-            self.processor._check_mm_limits(mm_data)
-        self.assertIn("Too many video items", str(context.exception))
-
-    def test_process_request_dict_with_prompt(self):
-        """Test process_request_dict with prompt format"""
-        request = {
-            "request_id": "12345",
-            "prompt": "Test prompt",
-            "multimodal_data": {"image": [mock_pil_image(10, 10)]},
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        self.assertGreater(len(result.prompt_token_ids), 0)
-        self.assertGreater(len(result.multimodal_inputs), 0)
-
-    def test_process_request_dict_with_messages(self):
-        """Test process_request_dict with messages format"""
-        request = {
-            "request_id": "12345",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [{"type": "text", "text": "Hello"}],
-                }
-            ],
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        self.assertGreater(len(result.prompt_token_ids), 0)
-        self.assertGreater(len(result.multimodal_inputs), 0)
-
-    def test_process_request_dict_with_prompt_token_ids_only(self):
-        """Test process_request_dict with prompt_token_ids only"""
-        request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "prompt_token_ids": [1, 2, 3],
-            }
-        )
-        result = self.processor.process_request_dict(request, 1024)
-
-        self.assertEqual(result.prompt_token_ids, [1, 2, 3])
-        self.assertEqual(result.prompt_token_ids_len, 3)
-        self.assertIsNone(result.multimodal_inputs["images"])
-        self.assertEqual(result.multimodal_inputs["token_type_ids"].tolist(), [0, 0, 0])
-
-    def test_process_request_dict_with_prompt_token_ids_and_messages(self):
-        """Test process_request_dict with prompt_token_ids and multimodal messages"""
-        source_request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                            {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                            {"type": "text", "text": "Describe image and video."},
-                        ],
-                    }
-                ],
-            }
-        )
-        source_result = self.processor.process_request_dict(source_request, 1024 * 100)
-
-        token_request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "prompt_token_ids": list(source_result.prompt_token_ids),
-                "messages": copy.deepcopy(source_request.messages),
-            }
-        )
-        token_result = self.processor.process_request_dict(token_request, 1024 * 100)
-
-        self.assertEqual(token_result.prompt_token_ids, source_result.prompt_token_ids)
-        self.assertTrue(
-            np.equal(token_result.multimodal_inputs["grid_thw"], source_result.multimodal_inputs["grid_thw"]).all()
-        )
-        self.assertTrue(
-            np.equal(
-                token_result.multimodal_inputs["position_ids"],
-                source_result.multimodal_inputs["position_ids"],
-            ).all()
-        )
-        self.assertTrue(
-            np.equal(
-                token_result.multimodal_inputs["image_type_ids"],
-                source_result.multimodal_inputs["image_type_ids"],
-            ).all()
-        )
-
-    def test_process_request_dict_prompt_token_ids_more_multimodal_segments_than_messages(self):
-        """Test prompt_token_ids path when token-side multimodal segments exceed messages"""
-        source_request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                            {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                            {"type": "text", "text": "Describe image and video."},
-                        ],
-                    }
-                ],
-            }
-        )
-        source_result = self.processor.process_request_dict(source_request, 1024 * 100)
-
-        token_request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "prompt_token_ids": list(source_result.prompt_token_ids),
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                            {"type": "text", "text": "Describe image and video."},
-                        ],
-                    }
-                ],
-            }
-        )
-
-        with self.assertRaisesRegex(ValueError, "more multimodal placeholder"):
-            self.processor.process_request_dict(token_request, 1024 * 100)
-
-    def test_process_request_dict_prompt_token_ids_unused_multimodal_messages(self):
-        """Test prompt_token_ids path when messages have unused multimodal items"""
-        source_request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                            {"type": "text", "text": "Describe image."},
-                        ],
-                    }
-                ],
-            }
-        )
-        source_result = self.processor.process_request_dict(source_request, 1024 * 100)
-
-        token_request = Request.from_dict(
-            {
-                "request_id": "12345",
-                "prompt_token_ids": list(source_result.prompt_token_ids),
-                "messages": [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                            {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                            {"type": "text", "text": "Describe image."},
-                        ],
-                    }
-                ],
-            }
-        )
-
-        with self.assertRaisesRegex(ValueError, "number of multimodal items does not match"):
-            self.processor.process_request_dict(token_request, 1024 * 100)
-
-    def test_process_request_dict_invalid_format(self):
-        """Test process_request_dict with invalid format"""
-        request = {"request_id": "12345"}
-        request = Request.from_dict(request)
-        with self.assertRaises(ValueError) as context:
-            self.processor.process_request_dict(request, 1024)
-        self.assertIn("must contain 'prompt', or 'messages'", str(context.exception))
-
-    def test_process_request_dict_with_bad_words(self):
-        """Test process_request_dict with bad_words"""
-        request = {
-            "request_id": "12345",
-            "prompt": "Test prompt",
-            "bad_words": ["bad", "word"],
-            "bad_words_token_ids": [100, 200],
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        # Verify bad_words_token_ids is set
-        self.assertIsNotNone(result.sampling_params.bad_words_token_ids)
-
-    def test_process_request_dict_invalid_chat_template_kwargs(self):
-        """Test process_request_dict with invalid chat_template_kwargs"""
-        request = {
-            "request_id": "12345",
-            "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}],
-        }
-        request = Request.from_dict(request)
-        request.chat_template_kwargs = "invalid"
-        with self.assertRaises(ValueError) as context:
-            self.processor.process_request_dict(request, 1024)
-        self.assertIn("must be a dict", str(context.exception))
-
-    def test_process_request_dict_with_completion_token_ids(self):
-        """Test process_request_dict with completion_token_ids"""
-        request = {"request_id": "12345", "prompt": "Test"}
-        request = Request.from_dict(request)
-        request.completion_token_ids = [1, 2, 3]
-        result = self.processor.process_request_dict(request, 1024)
-        # Verify completion tokens are appended
-        self.assertGreater(len(result.prompt_token_ids), 3)
-
-    def test_process_request_dict_prompt_truncation(self):
-        """Test process_request_dict with prompt truncation"""
-        # Create a long prompt that exceeds max_model_len
-        long_prompt = "Test " * 1000
-        request = {
-            "request_id": "12345",
-            "prompt": long_prompt,
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 100)
-        # Verify prompt is truncated
-        self.assertLessEqual(len(result.prompt_token_ids), 99)
-
-    def test_process_request_dict_default_max_tokens(self):
-        """Test process_request_dict sets default max_tokens"""
-        request = {
-            "request_id": "12345",
-            "prompt": "Test",
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        self.assertGreater(result.sampling_params.max_tokens, 0)
-
-    def test_process_request_dict_enable_thinking_false(self):
-        """Test process_request_dict sets enable_thinking to False"""
-        request = {
-            "request_id": "12345",
-            "prompt": "Test",
-            "enable_thinking": True,
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        self.assertFalse(result.enable_thinking)
-
-    def test_append_completion_tokens(self):
-        """Test append_completion_tokens method"""
-        multimodal_inputs = {
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
-            "cur_position": 3,
-        }
-        completion_token_ids = [4, 5]
-        self.processor.append_completion_tokens(multimodal_inputs, completion_token_ids)
-
-        self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 4, 5])
-        self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0])
-        self.assertEqual(multimodal_inputs["cur_position"], 5)
-
-    def test_pack_outputs_with_images(self):
-        """Test pack_outputs with image data"""
-        outputs = {
-            "images": [np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])],
-            "grid_thw": [np.array([2, 2, 1]), np.array([2, 2, 1])],
-            "image_type_ids": [0, 1],
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
-        }
-        result = self.processor.pack_outputs(outputs)
-
-        self.assertIsNotNone(result["images"])
-        self.assertIsNotNone(result["grid_thw"])
-        self.assertIsNotNone(result["image_type_ids"])
-        self.assertEqual(result["images"].shape[0], 4)
-        self.assertEqual(result["grid_thw"].shape[0], 2)
-
-    def test_pack_outputs_without_images(self):
-        """Test pack_outputs without image data"""
-        outputs = {
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
-        }
-        result = self.processor.pack_outputs(outputs)
-
-        # Test that image-related fields are None when no images
-        self.assertIsNone(result["images"])
-        self.assertIsNone(result["grid_thw"])
-        self.assertIsNone(result["image_type_ids"])
-
-        # Test data types
-        self.assertEqual(result["input_ids"].dtype, np.int64)
-        self.assertEqual(result["token_type_ids"].dtype, np.int64)
-        self.assertEqual(result["position_ids"].dtype, np.int64)
-
-        # Test patch IDs are set
-        self.assertIn("image_patch_id", result)
-        self.assertIn("video_patch_id", result)
-        self.assertIn("mm_num_token_func", result)
-
-
-class TestSampleFrames(unittest.TestCase):
-    """
-    Unit tests for sample_frames function
-    """
-
-    def setUp(self):
-        self.metadata = {
-            "num_of_frame": 100,
-            "fps": 25,
-        }
-
-    def test_fps_and_num_frames_mutually_exclusive(self):
-        with self.assertRaises(ValueError):
-            sample_frames(
-                frame_factor=4,
-                min_frames=8,
-                max_frames=32,
-                metadata=self.metadata,
-                fps=2,
-                num_frames=16,
-            )
-
-    def test_num_frames_round_to_factor(self):
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=8,
-            max_frames=64,
-            metadata=self.metadata,
-            num_frames=18,  # round(18 / 4) * 4 = 16
-        )
-
-        self.assertEqual(len(indices), 16)
-        self.assertEqual(indices[0], 0)
-        self.assertLess(indices[-1], self.metadata["num_of_frame"])
-
-    def test_fps_sampling_basic(self):
-        # total = 100 frames, fps=25, target fps=5 → 20 frames
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=8,
-            max_frames=64,
-            metadata=self.metadata,
-            fps=5,
-        )
-
-        self.assertEqual(len(indices), 20)
-        self.assertEqual(indices.dtype, np.int32)
-        self.assertEqual(indices[0], 0)
-
-    def test_fps_respects_min_frames(self):
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=24,
-            max_frames=64,
-            metadata=self.metadata,
-            fps=1,  # very small fps
-        )
-
-        self.assertEqual(len(indices), 24)
-
-    def test_num_frames_exceeds_total_raises(self):
-        with self.assertRaises(ValueError):
-            sample_frames(
-                frame_factor=4,
-                min_frames=8,
-                max_frames=200,
-                metadata=self.metadata,
-                num_frames=200,
-            )
-
-    def test_force_multiple_of_4_hack(self):
-        indices = sample_frames(
-            frame_factor=2,
-            min_frames=2,
-            max_frames=100,
-            metadata=self.metadata,
-            num_frames=10,  # 10 % 4 != 0 → hack → 8
-        )
-
-        self.assertEqual(len(indices), 8)
-        self.assertEqual(len(indices) % 4, 0)
-
-    def test_keep_all_frames_when_num_frames_zero(self):
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=0,
-            max_frames=100,
-            metadata=self.metadata,
-            num_frames=0,
-        )
-
-        self.assertEqual(len(indices), self.metadata["num_of_frame"])
-        np.testing.assert_array_equal(indices, np.arange(0, 100, dtype=np.int32))
-
-    def test_indices_evenly_spaced(self):
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=8,
-            max_frames=32,
-            metadata=self.metadata,
-            num_frames=16,
-        )
-
-        diffs = np.diff(indices)
-        self.assertTrue(np.all(diffs > 0))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/input/v1/test_qwen_vl_processor.py b/tests/input/v1/test_qwen_vl_processor.py
deleted file mode 100644
index 5de7df1da39..00000000000
--- a/tests/input/v1/test_qwen_vl_processor.py
+++ /dev/null
@@ -1,776 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.qwen_vl_processor import QwenVLProcessor
-from fastdeploy.input.v1.qwen_vl_processor.process_video import sample_frames
-
-
-def mock_pil_image(height, width):
-    """
-    Generate mock random RGB image
-
-    Args:
-        height: Image height in pixels
-        width: Image width in pixels
-
-    Returns:
-        PIL.Image object with random RGB data
-    """
-    rgb_image = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
-    return Image.fromarray(rgb_image)
-
-
-def mock_read_frames(height: int, width: int, nums_frame: int, fps: int):
-    """
-    Generate mock video frames with metadata for testing purposes
-
-    Creates synthetic video data by generating random RGB frames and constructing
-    corresponding metadata to simulate real video processing.
-
-    Args:
-        height (int): Height of video frames in pixels
-        width (int): Width of video frames in pixels
-        nums_frame (int): Number of frames to generate
-        fps (int): Frames per second for the mock video
-
-    Returns:
-        tuple: A tuple containing:
-            frames (numpy.ndarray): Array of shape (nums_frame, height, width, 3)
-                containing randomly generated RGB frames
-            meta (dict): Dictionary with video metadata:
-                - fps (int): Frames per second (same as input)
-                - duration (float): Calculated duration in seconds (nums_frame/fps)
-                - num_of_frame (int): Number of frames (same as nums_frame input)
-    """
-    frames = []
-    for _ in range(nums_frame):
-        frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
-        frames.append(frame)
-    frames = np.stack(frames, axis=0)
-
-    meta = {
-        "fps": fps,
-        "duration": nums_frame / fps,
-        "num_of_frame": nums_frame,
-    }
-    return frames, meta
-
-
-class TestQwenVLProcessor(unittest.TestCase):
-    """
-    Unit tests for Qwen Vision-Language Processor functionality
-    """
-
-    def setUp(self):
-        """
-        Initialize test case with:
-        - Mock configuration
-        - Patched message parsing and video processing methods
-        - QwenVLProcessor instance with test parameters
-        """
-        config = MagicMock()
-        config.vision_config.tokens_per_second = 2
-
-        self.patcher_parse_image = patch(
-            "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_image", return_value=mock_pil_image(480, 640)
-        )
-        self.patcher_parse_image.start()
-
-        self.patcher_parse_video = patch(
-            "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_video", return_value=b"123"
-        )
-        self.patcher_parse_video.start()
-
-        self.patcher_read_frames = patch(
-            "fastdeploy.input.v1.qwen_vl_processor.process.DataProcessor._load_and_process_video",
-            return_value=mock_read_frames(480, 640, 5, 2),
-        )
-        self.patcher_read_frames.start()
-
-        mm_processor_kwargs = {
-            "video_max_frames": 10,
-            "video_min_frames": 1,
-        }
-        limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
-
-        self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
-        self.processor = QwenVLProcessor(
-            config=config,
-            model_name_or_path=self.model_name_or_path,
-            limit_mm_per_prompt=limit_mm_per_prompt,
-            mm_processor_kwargs=mm_processor_kwargs,
-            reasoning_parser_obj=None,
-            tool_parser_obj=None,
-        )
-
-    def tearDown(self) -> None:
-        """Clean up test case by stopping all mock patches"""
-        self.patcher_read_frames.stop()
-        self.patcher_parse_image.stop()
-        self.patcher_parse_video.stop()
-
-    def test_process_request(self):
-        """
-        Test processing of Request object with multimodal input
-
-        Validates:
-        1. Token ID lengths match position_ids and token_type_ids shapes
-        2. Image processing produces expected output dimensions
-        3. Video processing produces expected output dimensions
-        4. Correct counts for images (1) and videos (1)
-        """
-        message = {
-            "request_id": "12345",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                        {"type": "text", "text": "Describe image and video."},
-                    ],
-                }
-            ],
-        }
-
-        request = Request.from_dict(message)
-        result = self.processor.process_request_dict(request, 1024 * 100)
-
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
-        self.assertEqual(
-            result.multimodal_inputs["images"].shape[0],
-            sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
-        )
-        self.assertEqual(
-            result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
-        )
-
-    def test_process_request_dict(self):
-        """
-        Test processing of dictionary-format request with multimodal input
-
-        Validates:
-        1. Token ID lengths match position_ids and token_type_ids shapes
-        2. Image processing produces expected output dimensions
-        3. Video processing produces expected output dimensions
-        4. Correct counts for images (1) and videos (1)
-        """
-        num_completion_token_ids = 10
-        request = {
-            "request_id": "12345",
-            "completion_token_ids": [1] * num_completion_token_ids,
-            "stop": ["stop", "eof"],
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                        {"type": "text", "text": "Describe image and video."},
-                    ],
-                }
-            ],
-        }
-        request = Request.from_dict(request)
-
-        result = self.processor.process_request_dict(request, 1024 * 100)
-
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
-        self.assertEqual(
-            result.multimodal_inputs["images"].shape[0],
-            sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
-        )
-        self.assertEqual(
-            result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
-        )
-
-    def test_process_request_dict_enable_thinking(self):
-        num_completion_token_ids = 10
-        request = {
-            "request_id": "12345",
-            "completion_token_ids": [1] * num_completion_token_ids,
-            "stop": ["stop", "eof"],
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "Hello"},
-                    ],
-                }
-            ],
-            "chat_template_kwargs": {"enable_thinking": True},
-        }
-        request = Request.from_dict(request)
-
-        result = self.processor.process_request_dict(request, 100)
-        self.assertEqual(result.enable_thinking, False)
-
-    def test_prompt(self):
-        """
-        Test processing of prompt with image and video placeholders
-
-        Validates:
-        1. Token ID lengths match position_ids and token_type_ids shapes
-        2. Image processing produces expected output dimensions
-        3. Video processing produces expected output dimensions
-        4. Correct counts for images (1) and videos (1)
-        """
-        IMAGE_PLACEHOLDER = "<|image_pad|>"
-        VIDEO_PLACEHOLDER = "<|video_pad|>"
-        prompt = {
-            "request_id": "12345",
-            "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
-            "multimodal_data": {
-                "image": [mock_pil_image(10, 2100)],
-                "video": [{"video": b"123", "fps": 5}],
-            },
-        }
-
-        request = Request.from_dict(prompt)
-        result = self.processor.process_request_dict(request, 1024 * 100)
-
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
-        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
-        self.assertEqual(
-            result.multimodal_inputs["images"].shape[0],
-            sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
-        )
-        self.assertEqual(
-            result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
-        )
-
-    def test_message_and_prompt(self):
-        """
-        Test consistency between message-based and prompt-based processing
-
-        Validates that processing a request through:
-        1. The message format (with image/video URLs)
-        2. The prompt format (with direct image/video data)
-        produces identical tokenization and multimodal input results.
-
-        Checks:
-        1. Prompt token IDs match between both processing methods
-        2. Grid dimensions (THW) match between both methods
-        3. Position IDs match between both methods
-        """
-        # Create test request in message format
-        request = {
-            "request_id": "12345",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                        {"type": "text", "text": "Describe image and video."},
-                    ],
-                }
-            ],
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024 * 100)
-
-        # Create equivalent request in prompt format
-        prompt = {
-            "request_id": "12345",
-            "prompt": request.prompt_tokens,
-            "multimodal_data": {
-                "image": [mock_pil_image(480, 640)],
-                "video": [{"video": b"123"}],
-            },
-        }
-        request2 = Request.from_dict(prompt)
-        result2 = self.processor.process_request_dict(request2, 1024 * 100)
-
-        # Verify both processing methods produce identical results
-        self.assertEqual(result.prompt_token_ids, result2.prompt_token_ids)
-        self.assertTrue(np.equal(result.multimodal_inputs["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
-        self.assertTrue(
-            np.equal(result.multimodal_inputs["position_ids"], result2.multimodal_inputs["position_ids"]).all()
-        )
-
-    def test_apply_chat_template(self):
-        """
-        Test the consistency between:
-        1. Directly applying chat template using HuggingFace tokenizer
-        2. Applying chat template through the processor's request processing
-
-        This test verifies that:
-        - The processor correctly handles multimodal messages (image, video, text)
-        - The prompt_tokens field matches the output from direct tokenizer application
-        - The chat template application preserves the message structure and content
-
-        Test Steps:
-        1. Create sample multimodal messages with image, video and text content
-        2. Apply chat template directly using the tokenizer
-        3. Process the same messages through the processor
-        4. Compare the outputs to ensure consistency
-        """
-        from transformers import AutoTokenizer
-
-        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
-
-        # Sample multimodal messages containing image, video and text
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                    {"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
-                    {"type": "text", "text": "Describe image and video."},
-                ],
-            }
-        ]
-
-        # Apply chat template directly using the tokenizer
-        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-        # Create equivalent request dictionary
-        request = {
-            "request_id": "12345",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
-                        {"type": "text", "text": "Describe image and video."},
-                    ],
-                }
-            ],
-        }
-        request = Request.from_dict(request)
-
-        # Process request through the processor
-        self.processor.process_request_dict(request, 1024 * 100)
-        prompt2 = request.prompt_tokens
-
-        # Verify both methods produce identical prompt strings
-        self.assertEqual(prompt, prompt2)
-
-    def test_think_status(self):
-        """测试 思考机制"""
-        request = {
-            "prompt": "hello",
-            "request_id": "test_1",
-            "prompt_token_ids": [1, 2, 3],
-            "temperature": 0.7,
-            "top_p": 0.9,
-        }
-        request = Request.from_dict(request)
-        self.processor.reasoning_parser = MagicMock()
-        self.processor.reasoning_parser.get_model_status.return_value = "think_start"
-        self.processor.model_status_dict = {}
-        self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(request.enable_thinking, True)
-
-        request = {
-            "prompt": "hello",
-            "request_id": "test",
-            "prompt_token_ids": [1, 2, 3],
-            "temperature": 0.7,
-            "top_p": 0.9,
-        }
-        request = Request.from_dict(request)
-        self.processor.process_request_dict(request, max_model_len=512)
-        self.assertEqual(request.enable_thinking, True)
-
-    def test_parse_processor_kwargs_valid(self):
-        """Test _parse_processor_kwargs with valid input"""
-        valid_kwargs = {"video_max_frames": 10, "video_min_frames": 1}
-        result = self.processor._parse_processor_kwargs(valid_kwargs)
-        self.assertEqual(result, valid_kwargs)
-
-    def test_parse_processor_kwargs_empty(self):
-        """Test _parse_processor_kwargs with empty input"""
-        result = self.processor._parse_processor_kwargs(None)
-        self.assertEqual(result, {})
-
-    def test_parse_processor_kwargs_invalid_type(self):
-        """Test _parse_processor_kwargs with invalid type"""
-        result = self.processor._parse_processor_kwargs("invalid")
-        self.assertEqual(result, {})
-
-    def test_parse_processor_kwargs_invalid_value_type(self):
-        """Test _parse_processor_kwargs with invalid value type"""
-        invalid_kwargs = {"video_max_frames": "10"}  # Should be int
-        result = self.processor._parse_processor_kwargs(invalid_kwargs)
-        self.assertEqual(result, {})
-
-    def test_parse_processor_kwargs_mixed_valid_invalid(self):
-        """Test _parse_processor_kwargs with mixed valid and invalid types"""
-        mixed_kwargs = {"video_max_frames": 10, "video_min_frames": "invalid"}
-        result = self.processor._parse_processor_kwargs(mixed_kwargs)
-        self.assertEqual(result, {})
-
-    def test_parse_limits_valid(self):
-        """Test _parse_limits with valid limits"""
-        limits = {"image": 2, "video": 3}
-        result = self.processor._parse_limits(limits)
-        expected = {"image": 2, "video": 3, "audio": 1}
-        self.assertEqual(result, expected)
-
-    def test_parse_limits_empty(self):
-        """Test _parse_limits with empty input"""
-        result = self.processor._parse_limits(None)
-        expected = {"image": 1, "video": 1, "audio": 1}
-        self.assertEqual(result, expected)
-
-    def test_parse_limits_invalid_type(self):
-        """Test _parse_limits with invalid type"""
-        result = self.processor._parse_limits("invalid")
-        expected = {"image": 1, "video": 1, "audio": 1}
-        self.assertEqual(result, expected)
-
-    def test_parse_limits_partial(self):
-        """Test _parse_limits with partial limits"""
-        limits = {"image": 5}
-        result = self.processor._parse_limits(limits)
-        expected = {"image": 5, "video": 1, "audio": 1}
-        self.assertEqual(result, expected)
-
-    def test_check_mm_limits_dict_valid(self):
-        """Test _check_mm_limits with valid dict input"""
-        mm_data = {"image": [mock_pil_image(10, 10)], "video": [{"video": b"123"}]}
-        # Should not raise exception
-        self.processor._check_mm_limits(mm_data)
-
-    def test_check_mm_limits_dict_exceed_limit(self):
-        """Test _check_mm_limits when dict input exceeds limit"""
-        mm_data = {"image": [mock_pil_image(10, 10), mock_pil_image(10, 10)]}
-        with self.assertRaises(ValueError) as context:
-            self.processor._check_mm_limits(mm_data)
-        self.assertIn("Too many image items", str(context.exception))
-
-    def test_check_mm_limits_messages_valid(self):
-        """Test _check_mm_limits with valid messages input"""
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
-                    {"type": "text", "text": "Describe this image."},
-                ],
-            }
-        ]
-        # Should not raise exception
-        self.processor._check_mm_limits(messages)
-
-    def test_check_mm_limits_messages_exceed_limit(self):
-        """Test _check_mm_limits when messages input exceeds limit"""
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": "file://demo1.jpeg"}},
-                    {"type": "image_url", "image_url": {"url": "file://demo2.jpeg"}},
-                ],
-            }
-        ]
-        with self.assertRaises(ValueError) as context:
-            self.processor._check_mm_limits(messages)
-        self.assertIn("Too many image items", str(context.exception))
-
-    def test_check_mm_limits_video_exceed(self):
-        """Test _check_mm_limits when video exceeds limit"""
-        mm_data = {"video": [{"video": b"123"}, {"video": b"456"}]}
-        with self.assertRaises(ValueError) as context:
-            self.processor._check_mm_limits(mm_data)
-        self.assertIn("Too many video items", str(context.exception))
-
-    def test_process_request_dict_with_prompt(self):
-        """Test process_request_dict with prompt format"""
-        request = {
-            "request_id": "12345",
-            "prompt": "Test prompt",
-            "multimodal_data": {"image": [mock_pil_image(10, 10)]},
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        self.assertGreater(len(result.prompt_token_ids), 0)
-        self.assertGreater(len(result.multimodal_inputs), 0)
-
-    def test_process_request_dict_with_messages(self):
-        """Test process_request_dict with messages format"""
-        request = {
-            "request_id": "12345",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [{"type": "text", "text": "Hello"}],
-                }
-            ],
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        self.assertGreater(len(result.prompt_token_ids), 0)
-        self.assertGreater(len(result.multimodal_inputs), 0)
-
-    def test_process_request_dict_invalid_format(self):
-        """Test process_request_dict with invalid format"""
-        request = {"request_id": "12345"}
-        request = Request.from_dict(request)
-        with self.assertRaises(ValueError) as context:
-            self.processor.process_request_dict(request, 1024)
-        self.assertIn("must contain 'prompt', or 'messages'", str(context.exception))
-
-    def test_process_request_dict_with_bad_words(self):
-        """Test process_request_dict with bad_words"""
-        request = {
-            "request_id": "12345",
-            "prompt": "Test prompt",
-            "bad_words": ["bad", "word"],
-            "bad_words_token_ids": [100, 200],
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        # Verify bad_words_token_ids is set
-        self.assertIsNotNone(result.sampling_params.bad_words_token_ids)
-
-    def test_process_request_dict_invalid_chat_template_kwargs(self):
-        """Test process_request_dict with invalid chat_template_kwargs"""
-        request = {
-            "request_id": "12345",
-            "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}],
-        }
-        request = Request.from_dict(request)
-        request.chat_template_kwargs = "invalid"
-        with self.assertRaises(ValueError) as context:
-            self.processor.process_request_dict(request, 1024)
-        self.assertIn("must be a dict", str(context.exception))
-
-    def test_process_request_dict_with_completion_token_ids(self):
-        """Test process_request_dict with completion_token_ids"""
-        request = {"request_id": "12345", "prompt": "Test"}
-        request = Request.from_dict(request)
-        request.completion_token_ids = [1, 2, 3]
-        result = self.processor.process_request_dict(request, 1024)
-        # Verify completion tokens are appended
-        self.assertGreater(len(result.prompt_token_ids), 3)
-
-    def test_process_request_dict_prompt_truncation(self):
-        """Test process_request_dict with prompt truncation"""
-        # Create a long prompt that exceeds max_model_len
-        long_prompt = "Test " * 1000
-        request = {
-            "request_id": "12345",
-            "prompt": long_prompt,
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 100)
-        # Verify prompt is truncated
-        self.assertLessEqual(len(result.prompt_token_ids), 99)
-
-    def test_process_request_dict_default_max_tokens(self):
-        """Test process_request_dict sets default max_tokens"""
-        request = {
-            "request_id": "12345",
-            "prompt": "Test",
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        self.assertGreater(result.sampling_params.max_tokens, 0)
-
-    def test_process_request_dict_enable_thinking_false(self):
-        """Test process_request_dict sets enable_thinking to False"""
-        request = {
-            "request_id": "12345",
-            "prompt": "Test",
-            "enable_thinking": True,
-        }
-        request = Request.from_dict(request)
-        result = self.processor.process_request_dict(request, 1024)
-        self.assertFalse(result.enable_thinking)
-
-    def test_append_completion_tokens(self):
-        """Test append_completion_tokens method"""
-        multimodal_inputs = {
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
-            "cur_position": 3,
-        }
-        completion_token_ids = [4, 5]
-        self.processor.append_completion_tokens(multimodal_inputs, completion_token_ids)
-
-        self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 4, 5])
-        self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0])
-        self.assertEqual(multimodal_inputs["cur_position"], 5)
-
-    def test_pack_outputs_with_images(self):
-        """Test pack_outputs with image data"""
-        outputs = {
-            "images": [np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])],
-            "grid_thw": [np.array([2, 2, 1]), np.array([2, 2, 1])],
-            "image_type_ids": [0, 1],
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
-        }
-        result = self.processor.pack_outputs(outputs)
-
-        self.assertIsNotNone(result["images"])
-        self.assertIsNotNone(result["grid_thw"])
-        self.assertIsNotNone(result["image_type_ids"])
-        self.assertEqual(result["images"].shape[0], 4)
-        self.assertEqual(result["grid_thw"].shape[0], 2)
-
-    def test_pack_outputs_without_images(self):
-        """Test pack_outputs without image data"""
-        outputs = {
-            "images": [],
-            "grid_thw": [],
-            "image_type_ids": [],
-            "input_ids": [1, 2, 3],
-            "token_type_ids": [0, 0, 0],
-            "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
-        }
-        result = self.processor.pack_outputs(outputs)
-
-        # Test that image-related fields are None when no images
-        self.assertIsNone(result["images"])
-        self.assertIsNone(result["grid_thw"])
-        self.assertIsNone(result["image_type_ids"])
-
-        # Test data types
-        self.assertEqual(result["input_ids"].dtype, np.int64)
-        self.assertEqual(result["token_type_ids"].dtype, np.int64)
-        self.assertEqual(result["position_ids"].dtype, np.int64)
-
-        # Test patch IDs are set
-        self.assertIn("image_patch_id", result)
-        self.assertIn("video_patch_id", result)
-        self.assertIn("mm_num_token_func", result)
-
-
-class TestSampleFrames(unittest.TestCase):
-    """
-    Unit tests for sample_frames function
-    """
-
-    def setUp(self):
-        self.metadata = {
-            "num_of_frame": 100,
-            "fps": 25,
-        }
-
-    def test_fps_and_num_frames_mutually_exclusive(self):
-        with self.assertRaises(ValueError):
-            sample_frames(
-                frame_factor=4,
-                min_frames=8,
-                max_frames=32,
-                metadata=self.metadata,
-                fps=2,
-                num_frames=16,
-            )
-
-    def test_num_frames_round_to_factor(self):
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=8,
-            max_frames=64,
-            metadata=self.metadata,
-            num_frames=18,  # round(18 / 4) * 4 = 16
-        )
-
-        self.assertEqual(len(indices), 16)
-        self.assertEqual(indices[0], 0)
-        self.assertLess(indices[-1], self.metadata["num_of_frame"])
-
-    def test_fps_sampling_basic(self):
-        # total = 100 frames, fps=25, target fps=5 → 20 frames
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=8,
-            max_frames=64,
-            metadata=self.metadata,
-            fps=5,
-        )
-
-        self.assertEqual(len(indices), 20)
-        self.assertEqual(indices.dtype, np.int32)
-        self.assertEqual(indices[0], 0)
-
-    def test_fps_respects_min_frames(self):
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=24,
-            max_frames=64,
-            metadata=self.metadata,
-            fps=1,  # very small fps
-        )
-
-        self.assertEqual(len(indices), 24)
-
-    def test_num_frames_exceeds_total_raises(self):
-        with self.assertRaises(ValueError):
-            sample_frames(
-                frame_factor=4,
-                min_frames=8,
-                max_frames=200,
-                metadata=self.metadata,
-                num_frames=200,
-            )
-
-    def test_force_multiple_of_4_hack(self):
-        indices = sample_frames(
-            frame_factor=2,
-            min_frames=2,
-            max_frames=100,
-            metadata=self.metadata,
-            num_frames=10,  # 10 % 4 != 0 → hack → 8
-        )
-
-        self.assertEqual(len(indices), 8)
-        self.assertEqual(len(indices) % 4, 0)
-
-    def test_keep_all_frames_when_num_frames_zero(self):
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=0,
-            max_frames=100,
-            metadata=self.metadata,
-            num_frames=0,
-        )
-
-        self.assertEqual(len(indices), self.metadata["num_of_frame"])
-        np.testing.assert_array_equal(indices, np.arange(0, 100, dtype=np.int32))
-
-    def test_indices_evenly_spaced(self):
-        indices = sample_frames(
-            frame_factor=4,
-            min_frames=8,
-            max_frames=32,
-            metadata=self.metadata,
-            num_frames=16,
-        )
-
-        diffs = np.diff(indices)
-        self.assertTrue(np.all(diffs > 0))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/input/v1/test_text_processor.py b/tests/input/v1/test_text_processor.py
deleted file mode 100644
index 147d843e85d..00000000000
--- a/tests/input/v1/test_text_processor.py
+++ /dev/null
@@ -1,586 +0,0 @@
-"""
-# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import importlib
-import importlib.util
-import sys
-import types
-import unittest
-from pathlib import Path
-from types import SimpleNamespace
-from unittest import mock
-
-import numpy as np
-
-from fastdeploy.engine.request import Request, RequestOutput
-from fastdeploy.engine.sampling_params import SamplingParams
-
-
-class DummyTokenizer:
-    bos_token = "<s>"
-    cls_token = "<cls>"
-    sep_token = "</s>"
-    eos_token = "</eos>"
-    mask_token = "<mask>"
-    chat_template = "dummy"
-
-    def __init__(self):
-        self.pad_token_id = 1
-        self.eos_token_id = 2
-        self.eos_token = 2
-        self.vocab_size = 256
-        self.bos_token_id = self._convert_token_to_id(self.bos_token)
-        self.cls_token_id = self._convert_token_to_id(self.cls_token)
-        self.sep_token_id = self._convert_token_to_id(self.sep_token)
-        self.mask_token_id = self._convert_token_to_id(self.mask_token)
-
-    def _convert_token_to_id(self, token):
-        return len(str(token))
-
-    def __call__(self, text, **kwargs):
-        if isinstance(text, list):
-            values = [self._value(item) for item in text]
-        else:
-            values = [self._value(text)]
-        max_length = kwargs.get("max_length")
-        if max_length is not None:
-            values = values[:max_length]
-        return {"input_ids": np.array([values], dtype=np.int64)}
-
-    def _value(self, item):
-        if isinstance(item, str):
-            return len(item)
-        return int(item)
-
-    def tokenize(self, text):
-        if isinstance(text, str):
-            return [text]
-        return [str(text)]
-
-    def convert_tokens_to_ids(self, tokens):
-        return [self._value(token) for token in tokens]
-
-    def decode(self, token_ids, **kwargs):
-        return " ".join(str(t) for t in token_ids)
-
-    def decode_token(self, token_ids, prefix_offset, read_offset):
-        start = read_offset
-        delta_tokens = token_ids[start:]
-        delta = "".join(str(t) for t in delta_tokens)
-        prefix_offset += len(token_ids)
-        read_offset += len(delta_tokens)
-        return delta, prefix_offset, read_offset
-
-    def batch_decode(self, batch, **kwargs):
-        return [self.decode(seq) for seq in batch]
-
-    def apply_chat_template(self, request, **kwargs):
-        if isinstance(request, dict):
-            system = request.get("system")
-            messages = request.get("messages", [])
-        else:
-            system = getattr(request, "system", None)
-            messages = getattr(request, "messages", [])
-        parts = [system] if system else []
-        parts.extend(msg.get("content", "") for msg in messages)
-        return " ".join(part for part in parts if part)
-
-
-class DummyLlamaTokenizer(DummyTokenizer):
-    pass
-
-
-class DummyAutoTokenizer:
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        return DummyTokenizer()
-
-
-class DummyHFTokenizer:
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        return DummyTokenizer()
-
-
-def _create_dummy_modules():
-    """Create all dummy modules needed for testing fastdeploy.input.text_processor."""
-    repo_root = Path(__file__).resolve().parents[2]
-
-    dummy_logger = SimpleNamespace(
-        info=lambda *args, **kwargs: None,
-        warning=lambda *args, **kwargs: None,
-        debug=lambda *args, **kwargs: None,
-    )
-
-    utils_module = types.ModuleType("fastdeploy.utils")
-    utils_module.data_processor_logger = dummy_logger
-
-    envs_module = types.ModuleType("fastdeploy.envs")
-    envs_module.FD_USE_HF_TOKENIZER = False
-
-    generation_module = types.ModuleType("paddleformers.generation")
-
-    class DummyGenerationConfig:
-        def __init__(self):
-            self.top_p = 0.8
-            self.temperature = 0.9
-            self.repetition_penalty = 1.1
-            self.frequency_penalty = 0.2
-            self.presence_penalty = 0.1
-
-        @classmethod
-        def from_pretrained(cls, *args, **kwargs):
-            return cls()
-
-    generation_module.GenerationConfig = DummyGenerationConfig
-
-    transformers_module = types.ModuleType("paddleformers.transformers")
-    transformers_module.AutoTokenizer = DummyAutoTokenizer
-    transformers_module.LlamaTokenizer = DummyLlamaTokenizer
-    transformers_module.Llama3Tokenizer = DummyLlamaTokenizer
-
-    hf_transformers_module = types.ModuleType("transformers")
-    hf_transformers_module.AutoTokenizer = DummyHFTokenizer
-
-    llm_utils_module = types.ModuleType("paddleformers.cli.utils.llm_utils")
-    llm_utils_module.get_eos_token_id = lambda tokenizer, config: [tokenizer.eos_token_id]
-
-    fastdeploy_module = types.ModuleType("fastdeploy")
-    fastdeploy_module.__path__ = [str(repo_root / "fastdeploy")]
-    fastdeploy_module.utils = utils_module
-    fastdeploy_module.envs = envs_module
-
-    return {
-        "fastdeploy": fastdeploy_module,
-        "fastdeploy.utils": utils_module,
-        "fastdeploy.envs": envs_module,
-        "paddleformers.generation": generation_module,
-        "paddleformers.transformers": transformers_module,
-        "transformers": hf_transformers_module,
-        "paddleformers.cli.utils.llm_utils": llm_utils_module,
-    }
-
-
-def _import_text_processor(use_hf_tokenizer=False):
-    modules = _create_dummy_modules()
-
-    modules["fastdeploy.envs"].FD_USE_HF_TOKENIZER = use_hf_tokenizer
-
-    previous_modules = {}
-    for name, module in modules.items():
-        previous_modules[name] = sys.modules.get(name)
-        sys.modules[name] = module
-
-    try:
-        text_processor_module = importlib.import_module("fastdeploy.input.v1.text_processor")
-        importlib.reload(text_processor_module)
-    except Exception:
-        for name, original in previous_modules.items():
-            if original is None:
-                sys.modules.pop(name, None)
-            else:
-                sys.modules[name] = original
-        raise
-
-    def cleanup():
-        sys.modules.pop("fastdeploy.input.text_processor", None)
-        for name, original in previous_modules.items():
-            if original is None:
-                sys.modules.pop(name, None)
-            else:
-                sys.modules[name] = original
-
-    return text_processor_module, cleanup
-
-
-class DummyRequest:
-    def __init__(self, **kwargs):
-        self.request_id = kwargs.get("request_id", "req")
-        self.prompt = kwargs.get("prompt")
-        self.prompt_token_ids = kwargs.get("prompt_token_ids")
-        self.messages = kwargs.get("messages")
-        self.eos_token_ids = kwargs.get("eos_token_ids")
-        self.chat_template = kwargs.get("chat_template")
-        self.enable_thinking = kwargs.get("enable_thinking")
-        self.history = kwargs.get("history")
-        self.tools = kwargs.get("tools")
-        self.system = kwargs.get("system")
-        self.sampling_params = SimpleNamespace(
-            top_p=kwargs.get("top_p"),
-            temperature=kwargs.get("temperature"),
-            repetition_penalty=kwargs.get("repetition_penalty"),
-            frequency_penalty=kwargs.get("frequency_penalty"),
-            presence_penalty=kwargs.get("presence_penalty"),
-            stop=kwargs.get("stop"),
-            stop_token_ids=kwargs.get("stop_token_ids"),
-            stop_seqs_len=kwargs.get("stop_seqs_len"),
-            bad_words=kwargs.get("bad_words"),
-            bad_words_token_ids=kwargs.get("bad_words_token_ids"),
-            max_tokens=kwargs.get("max_tokens"),
-        )
-
-    def get(self, key, default=None):
-        if hasattr(self, key) and getattr(self, key) is not None:
-            return getattr(self, key)
-        return getattr(self.sampling_params, key, default)
-
-    def set(self, key, value):
-        if hasattr(self.sampling_params, key):
-            setattr(self.sampling_params, key, value)
-        else:
-            setattr(self, key, value)
-
-    def to_dict(self):
-        return {
-            "request_id": self.request_id,
-            "messages": self.messages,
-            "prompt": self.prompt,
-            "system": self.system,
-            "history": self.history,
-            "tools": self.tools,
-            "chat_template": self.chat_template,
-            "enable_thinking": self.enable_thinking,
-        }
-
-    def __getitem__(self, key):
-        return self.get(key)
-
-    def __setitem__(self, key, value):
-        self.set(key, value)
-
-
-class DataProcessorTestCase(unittest.TestCase):
-    @staticmethod
-    def create_dummy_reasoning(tokenizer, reasoning_content="think"):
-        class DummyReasoning:
-            def __init__(self, tokenizer):
-                self.tokenizer = tokenizer
-
-            def extract_reasoning_content(self, full_text, response_dict, model_status):
-                return reasoning_content, f"{full_text}!"
-
-        return DummyReasoning(tokenizer)
-
-    @staticmethod
-    def create_dummy_tool_parser(tokenizer, content="tool-text"):
-        class DummyToolParser:
-            def __init__(self, tokenizer):
-                self.tokenizer = tokenizer
-
-            def extract_tool_calls(self, full_text, response_dict):
-                return SimpleNamespace(tools_called=True, tool_calls=["tool"], content=content)
-
-        return DummyToolParser
-
-    def setUp(self):
-        module, cleanup = _import_text_processor()
-        self.text_processor_module = module
-        self.addCleanup(cleanup)
-        self.processor = self.text_processor_module.DataProcessor("stub-model")
-
-    def test_base_data_processor_contract(self):
-        text_processor_module = self.text_processor_module
-
-        class MinimalProcessor(text_processor_module.BaseDataProcessor):
-            def __init__(self):
-                self.generation_config = SimpleNamespace(
-                    top_p=0.5,
-                    temperature=0.6,
-                    repetition_penalty=1.1,
-                    frequency_penalty=0.2,
-                    presence_penalty=0.3,
-                )
-                super().__init__()
-
-            def _load_tokenizer(self):
-                return DummyTokenizer()
-
-            def process_request_dict(self, request, **kwargs):
-                return super().process_request_dict(request, **kwargs)
-
-            def process_response_dict(self, response_obj):
-                return super().process_response_dict(response_obj)
-
-        processor = MinimalProcessor()
-        request = Request(request_id="test_0", sampling_params=SamplingParams())
-        defaults = processor._apply_default_parameters(request)
-        self.assertAlmostEqual(defaults.sampling_params.top_p, 0.5)
-        with self.assertRaises(NotImplementedError):
-            processor.process_request_dict({}, max_model_len=None)
-        with self.assertRaises(NotImplementedError):
-            processor.process_response_dict({})
-        with self.assertRaises(NotImplementedError):
-            processor.text2ids("text")
-        with self.assertRaises(NotImplementedError):
-            processor.messages2ids([])
-        with self.assertRaises(NotImplementedError):
-            processor.ids2tokens([1], "task")
-
-    def test_process_request_dict_prompt_defaults(self):
-        request = {"request_id": "test_0", "prompt": "hi", "temperature": 0, "top_p": 0, "stop": ["stop"]}
-        request = Request.from_dict(request)
-        processed = self.processor.process_request_dict(request, max_model_len=5)
-
-        self.assertEqual(processed.prompt_token_ids, [2])
-        self.assertEqual(processed.sampling_params.stop_token_ids, [[4]])
-        self.assertEqual(processed.sampling_params.stop_seqs_len, [1])
-        self.assertEqual(processed.sampling_params.temperature, 1)
-        self.assertAlmostEqual(processed.sampling_params.top_p, 1e-5)
-        self.assertEqual(processed.sampling_params.max_tokens, 4)
-
-    def test_process_request_dict_messages_template(self):
-        request = {
-            "request_id": "chat",
-            "messages": [{"role": "user", "content": "hello"}],
-            "chat_template_kwargs": {"system": "system prompt"},
-        }
-        request = Request.from_dict(request)
-        request.chat_template_kwargs = {"system": "system prompt"}
-        processed = self.processor.process_request_dict(request, max_model_len=6)
-
-        self.assertEqual(processed.prompt_token_ids, [len("system prompt hello")])
-        self.assertEqual(processed.system, "system prompt")
-        self.assertTrue(processed.enable_thinking)
-        self.assertEqual(processed.prompt_tokens, "system prompt hello")
-
-    def test_process_request_dictect_handles_sequences(self):
-        request = DummyRequest(
-            prompt=[1, 2, 3, 4, 5, 6],
-            stop=["stop"],
-            bad_words=["zz"],
-            temperature=0,
-            top_p=0,
-        )
-        processed = self.processor.process_request_dict(request, max_model_len=5)
-
-        self.assertEqual(processed.prompt_token_ids, [1, 2, 3, 4])
-        self.assertEqual(processed.sampling_params.max_tokens, 1)
-        self.assertEqual(processed.sampling_params.stop_token_ids, [[4]])
-        self.assertEqual(set(processed.sampling_params.bad_words_token_ids), {2, 3})
-        self.assertEqual(processed.sampling_params.temperature, 1)
-        self.assertAlmostEqual(processed.sampling_params.top_p, 1e-5)
-
-    def test_process_request_requires_prompt_or_messages(self):
-        request = DummyRequest(prompt=None, messages=None, prompt_token_ids=None)
-        with self.assertRaisesRegex(ValueError, "Request must contain 'prompt_token_ids', 'prompt', or 'messages'"):
-            self.processor.process_request_dict(request, max_model_len=5)
-
-    def test_process_request_dict_rejects_bad_kwargs(self):
-        request = {
-            "request_id": "test_0",
-            "messages": [{"role": "user", "content": "hi"}],
-            "chat_template_kwargs": "invalid",
-        }
-        request = Request.from_dict(request)
-        request.chat_template_kwargs = "invalid"
-        request.sampling_params = SamplingParams()
-        with self.assertRaisesRegex(ValueError, "chat_template_kwargs must be a dict"):
-            self.processor.process_request_dict(request)
-
-    def test_ids2tokens_and_clear_request_status(self):
-        delta, _, _ = self.processor.ids2tokens([3], "task-1")
-        self.assertEqual(delta, "3")
-        delta, _, _ = self.processor.ids2tokens([4], "task-1")
-        self.assertEqual(delta, "4")
-
-        combined = self.processor.clear_request_status("task-1")
-        self.assertEqual(combined, "34")
-        self.assertNotIn("task-1", self.processor.decode_status)
-
-    def test_clear_request_status_hf_branch(self):
-        module, cleanup = _import_text_processor(use_hf_tokenizer=True)
-        self.addCleanup(cleanup)
-        processor = module.DataProcessor("stub-model")
-        processor.decode_status = {"task": [[], [], "transcript"]}
-
-        self.assertEqual(processor.clear_request_status("task"), "transcript")
-        self.assertNotIn("task", processor.decode_status)
-
-    def test_data_processor_init_handles_missing_generation_config(self):
-        with mock.patch.object(
-            self.text_processor_module.GenerationConfig,
-            "from_pretrained",
-            side_effect=OSError("missing"),
-        ):
-            processor = self.text_processor_module.DataProcessor("stub-model")
-        self.assertIsNone(processor.generation_config)
-
-    def test_process_response_with_reasoning_and_tools(self):
-        processor = self.processor
-        processor.model_status_dict = {"resp": "normal"}
-
-        processor.reasoning_parser = self.create_dummy_reasoning(processor.tokenizer)
-        processor.tool_parser_obj = self.create_dummy_tool_parser(processor.tokenizer, content="tool-only")
-
-        response = SimpleNamespace(
-            request_id="resp", outputs=SimpleNamespace(token_ids=[1, processor.tokenizer.eos_token_id]), finished=True
-        )
-
-        processed = processor.process_response_obj_normal(response)
-        self.assertEqual(processed.outputs.text, "tool-only")
-        self.assertEqual(processed.outputs.reasoning_content, "think")
-        self.assertEqual(processed.outputs.tool_calls, ["tool"])
-
-    def test_process_response_streaming_clears_state(self):
-        processor = self.processor
-        req_id = "stream"
-        processor.decode_status[req_id] = [0, 0, [], ""]
-        response = {"finished": True, "request_id": req_id, "outputs": {"token_ids": [7]}}
-        response = RequestOutput.from_dict(response)
-
-        result = processor.process_response_obj_streaming(response, enable_thinking=False)
-        self.assertEqual(result.outputs.text, "7")
-        self.assertNotIn(req_id, processor.decode_status)
-
-    def test_process_response_obj_normal_with_reasoning(self):
-        processor = self.processor
-        processor.model_status_dict = {"normal": "normal"}
-        processor.reasoning_parser = self.create_dummy_reasoning(processor.tokenizer, reasoning_content="because")
-        processor.tool_parser_obj = self.create_dummy_tool_parser(processor.tokenizer, content="tool-text")
-
-        response = {
-            "finished": True,
-            "request_id": "normal",
-            "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
-        }
-        response = RequestOutput.from_dict(response)
-
-        result = processor.process_response_obj_normal(response, enable_thinking=True)
-        self.assertEqual(result.outputs.completion_tokens, "7")
-        self.assertEqual(result.outputs.text, "tool-text")
-        self.assertEqual(result.outputs.reasoning_content, "because")
-        self.assertEqual(result.outputs.reasoning_token_num, 1)
-
-    def test_process_response_dict_dispatch(self):
-        processor = self.processor
-        calls = {}
-
-        def fake_stream(response_obj, **kwargs):
-            calls["stream"] = kwargs
-            return "stream"
-
-        def fake_normal(response_obj, **kwargs):
-            calls["normal"] = kwargs
-            return "normal"
-
-        original_stream = processor.process_response_obj_streaming
-        original_normal = processor.process_response_obj_normal
-        processor.process_response_obj_streaming = fake_stream
-        processor.process_response_obj_normal = fake_normal
-        self.addCleanup(lambda: setattr(processor, "process_response_obj_streaming", original_stream))
-        self.addCleanup(lambda: setattr(processor, "process_response_obj_normal", original_normal))
-
-        response = {"outputs": {}, "finished": False, "request_id": "req"}
-        self.assertEqual(processor.process_response_dict(response, stream=True, enable_thinking=True), "stream")
-        self.assertTrue(calls["stream"]["enable_thinking"])
-        self.assertEqual(
-            processor.process_response_dict(response, stream=False, enable_thinking=True),
-            "normal",
-        )
-        self.assertTrue(calls["normal"]["enable_thinking"])
-
-    def test_update_stop_seq_excludes_eos(self):
-        stop_seqs, stop_len = self.processor.update_stop_seq(["stop", self.processor.tokenizer.eos_token_id])
-        self.assertEqual(stop_seqs, [[4]])
-        self.assertEqual(stop_len, [1])
-
-    def test_pad_batch_data_left_padding(self):
-        padded, lengths = self.processor.pad_batch_data(
-            [[1], [2, 3]],
-            pad_id=-1,
-            return_seq_len=True,
-            return_array=False,
-            pad_style="left",
-        )
-        self.assertEqual(padded, [[-1, 1], [2, 3]])
-        self.assertEqual(lengths, [1, 2])
-
-    def test_pad_batch_data_empty_returns_array(self):
-        padded, lengths = self.processor.pad_batch_data([], return_seq_len=True)
-        self.assertEqual(padded.shape, (1, 0))
-        self.assertEqual(lengths.shape, (0,))
-
-    def test_get_pad_id_prefers_eos_when_missing(self):
-        processor = self.text_processor_module.DataProcessor("stub-model")
-        llama_tokenizer = DummyLlamaTokenizer()
-        llama_tokenizer.pad_token_id = None
-        llama_tokenizer.eos_token = 99
-        processor.tokenizer = llama_tokenizer
-
-        self.assertEqual(processor.get_pad_id(), 99)
-
-    def test_load_tokenizer_hf_branch(self):
-        module, cleanup = _import_text_processor(use_hf_tokenizer=True)
-        self.addCleanup(cleanup)
-        processor = module.DataProcessor("stub-model")
-        self.assertIsInstance(processor.tokenizer, DummyTokenizer)
-
-    def test_text2ids_hf_branch(self):
-        module, cleanup = _import_text_processor(use_hf_tokenizer=True)
-        self.addCleanup(cleanup)
-        processor = module.DataProcessor("stub-model")
-        ids = processor.text2ids("hi", max_model_len=5)
-        self.assertEqual(ids.tolist(), [2, 0, 0, 0, 0][: len(ids)])
-
-    def test_process_logprob_response(self):
-        self.assertEqual(self.processor.process_logprob_response([1, 2]), "1 2")
-
-    def test_process_request_dict_uses_existing_ids(self):
-        request = {"request_id": "test_0", "prompt_token_ids": [1, 2, 3], "max_tokens": 5}
-        request = Request.from_dict(request)
-        processed = self.processor.process_request_dict(request, max_model_len=6)
-        self.assertEqual(processed.prompt_token_ids, [1, 2, 3])
-        self.assertEqual(processed.sampling_params.max_tokens, 3)
-
-    def test_process_request_dict_requires_chat_template(self):
-        original_template = self.processor.tokenizer.chat_template
-        self.processor.tokenizer.chat_template = None
-        self.addCleanup(lambda: setattr(self.processor.tokenizer, "chat_template", original_template))
-        with self.assertRaisesRegex(ValueError, "chat_template"):
-            request = {"request_id": "test_0", "messages": [{"role": "user", "content": "hi"}]}
-            request = Request.from_dict(request)
-            self.processor.process_request_dict(request)
-
-    def test_update_bad_words_with_warnings(self):
-        processor = self.processor
-
-        def custom_tokenize(text):
-            base = text.strip()
-            if base == "combo":
-                return ["co", "mbo"]
-            if base == "oversize":
-                return [base]
-            return [base]
-
-        def custom_convert(tokens):
-            if tokens == ["co", "mbo"]:
-                return [1, 2]
-            if tokens == ["oversize"]:
-                return [processor.tokenizer.vocab_size + 1]
-            return [len(tokens[0])]
-
-        original_tokenize = processor.tokenizer.tokenize
-        original_convert = processor.tokenizer.convert_tokens_to_ids
-        processor.tokenizer.tokenize = custom_tokenize
-        processor.tokenizer.convert_tokens_to_ids = custom_convert
-        self.addCleanup(lambda: setattr(processor.tokenizer, "tokenize", original_tokenize))
-        self.addCleanup(lambda: setattr(processor.tokenizer, "convert_tokens_to_ids", original_convert))
-
-        self.assertEqual(processor.update_bad_words(["combo", "oversize"], []), [])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/input/v1/test_tokenizer_client.py b/tests/input/v1/test_tokenizer_client.py
deleted file mode 100644
index 06804ebade3..00000000000
--- a/tests/input/v1/test_tokenizer_client.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import httpx
-import pytest
-import respx
-
-from fastdeploy.input.tokenzier_client import (
-    AsyncTokenizerClient,
-    ImageEncodeRequest,
-    VideoEncodeRequest,
-)
-
-
-@pytest.mark.asyncio
-@respx.mock
-async def test_encode_image_success():
-    base_url = "http://testserver"
-    client = AsyncTokenizerClient(base_url=base_url)
-
-    # Mock 创建任务接口
-    respx.post(f"{base_url}/image/encode").mock(
-        return_value=httpx.Response(200, json={"code": 0, "task_tag": "task123"})
-    )
-    # Mock 轮询接口，返回完成状态
-    mock_get_ret = {
-        "state": "Finished",
-        "result": {"feature_url": "bos://host:port/key", "feature_shape": [80, 45, 1563]},
-    }
-    respx.get(f"{base_url}/encode/get").mock(return_value=httpx.Response(200, json=mock_get_ret))
-
-    request = ImageEncodeRequest(
-        version="v1", req_id="req_img_001", is_gen=False, resolution=512, image_url="http://example.com/image.jpg"
-    )
-
-    result = await client.encode_image(request)
-    assert result["feature_url"] == "bos://host:port/key"
-    assert result["feature_shape"] == [80, 45, 1563]
-
-
-@pytest.mark.asyncio
-@respx.mock
-async def test_encode_video_failure():
-    base_url = "http://testserver"
-    client = AsyncTokenizerClient(base_url=base_url, max_wait=1)
-
-    respx.post(f"{base_url}/video/encode").mock(
-        return_value=httpx.Response(200, json={"code": 0, "task_tag": "task_vid_001"})
-    )
-    # 模拟轮询接口失败状态
-    respx.get(f"{base_url}/encode/get").mock(
-        return_value=httpx.Response(200, json={"state": "Error", "message": "Encode failed"})
-    )
-
-    request = VideoEncodeRequest(
-        version="v1",
-        req_id="req_vid_001",
-        is_gen=True,
-        resolution=720,
-        video_url="http://example.com/video.mp4",
-        start_ts=0.0,
-        end_ts=10.0,
-        frames=30,
-        vit_merge=True,
-    )
-
-    with pytest.raises(RuntimeError, match="Encode failed"):
-        await client.encode_video(request)
-
-
-@pytest.mark.asyncio
-@respx.mock
-async def test_encode_timeout():
-    base_url = "http://testserver"
-    client = AsyncTokenizerClient(base_url=base_url, max_wait=1, poll_interval=0.1)
-
-    respx.post(f"{base_url}/image/encode").mock(
-        return_value=httpx.Response(200, json={"code": 0, "task_tag": "task_timeout"})
-    )
-    # 模拟轮询接口一直返回等待状态，导致超时
-    respx.get(f"{base_url}/encode/get").mock(return_value=httpx.Response(200, json={"status": "processing"}))
-
-    request = ImageEncodeRequest(
-        version="v1", req_id="req_img_timeout", is_gen=False, resolution=256, image_url="http://example.com/image.jpg"
-    )
-
-    with pytest.raises(TimeoutError):
-        await client.encode_image(request)
diff --git a/tests/inter_communicator/test_zmq_server.py b/tests/inter_communicator/test_zmq_server.py
index 629551b1707..57c9a0c479a 100644
--- a/tests/inter_communicator/test_zmq_server.py
+++ b/tests/inter_communicator/test_zmq_server.py
@@ -9,7 +9,6 @@
 import types
 import unittest
 from collections import defaultdict
-from multiprocessing.reduction import ForkingPickler
 from unittest import mock
 
 import msgpack
@@ -264,19 +263,6 @@ def send(self, msg, flags=0, **kwargs):
         with self.assertRaises(RuntimeError):
             server.send_pyobj({"boom": True})
 
-    def test_pack_aggregated_data_respects_env_flag(self):
-        server = _DummyServer()
-        responses = [_DummyResponse(1), _DummyResponse(2, finished=True)]
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            packed = server.pack_aggregated_data(responses)
-            unpacked = ForkingPickler.loads(packed)
-            self.assertEqual(unpacked[0]["tensor_sum"], 3)
-
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
-            packed = server.pack_aggregated_data(responses)
-            unpacked = ForkingPickler.loads(packed)
-            self.assertIsInstance(unpacked[0], _DummyResponse)
-
     def test_receive_json_once_paths(self):
         fake_socket = _FakeSocket()
         fake_socket.closed = True
@@ -360,8 +346,7 @@ def test_send_response_per_query_cache_and_flush(self):
         self.assertIn(req_id, server.cached_results)
 
         server.req_dict[req_id] = b"client"
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            server._send_response_per_query(req_id, [_DummyResponse(4, finished=True)])
+        server._send_response_per_query(req_id, [_DummyResponse(4, finished=True)])
         self.assertNotIn(req_id, server.req_dict)
         self.assertEqual(fake_socket.sent[-1][0], "send_multipart")
 
@@ -370,17 +355,7 @@ def test_send_response_per_query_aggregate(self):
         server = _DummyServer(socket=fake_socket)
         server.req_dict["req-agg"] = b"client"
         server.aggregate_send = True
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            server._send_response_per_query("req-agg", [_DummyResponse(5, finished=True)])
-        self.assertEqual(fake_socket.sent[-1][0], "send_multipart")
-
-    def test_send_response_per_query_v1_processor(self):
-        fake_socket = _FakeSocket()
-        server = _DummyServer(socket=fake_socket)
-        server.req_dict["req-v1"] = b"client"
-        server.aggregate_send = False
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
-            server._send_response_per_query("req-v1", [_DummyResponse(6, finished=True)])
+        server._send_response_per_query("req-agg", [_DummyResponse(5, finished=True)])
         self.assertEqual(fake_socket.sent[-1][0], "send_multipart")
 
     def test_send_response_per_query_send_failure(self):
@@ -391,8 +366,7 @@ def send_multipart(self, parts, copy=True):
         server = _DummyServer(socket=_ErrorSocket())
         server.req_dict["req-error"] = b"client"
         server.aggregate_send = False
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            server._send_response_per_query("req-error", [_DummyResponse(7, finished=True)])
+        server._send_response_per_query("req-error", [_DummyResponse(7, finished=True)])
         self.assertEqual(server.req_dict, {})
 
     def test_send_response_per_query_raises_without_socket(self):
@@ -436,22 +410,11 @@ def test_send_batch_response_success(self):
         fake_socket = _FakeSocket()
         server = _DummyServer(socket=fake_socket)
         server.address = "test-address"
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            batch_data = [[_DummyResponse(1, finished=True)]]
-            server._send_batch_response(batch_data)
+        batch_data = [[_DummyResponse(1, finished=True)]]
+        server._send_batch_response(batch_data)
         self.assertEqual(len(fake_socket.sent), 1)
         self.assertEqual(fake_socket.sent[0][0], "send")
 
-    def test_send_batch_response_v1_processor(self):
-        """Test _send_batch_response with ENABLE_V1_DATA_PROCESSOR=True"""
-        fake_socket = _FakeSocket()
-        server = _DummyServer(socket=fake_socket)
-        server.address = "test-address"
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
-            batch_data = [[_DummyResponse(1, finished=True)]]
-            server._send_batch_response(batch_data)
-        self.assertEqual(len(fake_socket.sent), 1)
-
     def test_send_batch_response_raises_without_socket(self):
         """Test _send_batch_response logs error and returns when socket is None"""
         server = _DummyServer(socket=None)
@@ -470,9 +433,8 @@ def send(self, msg, flags=0, **kwargs):
         server = _DummyServer(socket=_ErrorSocket())
         server.address = "test-address"
         batch_data = [[_DummyResponse(1)]]
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            # Should not raise, error is caught and logged
-            server._send_batch_response(batch_data)
+        # Should not raise, error is caught and logged
+        server._send_batch_response(batch_data)
 
     def test_recv_result_handle_paths(self):
         fake_socket = _FakeSocket()
@@ -640,10 +602,9 @@ def test_send_batch_response_with_worker_pid_none_uses_default_socket(self):
         server = _DummyServer(socket=fake_socket)
         server.address = "test-address"
 
-        with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
-            batch_data = [[_DummyResponse(1, finished=True)]]
-            # worker_pid=None -> goes to the else branch that calls _ensure_socket / uses self.socket
-            server._send_batch_response(batch_data, worker_pid=None)
+        batch_data = [[_DummyResponse(1, finished=True)]]
+        # worker_pid=None -> goes to the else branch that calls _ensure_socket / uses self.socket
+        server._send_batch_response(batch_data, worker_pid=None)
 
         # The default socket should have been used to send the data
         self.assertEqual(len(fake_socket.sent), 1)
diff --git a/tests/model_executor/test_thinking_budget.py b/tests/model_executor/test_thinking_budget.py
index 8ba9319ff7d..d9dd8f4b9b9 100644
--- a/tests/model_executor/test_thinking_budget.py
+++ b/tests/model_executor/test_thinking_budget.py
@@ -27,13 +27,6 @@
     Ernie4_5_VLProcessor as ErnieVLDataProcessor,
 )
 from fastdeploy.input.text_processor import DataProcessor as TextDataProcessor
-from fastdeploy.input.v1.ernie4_5_processor import (
-    Ernie4_5Processor as V1ErnieTextDataProcessor,
-)
-from fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor import (
-    Ernie4_5_VLProcessor as V1ErnieVLDataProcessor,
-)
-from fastdeploy.input.v1.text_processor import DataProcessor as V1TextDataProcessor
 from fastdeploy.model_executor.logits_processor import ThinkingBudgetLogitsProcessor
 from fastdeploy.scheduler import SchedulerConfig
 
@@ -711,31 +704,6 @@ def parallel_config(self):
     ips = None
 
 
-class DummyRequestV1(SimpleNamespace):
-    def get(self, key, default=None):
-        if hasattr(self, key):
-            value = getattr(self, key)
-            if value is not None:
-                return value
-        if hasattr(self, "sampling_params") and hasattr(self.sampling_params, key):
-            value = getattr(self.sampling_params, key)
-            if value is not None:
-                return value
-        return default
-
-    def __getitem__(self, key):
-        return getattr(self, key)
-
-    def __setitem__(self, key, value):
-        setattr(self, key, value)
-
-    def set(self, key, value):
-        if hasattr(self, "sampling_params") and hasattr(self.sampling_params, key):
-            setattr(self.sampling_params, key, value)
-        else:
-            setattr(self, key, value)
-
-
 class TestThinkingBudgetSupplemental(unittest.TestCase):
     def test_update_thinking_prompt_state_from_text_processor(self):
         processor = TextDataProcessor.__new__(TextDataProcessor)
@@ -750,43 +718,6 @@ def test_update_thinking_prompt_state_from_text_processor(self):
         self.assertEqual(updated["think_prompt_tokens_after_start"], 0)
         self.assertEqual(updated["think_prompt_last_token_id"], 3)
 
-    def test_v1_process_request_missing_logits_processors_args(self):
-        processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
-        processor.generation_config = SimpleNamespace(
-            top_p=0.7,
-            temperature=1.0,
-            repetition_penalty=1.0,
-            frequency_penalty=0.0,
-            presence_penalty=0.0,
-        )
-        processor.eos_token_ids = [1]
-        processor.update_stop_seq = lambda *args, **kwargs: None
-        processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
-        processor.encode_with_cache = lambda *args, **kwargs: [1]
-        processor._update_thinking_prompt_state = lambda prompt_token_ids, args: args
-        processor.reasoning_parser = None
-        request = DummyRequestV1(
-            request_id="req",
-            eos_token_ids=None,
-            prompt_token_ids=[1],
-            prompt=None,
-            messages=None,
-            max_tokens=1,
-            chat_template_kwargs=None,
-            sampling_params=SimpleNamespace(
-                bad_words=None,
-                bad_words_token_ids=None,
-                max_tokens=1,
-                temperature=1.0,
-                top_p=0.9,
-                repetition_penalty=1.0,
-                frequency_penalty=0.0,
-                presence_penalty=0.0,
-            ),
-        )
-        with patch("fastdeploy.input.v1.text_processor.process_stop_token_ids", lambda *args, **kwargs: None):
-            processor.process_request(request, max_model_len=8)
-
     def test_engine_line_break_id_from_dict(self):
         tokenizer = DummyTokenizerForTextProcessor()
         data_processor = SimpleNamespace(tokenizer=tokenizer, eos_token_id_len=1, pad_token_id=0)
@@ -835,27 +766,6 @@ def _text2ids(text, max_model_len=None, add_special_tokens=False):
         self.assertEqual(processor.encode_with_cache("iter"), [21, 22])
         self.assertNotIn(("np", False), processor._tokenize_cache)
 
-    def test_v1_encode_with_cache_branches(self):
-        processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
-        processor._tokenize_cache = OrderedDict()
-        processor._tokenize_cache_capacity = 1
-        call_counter = {"np": 0, "iter": 0}
-
-        def _text2ids(text, max_model_len=None, add_special_tokens=False):
-            if text == "np":
-                call_counter["np"] += 1
-                return np.array([31, 32], dtype=np.int64)
-            call_counter["iter"] += 1
-            return (v for v in [41, 42])
-
-        processor.text2ids = _text2ids
-
-        self.assertEqual(processor.encode_with_cache("np"), [31, 32])
-        self.assertEqual(processor.encode_with_cache("np"), [31, 32])
-        self.assertEqual(call_counter["np"], 1)
-        self.assertEqual(processor.encode_with_cache("iter"), [41, 42])
-        self.assertNotIn(("np", False), processor._tokenize_cache)
-
     def test_text_encode_with_cache_lazy_init(self):
         processor = TextDataProcessor.__new__(TextDataProcessor)
         call_counter = {"count": 0}
@@ -872,22 +782,6 @@ def _text2ids(text, max_model_len=None, add_special_tokens=False):
         self.assertEqual(processor.encode_with_cache("lazy"), [51, 52])
         self.assertEqual(call_counter["count"], 1)
 
-    def test_v1_encode_with_cache_lazy_init(self):
-        processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
-        call_counter = {"count": 0}
-
-        def _text2ids(text, max_model_len=None, add_special_tokens=False):
-            call_counter["count"] += 1
-            return np.array([61, 62], dtype=np.int64)
-
-        processor.text2ids = _text2ids
-
-        self.assertFalse(hasattr(processor, "_tokenize_cache"))
-        self.assertEqual(processor.encode_with_cache("lazy"), [61, 62])
-        self.assertTrue(hasattr(processor, "_tokenize_cache"))
-        self.assertEqual(processor.encode_with_cache("lazy"), [61, 62])
-        self.assertEqual(call_counter["count"], 1)
-
     def test_ernie_encode_literal_text_with_cache(self):
         processor = ErnieTextDataProcessor.__new__(ErnieTextDataProcessor)
         processor.tokenizer = SimpleNamespace(
@@ -898,16 +792,6 @@ def test_ernie_encode_literal_text_with_cache(self):
         self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [71, 72])
         self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [71, 72])
 
-    def test_v1_ernie_encode_literal_text_with_cache(self):
-        processor = V1ErnieTextDataProcessor.__new__(V1ErnieTextDataProcessor)
-        processor.tokenizer = SimpleNamespace(
-            tokenize=lambda text: ["token_c", "token_d"],
-            convert_tokens_to_ids=lambda tokens: [81, 82],
-        )
-
-        self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [81, 82])
-        self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [81, 82])
-
     def test_text_update_thinking_prompt_state_branches(self):
         processor = TextDataProcessor.__new__(TextDataProcessor)
         processor._think_token_ids = None
@@ -949,29 +833,6 @@ def test_text_update_thinking_prompt_state_branches(self):
         # 命中 _get_think_token_ids 的缓存分支
         self.assertEqual(processor._get_think_token_ids(), (THINKING_START_TOKEN_ID, THINKING_END_TOKEN_ID))
 
-    def test_v1_update_thinking_prompt_state_branches(self):
-        processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
-        processor._think_token_ids = None
-        processor.tokenizer = DummyTokenizerForTextProcessor()
-
-        self.assertEqual(processor._update_thinking_prompt_state([1], "not-dict"), "not-dict")
-        self.assertEqual(
-            processor._update_thinking_prompt_state([1], {"thinking_budget": -1}), {"thinking_budget": -1}
-        )
-        self.assertEqual(processor._update_thinking_prompt_state(None, {"thinking_budget": 1}), {"thinking_budget": 1})
-
-        with_start_no_end = processor._update_thinking_prompt_state(
-            np.array([1, THINKING_START_TOKEN_ID, 2, 3], dtype=np.int64),
-            {"thinking_budget": 4},
-        )
-        self.assertTrue(with_start_no_end["think_prompt_started"])
-        self.assertFalse(with_start_no_end["think_prompt_ended"])
-        self.assertEqual(with_start_no_end["think_prompt_tokens_after_start"], 0)
-        self.assertEqual(with_start_no_end["think_prompt_last_token_id"], 3)
-
-        # 命中 _get_think_token_ids 的缓存分支
-        self.assertEqual(processor._get_think_token_ids(), (THINKING_START_TOKEN_ID, THINKING_END_TOKEN_ID))
-
     def test_text_process_request_dict_think_stop_sentence(self):
         processor = TextDataProcessor.__new__(TextDataProcessor)
         processor._apply_default_parameters = lambda request: request
@@ -1003,74 +864,6 @@ def test_text_process_request_dict_think_stop_sentence(self):
         )
         self.assertNotIn("think_stop_sentence", processed["logits_processors_args"])
 
-    def test_v1_process_request_think_stop_sentence(self):
-        processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
-        processor._apply_default_parameters = lambda request: request
-        processor.eos_token_ids = [1]
-        processor.update_stop_seq = lambda *args, **kwargs: None
-        processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
-        processor._encode_literal_text_with_cache = lambda text: [301, 302]
-        processor._update_thinking_prompt_state = lambda prompt_token_ids, args: args
-        processor.reasoning_parser = None
-
-        request = DummyRequestV1(
-            request_id="req_v1",
-            eos_token_ids=[1],
-            prompt_token_ids=[10],
-            prompt=None,
-            messages=None,
-            logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"},
-            bad_words=None,
-            bad_words_token_ids=None,
-            max_tokens=1,
-            temperature=1.0,
-            top_p=0.9,
-        )
-        with patch("fastdeploy.input.v1.text_processor.process_stop_token_ids", lambda *args, **kwargs: None):
-            processed = processor.process_request(request, max_model_len=16)
-        self.assertEqual(
-            processed.logits_processors_args.get("think_stop_sentence_token_ids"),
-            [301, 302],
-        )
-        self.assertNotIn("think_stop_sentence", processed.logits_processors_args)
-
-    def test_v1_process_request_dict_think_stop_sentence(self):
-        processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
-        processor._apply_default_parameters = lambda request: request
-        processor.eos_token_ids = [1]
-        processor.update_stop_seq = lambda *args, **kwargs: None
-        processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
-        processor._encode_literal_text_with_cache = lambda text: [401, 402]
-        processor._update_thinking_prompt_state = lambda prompt_token_ids, args: args
-        processor.reasoning_parser = None
-
-        request = DummyRequestV1(
-            request_id="req_v1_dict",
-            eos_token_ids=[1],
-            prompt_token_ids=[11],
-            prompt=None,
-            messages=None,
-            chat_template_kwargs=None,
-            sampling_params=SimpleNamespace(
-                bad_words=None,
-                bad_words_token_ids=None,
-                max_tokens=1,
-                temperature=1.0,
-                top_p=0.9,
-                repetition_penalty=1.0,
-                frequency_penalty=0.0,
-                presence_penalty=0.0,
-                logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"},
-            ),
-        )
-        with patch("fastdeploy.input.v1.text_processor.process_stop_token_ids", lambda *args, **kwargs: None):
-            processed = processor.process_request_dict(request, max_model_len=16)
-        self.assertEqual(
-            processed.sampling_params.logits_processors_args.get("think_stop_sentence_token_ids"),
-            [401, 402],
-        )
-        self.assertNotIn("think_stop_sentence", processed.sampling_params.logits_processors_args)
-
     def test_ernie_process_request_dict_prepares_thinking_budget_args(self):
         processor = ErnieTextDataProcessor.__new__(ErnieTextDataProcessor)
         processor._apply_default_parameters = lambda request: request
@@ -1104,46 +897,6 @@ def test_ernie_process_request_dict_prepares_thinking_budget_args(self):
         self.assertFalse(processed["logits_processors_args"]["think_prompt_ended"])
         self.assertEqual(processed["logits_processors_args"]["think_prompt_tokens_after_start"], 0)
 
-    def test_v1_ernie_process_request_dict_prepares_thinking_budget_args(self):
-        processor = V1ErnieTextDataProcessor.__new__(V1ErnieTextDataProcessor)
-        processor._apply_default_parameters = lambda request: request
-        processor.eos_token_ids = [1]
-        processor.update_stop_seq = lambda *args, **kwargs: None
-        processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
-        processor._encode_literal_text_with_cache = lambda text: [601, 602]
-        processor.tokenizer = DummyTokenizerForTextProcessor()
-        processor.reasoning_parser = None
-
-        request = DummyRequestV1(
-            request_id="req_v1_ernie_text",
-            eos_token_ids=[1],
-            prompt_token_ids=[1, THINKING_START_TOKEN_ID, 2],
-            prompt=None,
-            messages=None,
-            chat_template_kwargs=None,
-            enable_thinking=True,
-            sampling_params=SimpleNamespace(
-                bad_words=None,
-                bad_words_token_ids=None,
-                max_tokens=1,
-                temperature=1.0,
-                top_p=0.9,
-                repetition_penalty=1.0,
-                frequency_penalty=0.0,
-                presence_penalty=0.0,
-                response_max_tokens=None,
-                n=1,
-                logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"},
-            ),
-        )
-        with patch("fastdeploy.input.v1.ernie4_5_processor.process_stop_token_ids", lambda *args, **kwargs: None):
-            processed = processor.process_request_dict(request, max_model_len=16)
-
-        self.assertEqual(processed.sampling_params.logits_processors_args["think_stop_sentence_token_ids"], [601, 602])
-        self.assertTrue(processed.sampling_params.logits_processors_args["think_prompt_started"])
-        self.assertFalse(processed.sampling_params.logits_processors_args["think_prompt_ended"])
-        self.assertEqual(processed.sampling_params.logits_processors_args["think_prompt_tokens_after_start"], 0)
-
     def test_ernie_vl_process_request_dict_prepares_thinking_budget_args(self):
         processor = ErnieVLDataProcessor.__new__(ErnieVLDataProcessor)
         processor._apply_default_parameters = lambda request: request
@@ -1182,58 +935,6 @@ def test_ernie_vl_process_request_dict_prepares_thinking_budget_args(self):
         self.assertFalse(processed["logits_processors_args"]["think_prompt_ended"])
         self.assertEqual(processed["logits_processors_args"]["think_prompt_tokens_after_start"], 0)
 
-    def test_v1_ernie_vl_process_request_dict_prepares_thinking_budget_args(self):
-        processor = V1ErnieVLDataProcessor.__new__(V1ErnieVLDataProcessor)
-        processor._apply_default_parameters = lambda request: request
-        processor.eos_token_ids = [1]
-        processor.update_stop_seq = lambda *args, **kwargs: None
-        processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
-        processor._encode_literal_text_with_cache = lambda text: [801, 802]
-        processor.tokenizer = DummyTokenizerForTextProcessor()
-        processor.reasoning_parser = None
-        processor._check_mm_limits = lambda *args, **kwargs: None
-        processor.append_completion_tokens = lambda *args, **kwargs: None
-        processor.pack_outputs = lambda outs: outs
-        processor.ernie4_5_processor = SimpleNamespace(
-            request2ids=lambda request: {"input_ids": np.array([1, THINKING_START_TOKEN_ID, 2], dtype=np.int64)}
-        )
-
-        request = DummyRequestV1(
-            request_id="req_v1_ernie_vl",
-            eos_token_ids=[1],
-            prompt_token_ids=None,
-            prompt=None,
-            messages=[{"role": "user", "content": "hi"}],
-            chat_template_kwargs=None,
-            enable_thinking=True,
-            completion_token_ids=None,
-            multimodal_data=None,
-            sampling_params=SimpleNamespace(
-                bad_words=None,
-                bad_words_token_ids=None,
-                max_tokens=1,
-                temperature=1.0,
-                top_p=0.9,
-                repetition_penalty=1.0,
-                frequency_penalty=0.0,
-                presence_penalty=0.0,
-                response_max_tokens=None,
-                reasoning_max_tokens=None,
-                n=1,
-                logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"},
-            ),
-        )
-        with patch(
-            "fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.process_stop_token_ids",
-            lambda *args, **kwargs: None,
-        ):
-            processed = processor.process_request_dict(request, max_model_len=16)
-
-        self.assertEqual(processed.sampling_params.logits_processors_args["think_stop_sentence_token_ids"], [801, 802])
-        self.assertTrue(processed.sampling_params.logits_processors_args["think_prompt_started"])
-        self.assertFalse(processed.sampling_params.logits_processors_args["think_prompt_ended"])
-        self.assertEqual(processed.sampling_params.logits_processors_args["think_prompt_tokens_after_start"], 0)
-
 
 if __name__ == "__main__":
     unittest.main()