diff --git a/fastdeploy/engine/async_llm.py b/fastdeploy/engine/async_llm.py
index 3f99388d685..4afb3dc5c49 100644
--- a/fastdeploy/engine/async_llm.py
+++ b/fastdeploy/engine/async_llm.py
@@ -446,7 +446,7 @@ async def add_request(
)
if envs.ZMQ_SEND_BATCH_DATA and self.connection_manager is not None:
request["zmq_worker_pid"] = self.connection_manager.worker_pid
- if not envs.ENABLE_V1_DATA_PROCESSOR and self.cfg.model_config.enable_mm:
+ if self.cfg.model_config.enable_mm:
self.request_client.send_pyobj(request)
else:
self.request_client.send_json(request)
@@ -543,8 +543,7 @@ async def generate(
)
else:
processed_output = response_item
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- processed_output = RequestOutput.from_dict(processed_output)
+ processed_output = RequestOutput.from_dict(processed_output)
# Enrich outputs with prompt metadata on the first packet
if req_id:
prompt_meta = self._prompt_metadata.get(req_id)
diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index 28776b53ede..a2e72a1c844 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -1165,7 +1165,7 @@ def _insert_zmq_task_to_scheduler(self):
while self.running:
try:
block = True if len(added_requests) == 0 else False
- if not self.cfg.model_config.enable_mm and not envs.ENABLE_V1_DATA_PROCESSOR:
+ if not self.cfg.model_config.enable_mm:
err, data = self.recv_request_server.receive_json_once(block)
else:
err, data = self.recv_request_server.receive_pyobj_once(block)
@@ -1222,8 +1222,7 @@ def _insert_zmq_task_to_scheduler(self):
continue
err_msg = None
try:
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- request = Request.from_dict(data)
+ request = Request.from_dict(data)
request.metrics.scheduler_recv_req_time = time.time()
main_process_metrics.requests_number.inc()
trace_carrier = data.get("trace_carrier")
diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index 3f311e743fd..f03a18594de 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -437,7 +437,7 @@ async def add_requests(self, task):
def _send_task(self, task):
if envs.ZMQ_SEND_BATCH_DATA:
task["zmq_worker_pid"] = self.worker_pid
- if not self.enable_mm and not envs.ENABLE_V1_DATA_PROCESSOR:
+ if not self.enable_mm:
self.zmq_client.send_json(task)
else:
if envs.FD_ENABLE_E2W_TENSOR_CONVERT:
@@ -599,7 +599,7 @@ async def run_control_method(self, request: ControlRequest):
req_dict = request.to_dict()
if envs.ZMQ_SEND_BATCH_DATA:
req_dict["zmq_worker_pid"] = self.worker_pid
- if not self.enable_mm and not envs.ENABLE_V1_DATA_PROCESSOR:
+ if not self.enable_mm:
self.zmq_client.send_json(req_dict)
else:
self.zmq_client.send_pyobj(req_dict)
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 9d380b0db0c..09e06ffa0f6 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -26,7 +26,7 @@
import fastdeploy.envs as envs
import fastdeploy.metrics.trace as tracing
-from fastdeploy.engine.request import Request, RequestOutput
+from fastdeploy.engine.request import RequestOutput
from fastdeploy.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
@@ -145,10 +145,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
prompt_tokens = None
max_tokens = None
try:
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- current_req_dict = request.to_dict_for_infer(f"{request_id}_0")
- else:
- current_req_dict = Request.from_generic_request(request, request_id=f"{request_id}_0")
+ current_req_dict = request.to_dict_for_infer(f"{request_id}_0")
if "chat_template" not in current_req_dict:
current_req_dict["chat_template"] = self.chat_template
current_req_dict["metrics"]["arrival_time"] = time.time()
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index 4caf9fe210a..9c2b386fce0 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -27,7 +27,7 @@
import fastdeploy.envs as envs
import fastdeploy.metrics.trace as tracing
-from fastdeploy.engine.request import Request, RequestOutput
+from fastdeploy.engine.request import RequestOutput
from fastdeploy.entrypoints.openai.protocol import (
CompletionLogprobs,
CompletionRequest,
@@ -178,10 +178,7 @@ async def create_completion(self, request: CompletionRequest):
try:
for idx, prompt in enumerate(request_prompts):
request_id_idx = f"{request_id}_{idx}"
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- current_req_dict = request.to_dict_for_infer(request_id_idx, prompt)
- else:
- current_req_dict = Request.from_generic_request(request, request_id=f"{request_id}_0")
+ current_req_dict = request.to_dict_for_infer(request_id_idx, prompt)
current_req_dict["metrics"]["arrival_time"] = time.time()
prompt_token_ids = await self.engine_client.format_and_add_data(current_req_dict) # tokenize
if isinstance(prompt_token_ids, np.ndarray):
diff --git a/fastdeploy/entrypoints/openai/serving_embedding.py b/fastdeploy/entrypoints/openai/serving_embedding.py
index ec3223b3576..25f3f630510 100644
--- a/fastdeploy/entrypoints/openai/serving_embedding.py
+++ b/fastdeploy/entrypoints/openai/serving_embedding.py
@@ -15,20 +15,17 @@
"""
import base64
-import time
from collections.abc import AsyncGenerator
from typing import Literal, Union
import numpy as np
from typing_extensions import assert_never, override
-import fastdeploy.envs as envs
from fastdeploy.engine.pooling_params import PoolingParams
from fastdeploy.engine.request import (
EmbeddingOutput,
EmbeddingRequestOutput,
PoolingRequestOutput,
- Request,
)
from fastdeploy.entrypoints.openai.protocol import (
EmbeddingCompletionRequest,
@@ -69,25 +66,13 @@ def __init__(self, engine_client, models, cfg, pid, ips, max_waiting_time, chat_
@override
def _request_to_dict(self, ctx: ServeContext):
request: EmbeddingRequest = ctx.request
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- request_dict = super()._request_to_dict(ctx)
- if hasattr(request, "to_pooling_params"):
- pooling_params: PoolingParams = request.to_pooling_params()
- pooling_params.verify("embed", self.cfg.model_config)
- request_dict["pooling_params"] = pooling_params.to_dict()
- request_dict["metrics"] = {}
- return request_dict
- else:
- request_obj = None
- if hasattr(request, "to_pooling_params"):
- pooling_params: PoolingParams = request.to_pooling_params()
- pooling_params.verify("embed", self.cfg.model_config)
- request_obj = Request.from_generic_request(
- req=request, request_id=ctx.request_id, pooling_params=pooling_params
- )
- request_obj.metrics.arrival_time = time.time()
- super()._process_chat_template_kwargs(request_obj)
- return request_obj
+ request_dict = super()._request_to_dict(ctx)
+ if hasattr(request, "to_pooling_params"):
+ pooling_params: PoolingParams = request.to_pooling_params()
+ pooling_params.verify("embed", self.cfg.model_config)
+ request_dict["pooling_params"] = pooling_params.to_dict()
+ request_dict["metrics"] = {}
+ return request_dict
@override
def _request_to_batch_dicts(self, ctx: ServeContext):
diff --git a/fastdeploy/entrypoints/openai/serving_reward.py b/fastdeploy/entrypoints/openai/serving_reward.py
index cbde62deea5..cc3ed8a4729 100644
--- a/fastdeploy/entrypoints/openai/serving_reward.py
+++ b/fastdeploy/entrypoints/openai/serving_reward.py
@@ -14,14 +14,12 @@
# limitations under the License.
"""
-import time
from collections.abc import AsyncGenerator
from typing_extensions import override
-import fastdeploy.envs as envs
from fastdeploy.engine.pooling_params import PoolingParams
-from fastdeploy.engine.request import PoolingRequestOutput, Request, RewardRequestOutput
+from fastdeploy.engine.request import PoolingRequestOutput, RewardRequestOutput
from fastdeploy.entrypoints.openai.protocol import (
ChatRewardData,
ChatRewardRequest,
@@ -46,25 +44,13 @@ def __init__(self, engine_client, models, cfg, pid, ips, max_waiting_time, chat_
@override
def _request_to_dict(self, ctx: ServeContext):
request: ChatRewardRequest = ctx.request
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- request_dict = super()._request_to_dict(ctx)
- if hasattr(request, "to_pooling_params"):
- pooling_params: PoolingParams = request.to_pooling_params()
- pooling_params.verify("reward", self.cfg.model_config)
- request_dict["pooling_params"] = pooling_params.to_dict()
- request_dict["metrics"] = {}
- return request_dict
- else:
- request_obj: Request = None
- if hasattr(request, "to_pooling_params"):
- pooling_params: PoolingParams = request.to_pooling_params()
- pooling_params.verify("reward", self.cfg.model_config)
- request_obj = Request.from_generic_request(
- req=request, request_id=ctx.request_id, pooling_params=pooling_params
- )
- request_obj.metrics.arrival_time = time.time()
- super()._process_chat_template_kwargs(request_obj)
- return request_obj
+ request_dict = super()._request_to_dict(ctx)
+ if hasattr(request, "to_pooling_params"):
+ pooling_params: PoolingParams = request.to_pooling_params()
+ pooling_params.verify("reward", self.cfg.model_config)
+ request_dict["pooling_params"] = pooling_params.to_dict()
+ request_dict["metrics"] = {}
+ return request_dict
@override
def _request_to_batch_dicts(self, ctx: ServeContext):
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index 72cd6dc7c48..fef58eaf6cc 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -94,8 +94,6 @@ def _validate_split_kv_size(value: int) -> int:
"EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"),
# enable kv cache block scheduler v1 (no need for kv_cache_ratio)
"ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")),
- # enable data processor v2
- "ENABLE_V1_DATA_PROCESSOR": lambda: int(os.getenv("ENABLE_V1_DATA_PROCESSOR", "0")),
# set prealloc block num for decoder
"FD_ENC_DEC_BLOCK_NUM": lambda: int(os.getenv("FD_ENC_DEC_BLOCK_NUM", "2")),
# enbale max prefill of one execute step
diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py
index 04c028d9060..56bbe1296f7 100644
--- a/fastdeploy/input/preprocess.py
+++ b/fastdeploy/input/preprocess.py
@@ -19,7 +19,6 @@
from fastdeploy.config import ErnieArchitectures, ModelConfig
from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
from fastdeploy.reasoning import ReasoningParserManager
-from fastdeploy.utils import envs
from fastdeploy.utils import llm_logger as logger
@@ -83,10 +82,7 @@ def create_processor(self):
logger.info(f"Plugin input processor not available ({e}), using built-in processor")
if not self.model_config.enable_mm:
if not ErnieArchitectures.contains_ernie_arch(architecture):
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- from fastdeploy.input.text_processor import DataProcessor
- else:
- from fastdeploy.input.v1.text_processor import DataProcessor
+ from fastdeploy.input.text_processor import DataProcessor
self.processor = DataProcessor(
model_name_or_path=self.model_name_or_path,
@@ -94,14 +90,7 @@ def create_processor(self):
tool_parser_obj=tool_parser_obj,
)
else:
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- from fastdeploy.input.ernie4_5_processor import (
- Ernie4_5Processor,
- )
- else:
- from fastdeploy.input.v1.ernie4_5_processor import (
- Ernie4_5Processor,
- )
+ from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
self.processor = Ernie4_5Processor(
model_name_or_path=self.model_name_or_path,
@@ -110,14 +99,9 @@ def create_processor(self):
)
else:
if ErnieArchitectures.contains_ernie_arch(architecture):
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- from fastdeploy.input.ernie4_5_vl_processor import (
- Ernie4_5_VLProcessor,
- )
- else:
- from fastdeploy.input.v1.ernie4_5_vl_processor import (
- Ernie4_5_VLProcessor,
- )
+ from fastdeploy.input.ernie4_5_vl_processor import (
+ Ernie4_5_VLProcessor,
+ )
self.processor = Ernie4_5_VLProcessor(
model_name_or_path=self.model_name_or_path,
@@ -128,14 +112,9 @@ def create_processor(self):
enable_processor_cache=self.enable_processor_cache,
)
elif "PaddleOCRVL" in architecture:
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- from fastdeploy.input.paddleocr_vl_processor import (
- PaddleOCRVLProcessor,
- )
- else:
- from fastdeploy.input.v1.paddleocr_vl_processor import (
- PaddleOCRVLProcessor,
- )
+ from fastdeploy.input.paddleocr_vl_processor import (
+ PaddleOCRVLProcessor,
+ )
self.processor = PaddleOCRVLProcessor(
config=self.model_config,
@@ -145,12 +124,7 @@ def create_processor(self):
reasoning_parser_obj=reasoning_parser_obj,
)
elif "Qwen2_5_VL" in architecture:
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
- else:
- from fastdeploy.input.v1.qwen_vl_processor import (
- QwenVLProcessor,
- )
+ from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
self.processor = QwenVLProcessor(
config=self.model_config,
@@ -161,12 +135,7 @@ def create_processor(self):
enable_processor_cache=self.enable_processor_cache,
)
elif "Qwen3VL" in architecture:
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor
- else:
- from fastdeploy.input.v1.qwen3_vl_processor import (
- Qwen3VLProcessor,
- )
+ from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor
self.processor = Qwen3VLProcessor(
config=self.model_config,
diff --git a/fastdeploy/input/v1/__init__.py b/fastdeploy/input/v1/__init__.py
deleted file mode 100644
index f4ede90624a..00000000000
--- a/fastdeploy/input/v1/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
diff --git a/fastdeploy/input/v1/ernie4_5_processor.py b/fastdeploy/input/v1/ernie4_5_processor.py
deleted file mode 100644
index f6545dc068a..00000000000
--- a/fastdeploy/input/v1/ernie4_5_processor.py
+++ /dev/null
@@ -1,641 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import os
-
-import numpy as np
-from paddleformers.generation import GenerationConfig
-
-from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-from fastdeploy.input.v1.text_processor import BaseDataProcessor
-from fastdeploy.utils import data_processor_logger
-
-_SAMPLING_EPS = 1e-5
-from fastdeploy.input.utils import process_stop_token_ids
-
-
-class Ernie4_5Processor(BaseDataProcessor):
- """
- 初始化模型实例。
-
- Args:
- model_name_or_path (str): 模型名称或路径。
-
- Attributes:
- model_name_or_path (str): 存储模型名称或路径。
- decode_status (dict): 存储解码状态信息。
- tokenizer (object): 存储分词器实例。
- eos_token_ids (list): 存储结束符号的token ID列表。
- eos_token_id_len (int): 存储结束符号的token ID列表的长度。
- pad_token_id (int): 存储填充符号的token ID。
- """
-
- def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None):
-
- self.model_name_or_path = model_name_or_path
- data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
-
- # Generation config
- try:
- self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path)
- except Exception as e:
- data_processor_logger.warning(
- f"Can't find generation config, so it will not use "
- f"generation_config field in the model config, details={e}"
- )
- self.generation_config = None
-
- self.decode_status = dict()
- self.tool_parser_dict = dict()
- self.thinking_parser_dict = dict()
- self.model_status_dict = dict()
- self._load_tokenizer()
- data_processor_logger.info(
- f"tokenizer information: bos_token is {self.tokenizer.bos_token} \
- {self.tokenizer.bos_token_id}, \
- eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
- )
- try:
- from paddleformers.trl.llm_utils import get_eos_token_id
- except Exception:
- from paddleformers.cli.utils.llm_utils import get_eos_token_id
-
- self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config)
- self.eos_token_id_len = len(self.eos_token_ids)
- self.pad_token_id = self.get_pad_id()
- self.reasoning_parser = None
- self.tool_parser_obj = tool_parser_obj
- if reasoning_parser_obj:
- self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
-
- def process_request(self, request, max_model_len=None, **kwargs):
- """
- Preprocess the request
-
- Args:
- request (Dict): may contain text and messages fields
-
- Returns:
- bool: Whether preprocessing is successful
- str: error message
- """
- data_processor_logger.info(f"Start processing request: {request}")
- request = self._apply_default_parameters(request)
- if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
- request.eos_token_ids = self.eos_token_ids
-
- # processing stop_sequences and stop_token_ids
- process_stop_token_ids(request, self.update_stop_seq)
-
- # processing bad_words
- bad_words = request.get("bad_words")
- bad_words_token_ids = request.get("bad_words_token_ids")
- if bad_words:
- bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
- request["bad_words_token_ids"] = bad_words_token_ids
-
- logits_processors_args = self._prepare_think_stop_sentence(
- request.get("logits_processors_args") or {}, max_model_len
- )
- request["logits_processors_args"] = logits_processors_args
-
- # processing prompt_token_ids
- if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
- if request.prompt is not None:
- # prompt = request.prompt if request.prompt is not None else request.messages[0]
- prompt = request.prompt
- assert isinstance(prompt, str) or (
- isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
- ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
-
- if isinstance(prompt, list): # if prompt is a token id list
- request.prompt_token_ids = prompt
- else:
- tokens = self.tokenizer.tokenize(prompt)
- token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
- request.prompt_token_ids = token_ids
- data_processor_logger.debug(
- f"request_ids: {request.request_id}, prompt: {prompt}, "
- f"tokens: {tokens}, token_ids: {token_ids}"
- )
- elif request.messages is not None:
- task = request.to_dict()
- chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
- if chat_template_kwargs:
- if isinstance(chat_template_kwargs, dict):
- for k, v in chat_template_kwargs.items():
- if k not in task or task[k] is None:
- task[k] = v
- else:
- raise ValueError("Invalid input: chat_template_kwargs must be a dict")
- request.prompt_token_ids = self.messages2ids(task, **chat_template_kwargs)
- else:
- raise ValueError(f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
-
- if len(request.prompt_token_ids) == 0:
- raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
-
- # truncate prompts that exceed the length limit
- if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
- request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
- logits_processors_args = self._update_thinking_prompt_state(
- request.prompt_token_ids, request.get("logits_processors_args") or {}
- )
- request["logits_processors_args"] = logits_processors_args
- max_tokens = max_model_len - len(request.prompt_token_ids)
- if request.get("max_tokens") is None:
- request.set("max_tokens", max(1, max_tokens))
- else:
- request.set("max_tokens", min(max_tokens, request.get("max_tokens")))
- if request.get("temperature") < _SAMPLING_EPS:
- # zero temperature is equivalent to greedy sampling
- request.set("temperature", 1)
- request.set("top_k", 1)
- if request.get("top_p") < _SAMPLING_EPS:
- request.set("top_p", _SAMPLING_EPS)
- request.set("top_k", 1)
- if self.reasoning_parser:
- model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
- parts = request.request_id.split("_")
- if len(parts) > 1:
- real_req_id = parts[0]
- index = int(parts[1])
- n = request.get("n", 1)
- for idx in range(index * n, (index + 1) * n):
- self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
- else:
- self.model_status_dict[request.request_id] = model_status
- request.enable_thinking = model_status == "think_start"
- if request.get("response_max_tokens") is not None and request.enable_thinking is False:
- request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
-
- data_processor_logger.info(f"Processed request: {request}")
- return request
-
- def process_request_dict(self, request, max_model_len=None, **kwargs):
- """
- Preprocess the request
-
- Args:
- request Request: may contain text and messages fields
-
- Returns:
- bool: Whether preprocessing is successful
- str: error message
- """
- data_processor_logger.info(f"Start processing request: {request}")
- request = self._apply_default_parameters(request)
- if not request.eos_token_ids:
- request.eos_token_ids = self.eos_token_ids
-
- # processing stop_sequences and stop_token_ids
- process_stop_token_ids(request, self.update_stop_seq)
-
- # processing bad_words
- bad_words = request.sampling_params.bad_words
- bad_words_token_ids = request.sampling_params.bad_words_token_ids
- if bad_words:
- bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
- request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
- logits_processors_args = self._prepare_think_stop_sentence(
- getattr(request.sampling_params, "logits_processors_args", None) or {}, max_model_len
- )
- request.sampling_params.logits_processors_args = logits_processors_args
-
- # processing prompt_token_ids
- if not request.prompt_token_ids:
- if request.prompt:
- prompt = request.prompt
- assert isinstance(prompt, str) or (
- isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
- ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
- if isinstance(prompt, list): # if prompt is a token id list
- request.prompt_token_ids = prompt
- else:
- request.prompt_tokens = prompt
- tokens = self.tokenizer.tokenize(prompt)
- token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
- request.prompt_token_ids = token_ids
- req_id = request.request_id
- data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
- elif request.messages:
- chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
- if not chat_template_kwargs:
- chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
- if chat_template_kwargs:
- if isinstance(chat_template_kwargs, dict):
- for k, v in chat_template_kwargs.items():
- if not getattr(request, k, None):
- setattr(request, k, v)
- else:
- raise ValueError("Invalid input: chat_template_kwargs must be a dict")
- if getattr(request, "enable_thinking") is None:
- setattr(request, "enable_thinking", True)
- request.prompt_token_ids = self.messages2ids(request, **chat_template_kwargs)
- delattr(request, "chat_template_kwargs")
- else:
- raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
-
- if len(request.prompt_token_ids) == 0:
- raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
-
- # truncate prompts that exceed the length limit
- if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
- request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
- logits_processors_args = self._update_thinking_prompt_state(
- request.prompt_token_ids, getattr(request.sampling_params, "logits_processors_args", None) or {}
- )
- request.sampling_params.logits_processors_args = logits_processors_args
- max_tokens = max_model_len - len(request.prompt_token_ids)
- if getattr(request.sampling_params, "max_tokens", None) is None:
- request.sampling_params.max_tokens = max(1, max_tokens)
- else:
- request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
- if request.sampling_params.temperature < _SAMPLING_EPS:
- # zero temperature is equivalent to greedy sampling
- request.sampling_params.temperature = 1
- request.sampling_params.top_k = 1
- if request.sampling_params.top_p < _SAMPLING_EPS:
- request.sampling_params.top_p = _SAMPLING_EPS
- request.sampling_params.top_k = 1
-
- if self.reasoning_parser:
- model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
- parts = request.request_id.split("_")
- if len(parts) > 1:
- real_req_id = parts[0]
- index = int(parts[1])
- n = request.sampling_params.n or 1
- for idx in range(index * n, (index + 1) * n):
- self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
- else:
- self.model_status_dict[request.request_id] = model_status
- request.enable_thinking = model_status == "think_start"
- if request.sampling_params.response_max_tokens is not None and request.enable_thinking is False:
- request.sampling_params.max_tokens = min(
- request.sampling_params.response_max_tokens, request.sampling_params.max_tokens
- )
-
- data_processor_logger.info(f"Processed request: {request}")
- return request
-
- def process_response(self, response_dict, **kwargs):
- """
- Preprocess the response
-
- Args:
- response_dict (Dict): response for engine, contain ids fields
-
- Returns:
- Dict: response contain text fields
- """
- req_id = response_dict.request_id
- token_ids = response_dict.outputs.token_ids
-
- response_dict.usage = {"completion_tokens": response_dict.outputs.index + 1}
- if token_ids[-1] == self.tokenizer.eos_token_id:
- token_ids = token_ids[:-1]
- full_text = self.tokenizer.decode(token_ids)
- if self.reasoning_parser:
- reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
- full_text,
- response_dict,
- self.model_status_dict[req_id],
- )
- response_dict.outputs.text = text
- response_dict.outputs.reasoning_content = reasoning_content
- else:
- response_dict.outputs.text = full_text
- if self.tool_parser_obj:
- tool_parser = self.tool_parser_obj(self.tokenizer)
- tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
- if tool_call_info.tools_called:
- response_dict.outputs.tool_calls = tool_call_info.tool_calls
- response_dict.outputs.text = tool_call_info.content
- if req_id in self.model_status_dict:
- del self.model_status_dict[req_id]
- data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
- if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
- return None
- return response_dict
-
- def process_response_dict(self, response_dict, stream, **kwargs):
- """
- Preprocess the response
-
- Args:
- response_dict (Dict): response for engine, contain ids fields
-
- Returns:
- Dict: response contain text fields
- """
- if stream:
- return self.process_response_obj_streaming(response_dict, **kwargs)
- else:
- return self.process_response_obj_normal(response_dict, **kwargs)
-
- def process_response_obj_normal(self, response_obj, **kwargs):
- """
- Preprocess the response
-
- Args:
- response_obj : response for engine, contain ids fields
-
- Returns:
- Dict: response contain text fields
- """
- token_ids = response_obj.outputs.token_ids
- is_end = response_obj.finished
- req_id = response_obj.request_id
- request = kwargs.get("request", None)
- if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"):
- if token_ids[-1] == self.tokenizer.eos_token_id:
- token_ids = token_ids[:-1]
- delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
- if is_end:
- full_text = previous_texts + delta_text
- response_obj.outputs.text = full_text
- if self.reasoning_parser:
- reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
- full_text,
- request,
- self.model_status_dict[req_id],
- )
- response_obj.outputs.text = text
- response_obj.outputs.reasoning_content = reasoning_content
- reasoning_tokens = self.tokenizer.tokenize(reasoning_content)
- response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
- if self.tool_parser_obj:
- tool_parser = self.tool_parser_obj(self.tokenizer)
- tool_call_info = tool_parser.extract_tool_calls(full_text, request)
- if tool_call_info.tools_called:
- response_obj.outputs.tool_calls = tool_call_info.tool_calls
- response_obj.outputs.text = tool_call_info.content
- response_obj.outputs.completion_tokens = full_text
- data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
- del self.decode_status[req_id]
- if req_id in self.model_status_dict:
- del self.model_status_dict[req_id]
- return response_obj
-
- def process_response_obj_streaming(self, response_obj, **kwargs):
- """
- Preprocess the response streaming
-
- Args:
- response_obj : response for engine, contain ids fields
-
- Returns:
- Dict: response contain text fields
- """
- token_ids = response_obj.outputs.token_ids
- is_end = response_obj.finished
- req_id = response_obj.request_id
- request = kwargs.get("request", None)
-
- if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"):
- if token_ids[-1] == self.tokenizer.eos_token_id:
- token_ids = token_ids[:-1]
- delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
- response_obj.outputs.completion_tokens = delta_text
- if self.reasoning_parser:
- reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
- previous_texts,
- previous_texts + delta_text,
- delta_text,
- previous_token_ids,
- previous_token_ids + token_ids,
- token_ids,
- self.model_status_dict[req_id],
- )
- response_obj.outputs.delta_message = reasoning_delta_message
- reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
- reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
- response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
- response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
- response_obj.outputs.reasoning_content = reasoning_content
- response_obj.outputs.text = (
- reasoning_delta_message.content or ""
- if reasoning_delta_message and hasattr(reasoning_delta_message, "content")
- else ""
- )
- else:
- response_obj.outputs.text = delta_text
- if self.tool_parser_obj:
- if req_id not in self.tool_parser_dict:
- self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer)
- tool_parser = self.tool_parser_dict[req_id]
- tool_call_delta_message = tool_parser.extract_tool_calls_streaming(
- previous_texts,
- previous_texts + delta_text,
- delta_text,
- previous_token_ids,
- previous_token_ids + token_ids,
- token_ids,
- request,
- )
- if tool_call_delta_message is None or tool_call_delta_message.tool_calls:
- response_obj.outputs.delta_message = tool_call_delta_message
-
- if is_end:
- data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
- del self.decode_status[req_id]
- if req_id in self.tool_parser_dict:
- del self.tool_parser_dict[req_id]
- if req_id in self.model_status_dict:
- del self.model_status_dict[req_id]
- return response_obj
-
- def messages2ids(self, request_or_messages, **kwargs):
- """
- Convert multi-turn messages into ID sequences.
-
- Args:
- request_or_messages: Either a request dict containing 'messages' field,
- or a list of message dicts directly
-
- Returns:
- List of token IDs as strings (converted from token objects)
- """
- if self.tokenizer.chat_template is None:
- raise ValueError("This model does not support chat_template.")
- message_dict = {
- key: getattr(request_or_messages, key, None)
- for key in ["messages", "tools", "documents", "enable_thinking", "system"]
- if getattr(request_or_messages, key, None) is not None
- }
- spliced_message = self.tokenizer.apply_chat_template(
- message_dict,
- tokenize=False,
- split_special_tokens=False,
- add_special_tokens=False,
- **kwargs,
- )
- request_or_messages.prompt_tokens = spliced_message
- req_id = getattr(request_or_messages, "request_id", None)
- tokens = self.tokenizer.tokenize(spliced_message)
- token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
- data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
- return token_ids
-
- def ids2tokens(self, token_id, task_id):
- """
- token ids to strings
-
- Args:
- token_ids (List[int]): token ids
- task_id (str): task id
-
- Returns:
- List[str]: strings
- """
-
- if task_id not in self.decode_status:
- # prefix offset & read offset & history token ids & history token strings
- self.decode_status[task_id] = [0, 0, [], ""]
-
- prefix_offset = self.decode_status[task_id][0]
- read_offset = self.decode_status[task_id][1]
- previous_token_ids = self.decode_status[task_id][2]
- previous_texts = self.decode_status[task_id][3]
- decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
- previous_token_ids + token_id, prefix_offset, read_offset
- )
- self.decode_status[task_id][0] = prefix_offset
- self.decode_status[task_id][1] = read_offset
- self.decode_status[task_id][2] += token_id
- self.decode_status[task_id][3] += decode_str
-
- return decode_str, previous_token_ids, previous_texts
-
- def _load_tokenizer(self):
- """
- load tokenizer
-
- Returns:
- tokenizer (AutoTokenizer)
- """
- vocab_file_names = [
- "tokenizer.model",
- "spm.model",
- "ernie_token_100k.model",
- ]
- for i in range(len(vocab_file_names)):
- if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
- Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
- break
- self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
-
- def get_pad_id(self):
- """
- get pad_token_id, if not pad_token_id, use eos_token
-
- Returns:
- int: pad_token_id
- """
- # if isinstance(self.tokenizer, (LlamaTokenizer, Llama3Tokenizer)) and not self.tokenizer.pad_token_id:
- # return self.tokenizer.eos_token
- return self.tokenizer.pad_token_id
-
- def pad_batch_data(
- self,
- insts,
- pad_id=0,
- return_seq_len=False,
- return_array=True,
- pad_style="right",
- ):
- """Pad the instances to the max sequence length in batch."""
- if len(insts) == 0:
- padded_insts = np.array([[]], dtype=np.int64) if return_array else [[]]
- if return_seq_len:
- seq_len = np.array([], dtype=np.int64) if return_array else []
- return padded_insts, seq_len
- return padded_insts
-
- max_len = max(map(len, insts))
- if pad_style == "left":
- padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts]
- else:
- padded_insts = [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts]
- if return_array:
- padded_insts = np.array(padded_insts, dtype=np.int64).reshape([-1, max_len])
-
- if return_seq_len:
- seq_len = [len(inst) for inst in insts]
- if return_array:
- seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1)
- return padded_insts, seq_len
- return padded_insts
-
- def update_stop_seq(self, stop_sequences):
- """
- Update stop sequences from request.
- """
- stop_seqs = []
- if isinstance(stop_sequences, str):
- stop_sequences = [stop_sequences]
- for seq in stop_sequences:
- if seq != self.tokenizer.eos_token_id:
- stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq)))
- stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False)
- data_processor_logger.debug(f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
- return stop_seqs, stop_seqs_len
-
- def process_logprob_response(self, token_ids, **kwargs):
- full_text = self.tokenizer.decode(token_ids, **kwargs)
- return full_text
-
- def update_bad_words(self, bad_words, bad_words_token_ids):
- """Support bad words"""
-
- token_ids = bad_words_token_ids
-
- if token_ids is None:
- token_ids = []
- for bad_word in bad_words:
- # To prohibit words both at the beginning
- # and in the middle of text
- # (related to add_prefix_space tokenizer parameter)
- for add_prefix_space in [False, True]:
- prefix = " " if add_prefix_space else ""
- prompt = prefix + bad_word.lstrip()
- prompt_token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(prompt))
- data_processor_logger.debug(f"processed bad_words: {prompt}, {prompt_token_ids}")
-
- if len(prompt_token_ids) != 1:
- if not add_prefix_space:
- data_processor_logger.warning(
- f"Skip bad_words: <{prompt}>."
- f"Bad words should be a single token."
- f"Got tokens: {prompt_token_ids}."
- )
- continue
-
- if prompt_token_ids[0] > self.tokenizer.vocab_size:
- if not add_prefix_space:
- data_processor_logger.warning(
- f"Skip bad_words: <{prompt}>."
- f"All token id values should be satisfying:"
- f" 0 <= token_id < {self.tokenizer.vocab_size}."
- f"Got token: {prompt_token_ids}."
- )
- continue
-
- if prompt_token_ids not in token_ids:
- token_ids.extend(prompt_token_ids)
- return token_ids
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/__init__.py b/fastdeploy/input/v1/ernie4_5_vl_processor/__init__.py
deleted file mode 100644
index f7d30a78d58..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .ernie4_5_vl_processor import Ernie4_5_VLProcessor
-from .process import DataProcessor, fancy_print
-from .process_video import read_video_decord
-from .utils.video_utils import VideoReaderWrapper
-
-__all__ = [
- "DataProcessor",
- "fancy_print",
- "VideoReaderWrapper",
- "read_video_decord",
- "Ernie4_5_VLProcessor",
-]
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/v1/ernie4_5_vl_processor/ernie4_5_vl_processor.py
deleted file mode 100644
index bae80b60a96..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ /dev/null
@@ -1,340 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import traceback
-
-import numpy as np
-from paddleformers.generation import GenerationConfig
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
-from fastdeploy.input.v1.ernie4_5_processor import Ernie4_5Processor
-from fastdeploy.utils import data_processor_logger
-
-from .process import DataProcessor
-
-_SAMPLING_EPS = 1e-5
-
-
-class Ernie4_5_VLProcessor(Ernie4_5Processor):
- """The processor class for ERNIE MoE VL models."""
-
- def __init__(
- self,
- model_name_or_path,
- limit_mm_per_prompt=None,
- mm_processor_kwargs=None,
- reasoning_parser_obj=None,
- tool_parser_obj=None,
- enable_processor_cache=False,
- ):
- data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
- tokenizer_path = model_name_or_path
- preprocessor_path = model_name_or_path
- processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
-
- self.ernie4_5_processor = DataProcessor(
- tokenizer_name=tokenizer_path,
- image_preprocessor_name=preprocessor_path,
- enable_processor_cache=enable_processor_cache,
- **processor_kwargs,
- )
- self.ernie4_5_processor.eval()
- self.image_patch_id = self.ernie4_5_processor.image_patch_id
- self.spatial_conv_size = self.ernie4_5_processor.spatial_conv_size
-
- self.tool_parser_dict = dict()
- self.decode_status = dict()
- self.model_status_dict = dict()
- self._load_tokenizer()
-
- # Generation config
- try:
- self.generation_config = GenerationConfig.from_pretrained(model_name_or_path)
- except Exception as e:
- data_processor_logger.warning(
- f"Can't find generation config: {e}, so it will not use generation_config field in the model config"
- )
- self.generation_config = None
-
- # self.eos_token_ids = [self.tokenizer.eos_token_id]
- try:
- from paddleformers.trl.llm_utils import get_eos_token_id
- except Exception:
- from paddleformers.cli.utils.llm_utils import get_eos_token_id
-
- self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config)
- self.eos_token_id_len = len(self.eos_token_ids)
- self.pad_token_id = self.get_pad_id()
- self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
- self.reasoning_parser = None
- if reasoning_parser_obj:
- self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
- self.tool_parser_obj = tool_parser_obj
-
- def get_pad_id(self):
- """get pad id"""
- return self.tokenizer.pad_token_id
-
- def _load_tokenizer(self):
- """
- load tokenizer
-
- Returns:
- tokenizer (AutoTokenizer)
- """
- self.tokenizer = self.ernie4_5_processor.tokenizer
-
- def _apply_default_parameters(self, request):
- """
- Apply default value for parameters in request
- """
-
- def set_value(req, key, value):
- value = getattr(self.generation_config, key, value)
- if getattr(req.sampling_params, key) is None:
- setattr(req.sampling_params, key, value)
-
- set_value(request, "top_p", 0.7)
- set_value(request, "temperature", 1.0)
- set_value(request, "repetition_penalty", 1.0)
- set_value(request, "frequency_penalty", 0.0)
- set_value(request, "presence_penalty", 0.0)
- return request
-
- def _parse_processor_kwargs(self, kwargs):
- """解析多模态处理器参数配置"""
- if not kwargs:
- return {}
-
- try:
- if not isinstance(kwargs, dict):
- raise ValueError("mm-processor-kwargs must be a dictionary")
-
- # 验证参数类型
- data_processor_logger.info(f"kwargs:{kwargs}")
- expected_types = {
- "spatial_conv_size": int,
- "temporal_conv_size": int,
- "image_min_pixels": int,
- "image_max_pixels": int,
- "video_min_pixels": int,
- "video_max_pixels": int,
- "video_target_frames": int,
- "video_frames_sample": str,
- "video_max_frames": int,
- "video_min_frames": int,
- "video_fps": int,
- }
-
- for key, value in kwargs.items():
- if key in expected_types and not isinstance(value, expected_types[key]):
- raise ValueError(
- f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
- )
-
- return kwargs
-
- except Exception as e:
- data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}, {str(traceback.format_exc())}")
- return {}
-
- def _parse_limits(self, limits):
- """解析多模态限制配置"""
- DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}
-
- if not limits:
- return DEFAULT_LIMITS
-
- try:
- if not isinstance(limits, dict):
- raise ValueError("limit-mm-per-prompt must be a dictionary")
- data_processor_logger.info(f"_parse_limits:{limits}")
- return {**DEFAULT_LIMITS, **limits}
- except Exception as e:
- data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
- return DEFAULT_LIMITS
-
- def _check_mm_limits(self, item):
- if isinstance(item, dict):
- # 请求包含prompt和multi_modal_data
- mm_data = item
- else:
- # 请求包含messages
- mm_data = {"image": [], "video": []}
-
- for message in item:
- if isinstance(message.get("content"), list):
- for part in message["content"]:
- if part.get("type") == "image":
- mm_data["image"].append(part)
- elif part.get("type") == "video":
- mm_data["video"].append(part)
-
- for modality, data in mm_data.items():
- if modality in self.limit_mm_per_prompt:
- limit = self.limit_mm_per_prompt[modality]
- if len(data) > limit:
- raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
-
- def process_request(self, request, max_model_len=None, **kwargs):
- """process the input data"""
- task = request.to_dict()
- task["chat_template_kwargs"] = kwargs.get("chat_template_kwargs")
- self.process_request_dict(task, max_model_len)
- request = Request.from_dict(task)
- request = self._apply_default_parameters(request)
-
- return request
-
- def process_request_dict(self, request, max_model_len=None, **kwargs):
- """process the input data"""
-
- request = self._apply_default_parameters(request)
- if not request.eos_token_ids:
- request.eos_token_ids = self.eos_token_ids
-
- # processing stop_sequences and stop_token_ids
- process_stop_token_ids(request, self.update_stop_seq)
-
- bad_words = request.sampling_params.bad_words
- bad_words_token_ids = request.sampling_params.bad_words_token_ids
- if bad_words:
- bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
- request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
- logits_processors_args = self._prepare_think_stop_sentence(
- getattr(request.sampling_params, "logits_processors_args", None) or {}, max_model_len
- )
- request.sampling_params.logits_processors_args = logits_processors_args
-
- if request.prompt_token_ids:
- messages = request.messages
- if messages:
- self._check_mm_limits(messages)
- if getattr(request, "enable_thinking") is None:
- setattr(request, "enable_thinking", True)
- outputs = self.ernie4_5_processor.prompt_token_ids2outputs(request)
- elif request.prompt:
- multimodal_data = request.multimodal_data
- if multimodal_data is None:
- multimodal_data = {}
- self._check_mm_limits(multimodal_data)
- images = multimodal_data.get("image", None)
- videos = multimodal_data.get("video", None)
- request.prompt_tokens = request.prompt
- outputs = self.ernie4_5_processor.text2ids(request.prompt, images, videos)
- elif request.messages:
- messages = request.messages
- self._check_mm_limits(messages)
- chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
- if not chat_template_kwargs:
- chat_template_kwargs = request.chat_template_kwargs
- if chat_template_kwargs:
- if isinstance(chat_template_kwargs, dict):
- for k, v in chat_template_kwargs.items():
- if getattr(request, k, None) is None:
- setattr(request, k, v)
- else:
- raise ValueError("Invalid input: chat_template_kwargs must be a dict")
- if getattr(request, "enable_thinking") is None:
- setattr(request, "enable_thinking", True)
- outputs = self.ernie4_5_processor.request2ids(request)
- delattr(request, "chat_template_kwargs")
- else:
- raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
- if request.completion_token_ids:
- self.append_completion_tokens(outputs, request.completion_token_ids)
-
- outputs = self.pack_outputs(outputs)
- request.prompt_token_ids = (
- outputs["input_ids"].tolist()
- if not getattr(request, "prompt_token_ids", None)
- else request.prompt_token_ids
- )
- request.prompt_token_ids_len = len(request.prompt_token_ids)
- request.multimodal_inputs = outputs
-
- # 截断超过长度限制的prompt
- if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
- request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
- logits_processors_args = self._update_thinking_prompt_state(
- request.prompt_token_ids, getattr(request.sampling_params, "logits_processors_args", None) or {}
- )
- request.sampling_params.logits_processors_args = logits_processors_args
-
- max_tokens = max_model_len - len(request.prompt_token_ids)
- if getattr(request.sampling_params, "max_tokens", None) is None:
- request.sampling_params.max_tokens = max(1, max_tokens)
- else:
- request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
- if request.sampling_params.reasoning_max_tokens is None:
- request.sampling_params.reasoning_max_tokens = max(int(request.sampling_params.max_tokens * 0.8), 1)
- request.reasoning_max_tokens = request.sampling_params.reasoning_max_tokens
- data_processor_logger.info(f"Processed request {request}")
-
- if self.reasoning_parser:
- model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
- parts = request.request_id.split("_")
- if len(parts) > 1:
- real_req_id = parts[0]
- index = int(parts[1])
- n = request.sampling_params.n or 1
- for idx in range(index * n, (index + 1) * n):
- self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
- else:
- self.model_status_dict[request.request_id] = model_status
- request.enable_thinking = model_status == "think_start"
- if request.sampling_params.top_p is not None and request.sampling_params.top_p < _SAMPLING_EPS:
- request.sampling_params.top_p = _SAMPLING_EPS
- request.sampling_params.top_k = 1
- if request.sampling_params.response_max_tokens is not None and request.enable_thinking is False:
- request.sampling_params.max_tokens = min(
- request.sampling_params.response_max_tokens, request.sampling_params.max_tokens
- )
- return request
-
- def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
- "append already completion tokens"
-
- num_tokens = len(completion_token_ids)
- multimodal_inputs["input_ids"].extend(completion_token_ids)
- multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
-
- start = multimodal_inputs["cur_position"]
- for i in range(num_tokens):
- multimodal_inputs["position_ids"].append([start + i] * 3)
- multimodal_inputs["cur_position"] += num_tokens
-
- def pack_outputs(self, outs):
- # Stack or nullify image-related fields
- if not outs["images"]:
- outs["images"] = None
- outs["grid_thw"] = None
- outs["image_type_ids"] = None
- else:
- outs["images"] = np.vstack(outs["images"])
- outs["grid_thw"] = np.vstack(outs["grid_thw"])
- outs["image_type_ids"] = np.array(outs["image_type_ids"])
-
- outs["image_patch_id"] = self.image_patch_id
- # Convert lists to arrays
- outs["input_ids"] = np.array(outs["input_ids"], dtype=np.int64)
- outs["token_type_ids"] = np.array(outs["token_type_ids"], dtype=np.int64)
- outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64)
- outs["mm_num_token_func"] = self.ernie4_5_processor.mm_num_tokens
- return outs
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/__init__.py b/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/__init__.py
deleted file mode 100644
index c11444e6758..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .get_image_preprocessor import get_image_preprocessor
-from .image_preprocessor_adaptive import AdaptiveImageProcessor
-
-__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"]
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py b/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py
deleted file mode 100644
index 0ff6f7d1ed5..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-"""get image preprocessor"""
-
-from fastdeploy.utils import data_processor_logger
-
-from .image_preprocessor_adaptive import AdaptiveImageProcessor
-
-
-def get_image_preprocessor(args):
- """
- get_image_preprocessor from args
- """
-
- if args.vision_model_name_or_path is None:
- return None
-
- data_processor_logger.info("use AdaptiveImageProcessor")
- image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
- return image_preprocess
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py b/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py
deleted file mode 100644
index 6dcdf3a4e96..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py
+++ /dev/null
@@ -1,587 +0,0 @@
-"""
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-"""image preprocessor adaptive"""
-
-import math
-from typing import List, Optional, Union
-
-import numpy as np
-import paddle
-import PIL
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_transforms import (
- convert_to_rgb,
- normalize,
- rescale,
- resize,
- to_channel_dimension_format,
-)
-from paddleformers.transformers.image_utils import (
- ChannelDimension,
- ImageInput,
- PILImageResampling,
- get_image_size,
- infer_channel_dimension_format,
- is_valid_image,
- make_list_of_images,
- to_numpy_array,
- valid_images,
-)
-from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
-from PIL import Image
-
-from fastdeploy.utils import data_processor_logger
-
-OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
-OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
-
-IMAGE_FACTOR = 28
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-MAX_RATIO = 200
-
-
-VideoInput = Union[
- List["PIL.Image.Image"],
- "np.ndarray",
- "paddle.Tensor",
- List["np.ndarray"],
- List["paddle.Tensor"],
- List[List["PIL.Image.Image"]],
- List[List["np.ndarrray"]],
- List[List["paddle.Tensor"]],
-]
-
-
-__all__ = [
- "AdaptiveImageProcessor",
-]
-
-
-def is_scaled_image(image: np.ndarray) -> bool:
- """
- Checks to see whether the pixel values have already been rescaled to [0, 1].
- """
- if image.dtype == np.uint8:
- return False
-
- # It's possible the image has pixel values in [0, 255] but is of floating type
- return np.min(image) >= 0 and np.max(image) <= 1
-
-
-def make_batched_images(images) -> List[List[ImageInput]]:
- """
- Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
- Args:
- images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
- The input image.
-
- Returns:
- list: A list of images.
- """
- if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
- return [img for img_list in images for img in img_list]
-
- elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
- return images
-
- elif is_valid_image(images):
- return [images]
-
- raise ValueError(f"Could not make batched images from {images}")
-
-
-# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
-def make_batched_videos(videos) -> List[VideoInput]:
- """dummy"""
- if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
- return videos
-
- elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
- if isinstance(videos[0], Image.Image):
- return [videos]
- elif len(videos[0].shape) == 4:
- return [list(video) for video in videos]
-
- elif is_valid_image(videos) and len(videos.shape) == 4:
- return [list(videos)]
-
- raise ValueError(f"Could not make batched video from {videos}")
-
-
-class AdaptiveImageProcessor(BaseImageProcessor):
- r"""
- Constructs a adaptive image processor that dynamically resizes images based on the original images.
-
- Args:
- do_resize (`bool`, *optional*, defaults to `True`):
- Whether to resize the image's (height, width) dimensions.
- resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
- Resampling filter to use when resizing the image.
- do_rescale (`bool`, *optional*, defaults to `True`):
- Whether to rescale the image by the specified scale `rescale_factor`.
- rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
- Scale factor to use if rescaling the image.
- do_normalize (`bool`, *optional*, defaults to `True`):
- Whether to normalize the image.
- image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
- Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
- image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
- Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
- in the image.
- do_convert_rgb (`bool`, *optional*, defaults to `True`):
- Whether to convert the image to RGB.
- min_pixels (`int`, *optional*, defaults to `56 * 56`):
- The min pixels of the image to resize the image.
- max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
- The max pixels of the image to resize the image.
- patch_size (`int`, *optional*, defaults to 14):
- The spacial patch size of the vision encoder.
- temporal_conv_size (`int`, *optional*, defaults to 2):
- The temporal conv size in resampler.
- merge_size (`int`, *optional*, defaults to 2):
- The merge size of the vision encoder to llm encoder.
- """
-
- model_input_names = [
- "pixel_values",
- "image_grid_thw",
- "pixel_values_videos",
- "video_grid_thw",
- ]
-
- def __init__(
- self,
- do_resize: bool = True,
- resample: PILImageResampling = PILImageResampling.BICUBIC,
- do_rescale: bool = True,
- rescale_factor: float = 1 / 255,
- do_normalize: bool = True,
- image_mean: Optional[Union[float, List[float]]] = None,
- image_std: Optional[Union[float, List[float]]] = None,
- do_convert_rgb: bool = True,
- min_pixels: int = 56 * 56,
- max_pixels: int = 28 * 28 * 1280,
- patch_size: int = 14,
- temporal_conv_size: int = 2,
- merge_size: int = 2,
- **kwargs,
- ) -> None:
- """init"""
- super().__init__(**kwargs)
- self.do_resize = do_resize
- self.resample = resample
- self.do_rescale = do_rescale
- self.rescale_factor = rescale_factor
- self.do_normalize = do_normalize
- self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
- self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
- self.min_pixels = min_pixels
- self.max_pixels = max_pixels
- self.patch_size = patch_size
- self.temporal_conv_size = temporal_conv_size
- self.merge_size = merge_size
- self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
- self.do_convert_rgb = do_convert_rgb
-
- def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
- """设定pixels"""
- if min_pixels is not None:
- assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
- data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
- self.min_pixels = min_pixels
- self.size["min_pixels"] = int(min_pixels)
- if max_pixels is not None:
- assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
- data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
- self.max_pixels = max_pixels
- self.size["max_pixels"] = int(max_pixels)
-
- def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
- """dummy"""
- actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
- actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
- resized_height, resized_width = smart_resize(
- height,
- width,
- factor=self.patch_size * self.merge_size,
- min_pixels=actual_min_pixels,
- max_pixels=actual_max_pixels,
- )
- return (resized_height, resized_width), (
- resized_height // self.patch_size,
- resized_width // self.patch_size,
- )
-
- def _preprocess(
- self,
- images: Union[ImageInput, VideoInput],
- do_resize: bool = True,
- resample: PILImageResampling = None,
- do_rescale: bool = True,
- rescale_factor: float = 1 / 255,
- do_normalize: bool = True,
- image_mean: Optional[Union[float, List[float]]] = None,
- image_std: Optional[Union[float, List[float]]] = None,
- do_convert_rgb: bool = False,
- data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
- predetermined_grid_thw=None,
- ):
- """
- Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
-
- Args:
- images (`ImageInput`):
- Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
- If pixel values range from 0 to 1, set `do_rescale=False`.
- vision_info (`List[Dict]`, *optional*):
- Optional list of dictionaries containing additional information about vision inputs.
- do_resize (`bool`, *optional*, defaults to `self.do_resize`):
- Whether to resize the image.
- resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
- Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
- do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
- Whether to rescale the image.
- rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
- Scale factor to use if rescaling the image.
- do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
- Whether to normalize the image.
- image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
- Mean to use if normalizing the image.
- Can be a float or a list of floats corresponding to the number of channels in the image.
- image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
- Standard deviation to use if normalizing the image.
- Can be a float or a list of floats corresponding to the number of channels in the image.
- do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
- Whether to convert the image to RGB.
- data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
- The channel dimension format for the output image. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- - Unset: Use the channel dimension format of the input image.
- input_data_format (`ChannelDimension` or `str`, *optional*):
- The channel dimension format for the input image. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
- - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
- """
- images = make_list_of_images(images)
-
- if do_convert_rgb:
- images = [convert_to_rgb(image) for image in images]
-
- # All transformations expect numpy arrays.
- images = [to_numpy_array(image) for image in images]
-
- if is_scaled_image(images[0]) and do_rescale:
- data_processor_logger.warning(
- "It looks like you are trying to rescale already rescaled images. If the input"
- " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
- )
- if input_data_format is None:
- # We assume that all images have the same channel dimension format.
- input_data_format = infer_channel_dimension_format(images[0])
-
- height, width = get_image_size(images[0], channel_dim=input_data_format)
- resized_height, resized_width = height, width
- processed_images = []
-
- if predetermined_grid_thw is not None:
- assert len(predetermined_grid_thw) == len(
- images
- ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
-
- for img_idx, image in enumerate(images):
- if do_resize:
- if predetermined_grid_thw is not None:
- (resized_height, resized_width) = predetermined_grid_thw[img_idx]
- resized_height *= self.patch_size
- resized_width *= self.patch_size
- else:
- resized_height, resized_width = smart_resize(
- height,
- width,
- factor=self.patch_size * self.merge_size,
- min_pixels=self.min_pixels,
- max_pixels=self.max_pixels,
- )
- image = image.astype("uint8") # TODO : 需要手动加上,否则多除255 导致结果会出错
- # 直接fromarray,不要靠paddleformers里面的
- image = Image.fromarray(image)
- image = resize(
- image,
- size=(resized_height, resized_width),
- resample=resample,
- data_format=input_data_format,
- )
- if do_rescale:
- image = rescale(image, scale=rescale_factor, data_format=input_data_format)
-
- if do_normalize:
- image = normalize(
- image=image,
- mean=image_mean,
- std=image_std,
- data_format=input_data_format,
- )
-
- image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
-
- processed_images.append(image)
- patches = np.array(processed_images)
- if data_format == ChannelDimension.LAST:
- patches = patches.transpose([0, 3, 1, 2])
-
- channel = patches.shape[1] # [time, C, H, W]
- grid_t = patches.shape[0]
- grid_h, grid_w = (
- resized_height // self.patch_size,
- resized_width // self.patch_size,
- )
- patches = patches.reshape(
- [
- grid_t,
- channel,
- grid_h // self.merge_size,
- self.merge_size,
- self.patch_size,
- grid_w // self.merge_size,
- self.merge_size,
- self.patch_size,
- ]
- )
- # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
- patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
-
- flatten_patches = patches.reshape(
- [
- grid_t * grid_h * grid_w,
- channel * self.patch_size * self.patch_size,
- ]
- ) # [grid_t * grid_h * grid_w, C * psz * psz]
-
- return flatten_patches, (grid_t, grid_h, grid_w)
-
- def preprocess(
- self,
- images: ImageInput,
- videos: VideoInput = None,
- do_resize: bool = True,
- size: Optional[Union[int, List[int]]] = None,
- resample: PILImageResampling = None,
- do_rescale: bool = True,
- rescale_factor: float = 1 / 255,
- do_normalize: bool = True,
- image_mean: Optional[Union[float, List[float]]] = None,
- image_std: Optional[Union[float, List[float]]] = None,
- do_convert_rgb: bool = False,
- return_tensors: Optional[Union[str, TensorType]] = None,
- data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
- predetermined_grid_thw=None,
- ):
- """
- Args:
- images (`ImageInput`):
- Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
- passing in images with pixel values between 0 and 1, set `do_rescale=False`.
- videos (`VideoInput`):
- Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
- passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
- do_resize (`bool`, *optional*, defaults to `self.do_resize`):
- Whether to resize the image.
- size (`Dict[str, int]`, *optional*, defaults to `self.size`):
- Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
- the longest edge resized to keep the input aspect ratio.
- resample (`int`, *optional*, defaults to `self.resample`):
- Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
- has an effect if `do_resize` is set to `True`.
- do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
- Whether to rescale the image.
- rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
- Rescale factor to rescale the image by if `do_rescale` is set to `True`.
- do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
- Whether to normalize the image.
- image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
- Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
- image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
- Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
- `True`.
- do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
- Whether to convert the image to RGB.
- return_tensors (`str` or `TensorType`, *optional*):
- The type of tensors to return. Can be one of:
- - Unset: Return a list of `np.ndarray`.
- - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
- - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
- data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
- The channel dimension format for the output image. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- - Unset: Use the channel dimension format of the input image.
- input_data_format (`ChannelDimension` or `str`, *optional*):
- The channel dimension format for the input image. If unset, the channel dimension format is inferred
- from the input image. Can be one of:
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
-
- """
- do_resize = do_resize if do_resize is not None else self.do_resize
- size = size if size is not None else self.size
- resample = resample if resample is not None else self.resample
- do_rescale = do_rescale if do_rescale is not None else self.do_rescale
- rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
- do_normalize = do_normalize if do_normalize is not None else self.do_normalize
- image_mean = image_mean if image_mean is not None else self.image_mean
- image_std = image_std if image_std is not None else self.image_std
- do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-
- if images is not None:
- images = make_batched_images(images)
- if videos is not None:
- videos = make_batched_videos(videos)
-
- if images is not None and not valid_images(images):
- raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
-
- if images is not None:
- pixel_values, vision_grid_thws = [], []
- for img_idx, image in enumerate(images):
- if predetermined_grid_thw is not None:
- predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
- else:
- predetermined_grid_thw_one = None
- patches, image_grid_thw = self._preprocess(
- image,
- do_resize=do_resize,
- resample=resample,
- do_rescale=do_rescale,
- rescale_factor=rescale_factor,
- do_normalize=do_normalize,
- image_mean=image_mean,
- image_std=image_std,
- data_format=data_format,
- do_convert_rgb=do_convert_rgb,
- input_data_format=input_data_format,
- predetermined_grid_thw=predetermined_grid_thw_one,
- )
- pixel_values.extend(patches)
- vision_grid_thws.append(image_grid_thw)
- pixel_values = np.array(pixel_values)
- vision_grid_thws = np.array(vision_grid_thws)
- data = {
- "pixel_values": pixel_values,
- "image_grid_thw": vision_grid_thws,
- }
-
- if videos is not None:
- pixel_values, vision_grid_thws = [], []
- for images in videos:
- patches, video_grid_thw = self._preprocess(
- images,
- do_resize=do_resize,
- resample=resample,
- do_rescale=do_rescale,
- rescale_factor=rescale_factor,
- do_normalize=do_normalize,
- image_mean=image_mean,
- image_std=image_std,
- data_format=data_format,
- do_convert_rgb=do_convert_rgb,
- input_data_format=input_data_format,
- predetermined_grid_thw=predetermined_grid_thw,
- )
- pixel_values.extend(patches)
- vision_grid_thws.append(video_grid_thw)
- pixel_values = np.array(pixel_values)
- vision_grid_thws = np.array(vision_grid_thws)
-
- data = {
- "pixel_values_videos": pixel_values,
- "video_grid_thw": vision_grid_thws,
- }
-
- return BatchFeature(data=data, tensor_type=return_tensors)
-
-
-def round_by_factor(number: int, factor: int) -> int:
- """Returns the closest integer to 'number' that is divisible by 'factor'."""
- return round(number / factor) * factor
-
-
-def ceil_by_factor(number: int, factor: int) -> int:
- """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
- return math.ceil(number / factor) * factor
-
-
-def floor_by_factor(number: int, factor: int) -> int:
- """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
- return math.floor(number / factor) * factor
-
-
-def smart_resize(
- height: int,
- width: int,
- factor: int = IMAGE_FACTOR,
- min_pixels: int = MIN_PIXELS,
- max_pixels: int = MAX_PIXELS,
-):
- """
- Rescales the image so that the following conditions are met:
-
- 1. Both dimensions (height and width) are divisible by 'factor'.
-
- 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
- 3. The aspect ratio of the image is maintained as closely as possible.
- """
- if max(height, width) / min(height, width) > MAX_RATIO:
- if height > width:
- new_width = max(factor, round_by_factor(width, factor))
- new_height = floor_by_factor(new_width * MAX_RATIO, factor)
- else:
- new_height = max(factor, round_by_factor(height, factor))
- new_width = floor_by_factor(new_height * MAX_RATIO, factor)
-
- data_processor_logger.info(
- f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
- resize to {max(new_height, new_width) / min(new_height, new_width)}"
- )
-
- height = new_height
- width = new_width
-
- h_bar = max(factor, round_by_factor(height, factor))
- w_bar = max(factor, round_by_factor(width, factor))
- if h_bar * w_bar > max_pixels:
- beta = math.sqrt((height * width) / max_pixels)
- h_bar = floor_by_factor(height / beta, factor)
- w_bar = floor_by_factor(width / beta, factor)
- elif h_bar * w_bar < min_pixels:
- beta = math.sqrt(min_pixels / (height * width))
- h_bar = ceil_by_factor(height * beta, factor)
- w_bar = ceil_by_factor(width * beta, factor)
-
- if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
- raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
-
- return h_bar, w_bar
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/process.py b/fastdeploy/input/v1/ernie4_5_vl_processor/process.py
deleted file mode 100644
index d8b90e54d57..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/process.py
+++ /dev/null
@@ -1,751 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-""" process.py """
-import copy
-import os
-import pickle
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import paddle
-import zmq
-from paddleformers.transformers.image_utils import ChannelDimension
-from PIL import Image
-
-from fastdeploy.engine.request import ImagePosition, Request
-from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.multimodal.hasher import MultimodalHasher
-from fastdeploy.utils import data_processor_logger
-
-from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor
-from .process_video import read_frames_decord, read_video_decord
-from .utils.render_timestamp import render_frame_timestamp
-
-
-def fancy_print(input_ids, tokenizer, image_patch_id=None):
- """
- input_ids: input_ids
- tokenizer: the tokenizer of models
- """
- i = 0
- res = ""
- text_ids = []
- real_image_token_len = 0
- while i < len(input_ids):
- if input_ids[i] == image_patch_id:
- if len(text_ids) > 0:
- res += tokenizer.decode(text_ids)
- text_ids = []
-
- real_image_token_len += 1
- else:
- if real_image_token_len != 0:
- res += f"<|IMAGE@{real_image_token_len}|>"
- real_image_token_len = 0
-
- text_ids.append(input_ids[i])
-
- i += 1
- if len(text_ids) > 0:
-
- res += tokenizer.decode(text_ids)
- text_ids = []
- return res
-
-
-class DataProcessor(MMBaseDataProcessor):
- """
- Processes multimodal chat messages into model-ready inputs,
- handling text, images, and videos with 3D positional embeddings.
- """
-
- CLS_TOKEN = "<|begin_of_sentence|>"
- SEP_TOKEN = "<|end_of_sentence|>"
- EOS_TOKEN = ""
- IMG_START = "<|IMAGE_START|>"
- IMG_END = "<|IMAGE_END|>"
- VID_START = "<|VIDEO_START|>"
- VID_END = "<|VIDEO_END|>"
-
- def __init__(
- self,
- tokenizer_name: str,
- image_preprocessor_name: str,
- enable_processor_cache: bool = False,
- spatial_conv_size: int = 2,
- temporal_conv_size: int = 2,
- image_min_pixels: int = 4 * 28 * 28,
- image_max_pixels: int = 6177 * 28 * 28,
- video_min_pixels: int = 299 * 28 * 28,
- video_max_pixels: int = 1196 * 28 * 28,
- video_target_frames: int = -1,
- video_frames_sample: str = "leading",
- video_max_frames: int = 180,
- video_min_frames: int = 16,
- video_fps: int = 2,
- **kwargs,
- ) -> None:
- super().__init__()
- # Tokenizer and image preprocessor
- self.model_name_or_path = tokenizer_name
- self._load_tokenizer()
- self.tokenizer.ignored_index = -100
- self.image_preprocessor = AdaptiveImageProcessor.from_pretrained(image_preprocessor_name)
- self.enable_processor_cache = enable_processor_cache
-
- # Convolution sizes for patch aggregation
- self.spatial_conv_size = spatial_conv_size
- self.temporal_conv_size = temporal_conv_size
-
- # Pixel constraints
- self.image_min_pixels = image_min_pixels
- self.image_max_pixels = image_max_pixels
- self.video_min_pixels = video_min_pixels
- self.video_max_pixels = video_max_pixels
-
- # Video sampling parameters
- self.target_frames = video_target_frames
- self.frames_sample = video_frames_sample
- self.max_frames = video_max_frames
- self.min_frames = video_min_frames
- self.fps = video_fps
-
- # Special tokens and IDs
- self.cls_token = self.CLS_TOKEN
- self.sep_token = self.SEP_TOKEN
- self.eos_token = self.EOS_TOKEN
- self.image_start = self.IMG_START
- self.image_end = self.IMG_END
- self.video_start = self.VID_START
- self.video_end = self.VID_END
- self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")
- self.image_start_id = self.tokenizer.convert_tokens_to_ids(self.image_start)
- self.image_end_id = self.tokenizer.convert_tokens_to_ids(self.image_end)
- self.video_start_id = self.tokenizer.convert_tokens_to_ids(self.video_start)
- self.video_end_id = self.tokenizer.convert_tokens_to_ids(self.video_end)
- self.sep_token_id = self.tokenizer.convert_tokens_to_ids(self.sep_token)
- self.eos_token_id = self.tokenizer.convert_tokens_to_ids(self.eos_token)
-
- self.token_type_mapping = self._build_token_type_mapping()
- self.is_training = True
- self.role_prefixes = {
- "system": "",
- "user": "User: ",
- "bot": "Assistant: ",
- "assistant": "Assistant: ",
- "tool": "Tool: ",
- }
-
- @staticmethod
- def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
- """
- Calculate the number of tokens in the multimodal input.
- """
- if isinstance(grid_thw, paddle.Tensor):
- grid_thw = grid_thw.numpy()
-
- if len(grid_thw) == 0:
- return 0
-
- def calc_one(thw):
- t, h, w = map(int, thw)
- if t == 1:
- return t * h * w // 4
- else:
- return t * h * w // 4 // 2
-
- if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
- return [calc_one(x) for x in grid_thw]
-
- return calc_one(grid_thw)
-
- def _build_token_type_mapping(self) -> Dict[Any, int]:
- mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
- for token in (
- self.IMG_START,
- self.IMG_END,
- self.VID_START,
- self.VID_END,
- ):
- mapping[token] = IDS_TYPE_FLAG["image"]
- mapping[self.image_patch_id] = IDS_TYPE_FLAG["image"]
- return mapping
-
- def train(self) -> None:
- """Enable training mode (produces labels)."""
- self.is_training = True
-
- def eval(self) -> None:
- """Enable evaluation mode (doesn't produce labels)."""
- self.is_training = False
-
- def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
- """
- Convert chat text into model inputs.
-
- Args:
- text (str): The chat text containing placeholders for images and videos.
- images (list, optional): List of images to be processed and inserted at image placeholders.
- videos (list, optional): List of videos to be processed and inserted at video placeholders.
- image_uuid (list, optional): List of unique identifiers for each image, used for caching or hashing.
- video_uuid (list, optional): List of unique identifiers for each video, used for caching or hashing.
- Returns:
- dict: A dictionary with keys input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels, etc.
- """
-
- outputs = {
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "labels": [],
- "cur_position": 0,
- "video_cnt": 0,
- "num_input_image_tokens": 0,
- "num_input_video_tokens": 0,
- "mm_positions": [],
- "mm_hashes": [],
- }
-
- IMAGE_PLACEHOLDER = "<|image@placeholder|>"
- VIDEO_PLACEHOLDER = "<|video@placeholder|>"
- IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
- VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
- st, image_idx, video_idx = 0, 0, 0
- while st < len(text):
- image_pos = text.find(IMAGE_PLACEHOLDER, st)
- image_pos = len(text) if image_pos == -1 else image_pos
- video_pos = text.find(VIDEO_PLACEHOLDER, st)
- video_pos = len(text) if video_pos == -1 else video_pos
- ed = min(image_pos, video_pos)
-
- self._add_text(text[st:ed], outputs)
- if ed == len(text):
- break
-
- if ed == image_pos:
- image = images[image_idx]
- uuid = image_uuid[image_idx] if image_uuid else None
- if not isinstance(image, tuple):
- self._add_image(image, outputs, uuid)
- else:
- # cached images are already processed
- self._add_processed_image(image, outputs, uuid)
- image_idx += 1
- st = ed + IMAGE_PLACEHOLDER_LEN
- else:
- item = videos[video_idx]
- uuid = video_uuid[video_idx] if video_uuid else None
- if not isinstance(item, tuple):
- if isinstance(item, dict):
- frames = self._load_and_process_video(item["video"], item)
- else:
- frames = self._load_and_process_video(item, {})
- self._add_video(frames, outputs, uuid)
- else:
- # cached frames are already processed
- self._add_processed_video(item, outputs, uuid)
- video_idx += 1
- st = ed + VIDEO_PLACEHOLDER_LEN
-
- return outputs
-
- def extract_mm_items(self, request: Request):
- messages = parse_chat_messages(request.messages)
- mm_items = []
- for msg in messages:
- role = msg.get("role")
- assert role in self.role_prefixes, f"Unsupported role: {role}"
- content = msg.get("content")
- if not isinstance(content, list):
- content = [content]
- for item in content:
- if item.get("type") in ["image", "video"]:
- mm_items.append(item)
-
- missing_hashes, missing_idx = [], []
- for idx, item in enumerate(mm_items):
- if not item.get("data"):
- # raw data not provided, should be retrieved from processor cache
- missing_hashes.append(item.get("uuid"))
- missing_idx.append(idx)
-
- if len(missing_hashes) > 0 and not self.enable_processor_cache:
- raise ValueError("Missing items cannot be retrieved without processor cache.")
-
- dealer = None
- if self.enable_processor_cache:
- context = zmq.Context()
- dealer = context.socket(zmq.DEALER)
- dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
- missing_items = self.get_processor_cache(dealer, missing_hashes)
- for idx in range(len(missing_items)):
- if not missing_items[idx]:
- raise ValueError(f"Missing item {idx} not found in processor cache")
- mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
- images, videos = [], []
- image_uuid, video_uuid = [], []
- for item in mm_items:
- if item.get("type") == "image":
- images.append(item["data"])
- image_uuid.append(item["uuid"])
- elif item.get("type") == "video":
- videos.append(item["data"])
- video_uuid.append(item["uuid"])
- else:
- raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
- return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items
-
- def request2ids(
- self, request: Request, tgts: List[str] = None
- ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
- """
- Convert chat messages into model inputs.
- Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
- """
- images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
-
- if self.tokenizer.chat_template is None:
- raise ValueError("This model does not support chat template.")
-
- chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
- message_dict = {
- key: getattr(request, key, None)
- for key in ["messages", "tools", "documents", "enable_thinking", "system"]
- if getattr(request, key, None) is not None
- }
- prompt = self.tokenizer.apply_chat_template(
- message_dict,
- tokenize=False,
- add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True,
- **chat_template_kwargs,
- )
- request.prompt_tokens = prompt
-
- outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
-
- if self.enable_processor_cache:
- missing_idx = set(missing_idx)
- hashes_to_cache, items_to_cache = [], []
- for idx in range(len(mm_items)):
- if idx in missing_idx:
- continue
- meta = {}
- t, h, w = outputs["grid_thw"][idx][0]
- meta["thw"] = (t, h, w)
- hashes_to_cache.append(outputs["mm_hashes"][idx])
- items_to_cache.append((outputs["images"][idx], meta))
- self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
- if self.is_training:
- assert tgts, "Training must give tgt"
- self._extract_labels(outputs, tgts)
-
- return outputs
-
- def prompt_token_ids2outputs(
- self, request: Request, tgts: List[str] = None
- ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
- outputs = {
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "labels": [],
- "cur_position": 0,
- "video_cnt": 0,
- "num_input_image_tokens": 0,
- "num_input_video_tokens": 0,
- "mm_positions": [],
- "mm_hashes": [],
- }
- prompt_token_ids = request.prompt_token_ids if request.prompt_token_ids else []
- prompt_token_ids_len = len(prompt_token_ids)
- if not request.messages:
- outputs["input_ids"].extend(prompt_token_ids)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * prompt_token_ids_len)
- for i in range(prompt_token_ids_len):
- outputs["position_ids"].append([i] * 3)
- outputs["cur_position"] += prompt_token_ids_len
- return outputs
- images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request)
- st, image_idx, video_idx = 0, 0, 0
- while st < prompt_token_ids_len:
- cur_token_id = prompt_token_ids[st]
- if cur_token_id == self.image_start_id:
- if image_idx >= len(images):
- raise ValueError("prompt token ids has more image placeholder than in messages")
- # append image_start_id
- outputs["input_ids"].extend([cur_token_id])
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
- outputs["position_ids"].append([outputs["cur_position"]] * 3)
- outputs["cur_position"] += 1
- st += 1
- # process placeholder token ids
- cur_idx = st
- while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_end_id:
- cur_idx += 1
- if cur_idx >= prompt_token_ids_len:
- raise ValueError("image token ids not complete")
- image = images[image_idx]
- uuid = image_uuid[image_idx] if image_uuid else None
- token_len = cur_idx - st
- if not isinstance(image, tuple):
- self._add_image(image, outputs, uuid, token_len)
- else:
- self._add_processed_image(image, outputs, uuid, token_len)
- image_idx += 1
- st = cur_idx
- elif cur_token_id == self.video_start_id:
- if video_idx >= len(videos):
- raise ValueError("prompt token ids has more video placeholder than in messages")
- # append video_start_id
- outputs["input_ids"].extend([cur_token_id])
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
- outputs["position_ids"].append([outputs["cur_position"]] * 3)
- outputs["cur_position"] += 1
- st += 1
- # process placeholder token ids
- cur_idx = st
- while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.video_end_id:
- cur_idx += 1
- if cur_idx >= prompt_token_ids_len:
- raise ValueError("video token ids not complete")
- video = videos[video_idx]
- uuid = video_uuid[video_idx] if video_uuid else None
- token_len = cur_idx - st
- if not isinstance(video, tuple):
- if isinstance(video, dict):
- frames = self._load_and_process_video(video["video"], video)
- else:
- frames = self._load_and_process_video(video, {})
- self._add_video(frames, outputs, uuid, token_len)
- else:
- self._add_processed_video(video, outputs, uuid, token_len)
- video_idx += 1
- st = cur_idx
- else:
- outputs["input_ids"].extend([cur_token_id])
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]])
- outputs["position_ids"].append([outputs["cur_position"]] * 3)
- outputs["cur_position"] += 1
- st += 1
- if image_idx != len(images):
- raise ValueError("number of images does not match")
- if video_idx != len(videos):
- raise ValueError("number of videos does not match")
-
- if self.enable_processor_cache:
- missing_idx = set(missing_idx)
- hashes_to_cache, items_to_cache = [], []
- for idx in range(len(mm_items)):
- if idx in missing_idx:
- continue
- meta = {}
- t, h, w = outputs["grid_thw"][idx][0]
- meta["thw"] = (t, h, w)
- hashes_to_cache.append(outputs["mm_hashes"][idx])
- items_to_cache.append((outputs["images"][idx], meta))
- self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
- return outputs
-
- def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:
- token_id = token if isinstance(token, int) else self.tokenizer.convert_tokens_to_ids(token)
- outputs["input_ids"].append(token_id)
- outputs["token_type_ids"].append(self.token_type_mapping[token])
- pos = outputs["cur_position"]
- outputs["position_ids"].append([pos] * 3)
- outputs["cur_position"] += 1
-
- def _add_text(self, tokens, outputs: Dict) -> None:
- if isinstance(tokens, str):
- tokens = self.tokenizer.encode(tokens, add_special_tokens=False)["input_ids"]
- outputs["input_ids"].extend(tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))
-
- start = outputs["cur_position"]
- for i in range(len(tokens)):
- outputs["position_ids"].append([start + i] * 3)
- outputs["cur_position"] += len(tokens)
-
- def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
- patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
- img.height,
- img.width,
- min_pixels=self.image_min_pixels,
- max_pixels=self.image_max_pixels,
- )[1]
- num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
- if token_len and token_len != num_tokens:
- raise ValueError("image tokens num not match the size")
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
- outputs["num_input_image_tokens"] += num_tokens
-
- pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
- outputs["position_ids"].extend(pos_ids)
- outputs["cur_position"] = np.max(pos_ids) + 1
-
- # Preprocess pixels
- ret = self.image_preprocessor.preprocess(
- images=[img.convert("RGB")],
- do_normalize=False,
- do_rescale=False,
- predetermined_grid_thw=np.array([[patches_h, patches_w]]),
- do_convert_rgb=True,
- input_data_format=ChannelDimension.LAST,
- )
- outputs["images"].append(ret["pixel_values"])
- if not uuid:
- outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
- else:
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(ret["image_grid_thw"])
- outputs["image_type_ids"].append(0)
-
- def _add_processed_image(
- self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None
- ) -> None:
- img, meta = img_cache
- num_tokens = img.shape[0] // (self.spatial_conv_size**2)
- if token_len and num_tokens != token_len:
- raise ValueError("image tokens num not match the size")
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
- _, h, w = meta["thw"]
- pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
- outputs["position_ids"].extend(pos_ids)
- outputs["cur_position"] = np.max(pos_ids) + 1
-
- outputs["images"].append(img)
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(np.array([[1, h, w]]))
- outputs["image_type_ids"].append(0)
-
- def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None:
- patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
- frames[0].height,
- frames[0].width,
- min_pixels=self.video_min_pixels,
- max_pixels=self.video_max_pixels,
- )[1]
- num_frames = len(frames)
- num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
- if token_len and num_tokens != token_len:
- raise ValueError("video tokens num not match the size")
-
- pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
- ret = self.image_preprocessor.preprocess(
- images=None,
- videos=pixel_stack,
- do_normalize=False,
- do_rescale=False,
- predetermined_grid_thw=np.array([[patches_h, patches_w]] * num_frames),
- do_convert_rgb=True,
- input_data_format=ChannelDimension.LAST,
- )
- outputs["images"].append(ret["pixel_values_videos"])
- if not uuid:
- outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
- else:
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(ret["video_grid_thw"])
- outputs["image_type_ids"].extend([1] * num_frames)
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
- outputs["num_input_video_tokens"] += num_tokens
-
- pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
- outputs["position_ids"].extend(pos_ids)
- outputs["cur_position"] = np.max(pos_ids) + 1
-
- def _add_processed_video(
- self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None
- ) -> None:
- frames, meta = frames_cache
- num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
- if token_len and num_tokens != token_len:
- raise ValueError("video tokens num not match the size")
-
- t, h, w = meta["thw"]
- outputs["images"].append(frames)
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(np.array([[t, h, w]]))
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
- outputs["image_type_ids"].extend([1] * t)
-
- pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
- outputs["position_ids"].extend(pos_ids)
- outputs["cur_position"] = np.max(pos_ids) + 1
-
- def _extract_labels(self, outputs: Dict, tgts: List[str]) -> None:
- input_ids = copy.deepcopy(outputs["input_ids"])
- labels = [self.tokenizer.ignored_index] * len(input_ids)
-
- tgt_count = input_ids.count(self.sep_token_id)
- assert tgt_count == len(tgts), f"len(tgts) != len(src) {len(tgts)} vs {tgt_count}"
-
- tgt_index = 0
- for i, token_id in enumerate(input_ids):
- if token_id == self.sep_token_id:
- labels_token = self.tokenizer.tokenize(tgts[tgt_index])
- labels_token_id = self.tokenizer.convert_tokens_to_ids(labels_token)
- labels[i - len(labels_token_id) : i] = labels_token_id
- labels[i] = self.eos_token_id #
- tgt_index += 1
-
- outputs["labels"] = labels
-
- def _load_and_process_video(self, url: str, item: Dict) -> List[Image.Image]:
- reader, meta, path = read_video_decord(url, save_to_disk=False)
-
- video_frame_args = dict()
- video_frame_args["fps"] = item.get("fps", self.fps)
- video_frame_args["min_frames"] = item.get("min_frames", self.min_frames)
- video_frame_args["max_frames"] = item.get("max_frames", self.max_frames)
- video_frame_args["target_frames"] = item.get("target_frames", self.target_frames)
- video_frame_args["frames_sample"] = item.get("frames_sample", self.frames_sample)
-
- video_frame_args = self._set_video_frame_args(video_frame_args, meta)
-
- frames_data, _, timestamps = read_frames_decord(
- path,
- reader,
- meta,
- target_frames=video_frame_args["target_frames"],
- target_fps=video_frame_args["fps"],
- frames_sample=video_frame_args["frames_sample"],
- save_to_disk=False,
- )
-
- frames: List[Image.Image] = []
- for img_array, ts in zip(frames_data, timestamps):
- frames.append(render_frame_timestamp(img_array, ts))
- # Ensure even number of frames for temporal conv
- if len(frames) % 2 != 0:
- frames.append(copy.deepcopy(frames[-1]))
- return frames
-
- def _set_video_frame_args(self, video_frame_args, video_meta):
- """
- 根据已知参数和优先级,设定最终的抽帧参数
- """
- # 优先级:video_target_frames > (video_min_frames, video_max_frames) > video_fps
- if video_frame_args["target_frames"] > 0:
- if video_frame_args["fps"] >= 0:
- raise ValueError("fps must be negative if target_frames is given")
- if (
- video_frame_args["min_frames"] > 0
- and video_frame_args["target_frames"] < video_frame_args["min_frames"]
- ):
- raise ValueError("target_frames must be larger than min_frames")
- if (
- video_frame_args["max_frames"] > 0
- and video_frame_args["target_frames"] > video_frame_args["max_frames"]
- ):
- raise ValueError("target_frames must be smaller than max_frames")
- else:
- if video_frame_args["fps"] < 0:
- raise ValueError("Must provide either positive target_fps or positive target_frames.")
- # 先计算在video_fps下抽到的帧数
- frames_to_extract = int(video_meta["duration"] * video_frame_args["fps"])
- # 判断是否在目标区间内,如果不是,则取target_frames为上界或下界
- if (
- video_frame_args["min_frames"] > 0
- and video_frame_args["max_frames"] > 0
- and video_frame_args["min_frames"] > video_frame_args["max_frames"]
- ):
- raise ValueError("min_frames must be smaller than max_frames")
- if video_frame_args["min_frames"] > 0 and frames_to_extract < video_frame_args["min_frames"]:
- video_frame_args["target_frames"] = video_frame_args["min_frames"]
- video_frame_args["fps"] = -1
- if video_frame_args["max_frames"] > 0 and frames_to_extract > video_frame_args["max_frames"]:
- video_frame_args["target_frames"] = video_frame_args["max_frames"]
- video_frame_args["fps"] = -1
-
- return video_frame_args
-
- def _compute_3d_positions(self, t: int, h: int, w: int, start_idx: int) -> List[List[int]]:
- # Downsample time if needed
- t_eff = t // self.temporal_conv_size if t != 1 else 1
- gh, gw = h // self.spatial_conv_size, w // self.spatial_conv_size
- time_idx = np.repeat(np.arange(t_eff), gh * gw)
- h_idx = np.tile(np.repeat(np.arange(gh), gw), t_eff)
- w_idx = np.tile(np.arange(gw), t_eff * gh)
-
- coords = list(zip(time_idx, h_idx, w_idx))
- return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords]
-
- def _load_tokenizer(self):
- """
- load tokenizer
-
- Returns:
- tokenizer (AutoTokenizer)
- """
- vocab_file_names = [
- "tokenizer.model",
- "spm.model",
- "ernie_token_100k.model",
- ]
- for i in range(len(vocab_file_names)):
- if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
- Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
- break
- self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
-
- def get_processor_cache(self, socket, mm_hashes: list[str]) -> list:
- """
- get cache correspond to given hash values
- """
- req = pickle.dumps(mm_hashes)
- socket.send_multipart([b"", req])
- _, resp = socket.recv_multipart()
- mm_items = pickle.loads(resp)
- data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
-
- return mm_items
-
- def update_processor_cache(self, socket, mm_hashes: list[str], mm_items):
- """
- update cache data
- """
- req = pickle.dumps((mm_hashes, mm_items))
- socket.send_multipart([b"", req])
- data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/process_video.py b/fastdeploy/input/v1/ernie4_5_vl_processor/process_video.py
deleted file mode 100644
index 91120096c70..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/process_video.py
+++ /dev/null
@@ -1,205 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import io
-import os
-import random
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.utils import data_processor_logger
-
-from .utils.io_utils import EXTRACTED_FRAME_DIR, get_filename
-from .utils.video_utils import VideoReaderWrapper
-
-
-def read_video_decord(video_path, save_to_disk):
- """get reader and meta by decord"""
- # video_path = get_downloadable(video_path, save_to_disk=save_to_disk)
- if isinstance(video_path, VideoReaderWrapper):
- video_reader = video_path
- else:
- if isinstance(video_path, bytes):
- video_path = io.BytesIO(video_path)
- video_reader = VideoReaderWrapper(video_path, num_threads=1)
- vlen = len(video_reader)
- fps = video_reader.get_avg_fps()
- duration = vlen / float(fps)
-
- video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen}
-
- return video_reader, video_meta, video_path
-
-
-def get_frame_indices(
- vlen,
- target_frames=-1,
- target_fps=-1,
- frames_sample="middle",
- fix_start=None,
- input_fps=-1,
-):
- """
- 取出对应的frame index
- """
- assert frames_sample in ["rand", "middle", "leading"]
- if target_frames > 0:
- assert target_fps <= 0, "target_fps must be negative if target_frames is given."
- if target_frames > vlen:
- acc_samples = vlen
- data_processor_logger.info(
- f"target_frames={target_frames} is larger than video length {vlen}, "
- f"will sample {acc_samples} frames."
- )
- else:
- acc_samples = target_frames
- data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}")
-
- # split the video into `acc_samples` intervals, and sample from each interval.
- intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
- ranges = []
- for idx, interv in enumerate(intervals[:-1]):
- ranges.append((interv, intervals[idx + 1] - 1))
- if frames_sample == "rand":
- try:
- frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
- except Exception:
- frame_indices = np.random.permutation(vlen)[:acc_samples]
- frame_indices.sort()
- frame_indices = list(frame_indices)
- elif fix_start is not None:
- frame_indices = [x[0] + fix_start for x in ranges]
- elif frames_sample == "leading":
- frame_indices = [x[0] for x in ranges]
- elif frames_sample == "middle":
- frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
- else:
- raise NotImplementedError
-
- elif target_fps > 0:
- assert target_frames <= 0, "target_frames must be negative if target_fps is given."
- assert input_fps > 0, "input_fps must be provided if target_fps is given."
- data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}")
- duration = float(vlen) / input_fps
- delta = 1 / target_fps # gap between frames, this is also the clip length each frame represents
- if frames_sample == "middle":
- frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
- elif frames_sample == "leading":
- frame_seconds = np.arange(0, duration, delta)
- if frames_sample == "rand":
- frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
- rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5
- frame_seconds += rand_offset * delta
- frame_indices = np.around(frame_seconds * input_fps).astype(int)
- frame_indices = [e for e in frame_indices if e < vlen]
-
- else:
- raise ValueError("Must provide either positive target_fps or positive target_frames.")
-
- return frame_indices
-
-
-def read_frames_decord(
- video_path,
- video_reader,
- video_meta,
- target_frames=-1,
- target_fps=-1,
- frames_sample="middle",
- fix_start=None,
- save_to_disk=False,
- cache_dir=EXTRACTED_FRAME_DIR,
- frame_indices=None,
- tol=10,
-):
- """get frames by decord"""
-
- if frame_indices is None:
- frame_indices = get_frame_indices(
- video_meta["num_of_frame"],
- target_frames=target_frames,
- target_fps=target_fps,
- frames_sample=frames_sample,
- fix_start=fix_start,
- input_fps=video_meta["fps"],
- )
-
- frames = []
- for frame_indice_index in range(0, len(frame_indices)):
- frame_indice = frame_indices[frame_indice_index]
- try:
- frames.append(video_reader[frame_indice].asnumpy()) # (T, H, W, C)
- except Exception as e:
- data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}")
- previous_counter = 1
- later_counter = 1
- previous_after_flag = True
- if frame_indice == 0 or frame_indice == len(video_reader) - 1:
- cur_tol = tol * 2
- else:
- cur_tol = tol
- while previous_counter < cur_tol or later_counter < cur_tol:
- if previous_after_flag:
- if frame_indice - previous_counter < 0:
- previous_counter += 1
- previous_after_flag = not previous_after_flag
- continue
- try:
- frames.append(video_reader[frame_indice - previous_counter].asnumpy())
- data_processor_logger.info(
- f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame"
- )
- frame_indices[frame_indice_index] = frame_indice - previous_counter
- break
- except Exception as e:
- previous_counter += 1
- data_processor_logger.info(f"error: {e}")
- else:
- if frame_indice + later_counter >= len(video_reader):
- later_counter += 1
- previous_after_flag = not previous_after_flag
- continue
- try:
- frames.append(video_reader[frame_indice + later_counter].asnumpy())
- data_processor_logger.info(
- f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame"
- )
- frame_indices[frame_indice_index] = frame_indice + later_counter
- break
- except Exception:
- later_counter += 1
- previous_after_flag = not previous_after_flag
-
- frames = np.stack(frames, axis=0)
- assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}"
-
- ret = []
-
- url_sha1 = get_filename()
- for idx, frame in enumerate(frames):
- tmp = Image.fromarray(frame, "RGB")
- if save_to_disk:
- save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png")
- if not os.path.exists(os.path.dirname(save_path)):
- os.makedirs(os.path.dirname(save_path))
- tmp.save(save_path)
- tmp = save_path
- ret.append(tmp)
-
- time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices]
-
- return ret, frame_indices, time_stamps
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/Roboto-Regular.ttf b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/Roboto-Regular.ttf
deleted file mode 100644
index 7e3bb2f8ce7..00000000000
Binary files a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/Roboto-Regular.ttf and /dev/null differ
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/__init__.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/__init__.py
deleted file mode 100644
index 31be300c18e..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/io_utils.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/io_utils.py
deleted file mode 100644
index 1535b64d4f0..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/io_utils.py
+++ /dev/null
@@ -1,109 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import base64
-import datetime
-import hashlib
-import os
-import threading
-import uuid
-from pathlib import Path
-
-import requests
-from PIL import Image
-
-RAW_VIDEO_DIR = "./download_tmp/raw_video/"
-RAW_IMAGE_DIR = "./download_tmp/raw_images/"
-EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
-TMP_DIR = "./download_tmp/upload_tmp/"
-
-
-def file_download(url, download_dir, save_to_disk=False, retry=0, retry_interval=3):
- """
- Description: 下载url,如果url是PIL直接返回
- Args:
- url(str, PIL): http/本地路径/io.Bytes,注意io.Bytes是图片字节流
- download_path: 在save_to_disk=True的情况下生效,返回保存地址
- save_to_disk: 是否保存在本地路径
-
- """
- from .video_utils import VideoReaderWrapper
-
- if isinstance(url, Image.Image):
- return url
- elif isinstance(url, VideoReaderWrapper):
- return url
- elif url.startswith("http"):
- response = requests.get(url)
- bytes_data = response.content
- elif os.path.isfile(url):
- if save_to_disk:
- return url
- bytes_data = open(url, "rb").read()
- else:
- bytes_data = base64.b64decode(url)
- if not save_to_disk:
- return bytes_data
-
- download_path = os.path.join(download_dir, get_filename(url))
- Path(download_path).parent.mkdir(parents=True, exist_ok=True)
- with open(download_path, "wb") as f:
- f.write(bytes_data)
- return download_path
-
-
-def get_filename(url=None):
- """
- Get Filename
- """
- if url is None:
- return str(uuid.uuid4()).replace("-", "")
- t = datetime.datetime.now()
- if not isinstance(url, bytes):
- url = url.encode("utf-8")
-
- md5_hash = hashlib.md5(url).hexdigest()
- pid = os.getpid()
- tid = threading.get_ident()
-
- # 去掉后缀,防止save-jpg报错
- image_filname = f"{t.year}-{t.month:02d}-{t.day:02d}-{pid}-{tid}-{md5_hash}"
- return image_filname
-
-
-def get_downloadable(
- url,
- download_dir=RAW_VIDEO_DIR,
- save_to_disk=False,
- retry=0,
- retry_interval=3,
-):
- """download video and store it in the disk
-
- return downloaded **path** if save_to_disk is set to true
- return downloaded **bytes** if save_to_disk is set to false
- """
-
- if not os.path.exists(download_dir):
- os.makedirs(download_dir)
- downloaded_path = file_download(
- url,
- download_dir,
- save_to_disk=save_to_disk,
- retry=retry,
- retry_interval=retry_interval,
- )
- return downloaded_path
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/render_timestamp.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/render_timestamp.py
deleted file mode 100644
index 9b24226ed88..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/render_timestamp.py
+++ /dev/null
@@ -1,103 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import os
-from pathlib import Path
-
-from PIL import Image, ImageDraw, ImageFont
-
-cur_directory = Path(__file__).parent.absolute()
-FONT_PATH = os.path.join(cur_directory, "Roboto-Regular.ttf")
-
-
-def render_single_image_with_timestamp(image: Image, number: str, rate: float, font_path: str = FONT_PATH):
- """
- 函数功能: 给pil.image的图片渲染时间戳
- 时间戳的大小为 min(width, height)的rate
- 字体的颜色为黑色, 轮廓是白色, 轮廓的大小是字体的10%
- 返回一个 Image 对象
- """
- draw = ImageDraw.Draw(image) # 创建一个可绘制对象
- width, height = image.size # 获取图片大小
- font_size = int(min(width, height) * rate) # 设置字体大小
- outline_size = int(font_size * 0.1) # 设置轮廓大小
- font = ImageFont.truetype(font_path, font_size) # 加载字体文件, 设置字体大小
- x = 0
- y = 0 # 文本的x坐标, y坐标
-
- # 绘制黑色的时间戳,白色的边框
- draw.text(
- (x, y),
- number,
- font=font,
- fill=(0, 0, 0),
- stroke_width=outline_size,
- stroke_fill=(255, 255, 255),
- )
-
- return image
-
-
-def timestamp_converting(time_stamp_in_seconds):
- """
- convert timestamp format from seconds to hr:min:sec
- """
- # get hours
- hours = 0
- while time_stamp_in_seconds >= 3600:
- hours += 1
- time_stamp_in_seconds -= 3600
- # get minutes
- mins = 0
- while time_stamp_in_seconds >= 60:
- mins += 1
- time_stamp_in_seconds -= 60
- time_hours = f"{int(hours):02d}"
- time_mins = f"{int(mins):02d}"
- time_secs = f"{time_stamp_in_seconds:05.02f}"
- fi_time_stamp = time_hours + ":" + time_mins + ":" + time_secs
-
- return fi_time_stamp
-
-
-def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration):
- """
- function: get the timestamp of a frame, 在均匀抽帧时用。
-
- num_frames: 总帧数
- frameid_list: 被抽帧的帧的索引
- duration: 视频的总时长
- return: timestamp; xx:xx:xx (str)
- """
- time_stamp = duration * 1.0 * frame_id / num_frames
-
- return time_stamp
-
-
-def render_frame_timestamp(frame, timestamp, font_rate=0.1):
- """
- 函数功能, 给frame, 按照顺序将 index 渲染上去
- 逻辑思路: 把index渲染到图片的左上方
-
- frame: 帧,PIL.Image object
- timestamp: 时间戳,单位是秒
- font_rate: 字体大小占 min(wi, hei)的比率
- """
-
- time_stamp = "time: " + timestamp_converting(timestamp)
- new_frame = render_single_image_with_timestamp(frame, time_stamp, font_rate)
-
- return new_frame
diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/video_utils.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/video_utils.py
deleted file mode 100644
index a4769ca8ecc..00000000000
--- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/video_utils.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import io
-import os
-from tempfile import NamedTemporaryFile as ntf
-
-import decord
-
-try:
- # moviepy 1.0
- import moviepy.editor as mp
-except:
- # moviepy 2.0
- import moviepy as mp
-
-
-def is_gif(data: bytes) -> bool:
- """
- check if a bytes is a gif based on the magic head
- """
- return data[:6] in (b"GIF87a", b"GIF89a")
-
-
-class VideoReaderWrapper(decord.VideoReader):
- """
- Solving memory leak bug
-
- https://github.com/dmlc/decord/issues/208
- """
-
- def __init__(self, video_path, *args, **kwargs):
- with ntf(delete=True, suffix=".gif") as gif_file:
- gif_input = None
- self.original_file = None
- if isinstance(video_path, str):
- self.original_file = video_path
- if video_path.lower().endswith(".gif"):
- gif_input = video_path
- elif isinstance(video_path, bytes):
- if is_gif(video_path):
- gif_file.write(video_path)
- gif_input = gif_file.name
- elif isinstance(video_path, io.BytesIO):
- video_path.seek(0)
- tmp_bytes = video_path.read()
- video_path.seek(0)
- if is_gif(tmp_bytes):
- gif_file.write(tmp_bytes)
- gif_input = gif_file.name
-
- if gif_input is not None:
- clip = mp.VideoFileClip(gif_input)
- mp4_file = ntf(delete=False, suffix=".mp4")
- clip.write_videofile(mp4_file.name, verbose=False, logger=None)
- clip.close()
- video_path = mp4_file.name
- self.original_file = video_path
-
- super().__init__(video_path, *args, **kwargs)
- self.seek(0)
-
- def __getitem__(self, key):
- frames = super().__getitem__(key)
- self.seek(0)
- return frames
-
- def __del__(self):
- if self.original_file and os.path.exists(self.original_file):
- os.remove(self.original_file)
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/__init__.py b/fastdeploy/input/v1/paddleocr_vl_processor/__init__.py
deleted file mode 100644
index 8f79e65d634..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .paddleocr_vl_processor import PaddleOCRVLProcessor
-from .process import DataProcessor
-
-__all__ = ["DataProcessor", "PaddleOCRVLProcessor"]
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/image_processor.py b/fastdeploy/input/v1/paddleocr_vl_processor/image_processor.py
deleted file mode 100644
index 8e333d5bf96..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/image_processor.py
+++ /dev/null
@@ -1,275 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-"""Image processor class for Keye."""
-
-# TODO: Support videos
-
-import json
-import logging
-import math
-from pathlib import Path
-from typing import Dict, List, Optional, Union
-
-import numpy as np
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_utils import (
- ImageInput,
- is_valid_image,
- make_list_of_images,
- to_numpy_array,
-)
-
-_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
-_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
-
-
-def make_batched_images(images) -> List[List[ImageInput]]:
- """
- Accepts images in list or nested list format, and makes a list of images for preprocessing.
-
- Args:
- images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
- The input image.
-
- Returns:
- list: A list of images.
- """
- if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
- return [img for img_list in images for img in img_list]
-
- elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
- return images
-
- elif is_valid_image(images):
- return [images]
-
- raise ValueError(f"Could not make batched images from {images}")
-
-
-def adjust_size(size, patch_size):
- num_patches = size // patch_size
- if num_patches % 2 != 0:
- num_patches -= 1
- return num_patches * patch_size
-
-
-def smart_resize(
- height: int,
- width: int,
- factor: int = 28,
- min_pixels: int = 28 * 28 * 130,
- max_pixels: int = 28 * 28 * 1280,
-):
- """Rescales the image so that the following conditions are met:
-
- 1. Both dimensions (height and width) are divisible by 'factor'.
-
- 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
-
- 3. The aspect ratio of the image is maintained as closely as possible.
-
- """
- # if height < factor or width < factor:
- # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
- # if int(height < factor//4) + int(width < factor//4):
- # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
-
- if height < factor:
- logging.debug(f"smart_resize: height={height} < factor={factor}, reset height=factor")
- width = round((width * factor) / height)
- height = factor
-
- if width < factor:
- logging.debug(f"smart_resize: width={width} < factor={factor}, reset width=factor")
- height = round((height * factor) / width)
- width = factor
-
- if max(height, width) / min(height, width) > 200:
- raise ValueError(
- f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
- )
- h_bar = round(height / factor) * factor
- w_bar = round(width / factor) * factor
- if h_bar * w_bar > max_pixels:
- beta = math.sqrt((height * width) / max_pixels)
- h_bar = math.floor(height / beta / factor) * factor
- w_bar = math.floor(width / beta / factor) * factor
- elif h_bar * w_bar < min_pixels:
- beta = math.sqrt(min_pixels / (height * width))
- h_bar = math.ceil(height * beta / factor) * factor
- w_bar = math.ceil(width * beta / factor) * factor
- return h_bar, w_bar
-
-
-class ImageProcessor(BaseImageProcessor):
- model_input_names = [
- "pixel_values",
- "image_grid_thw",
- "pixel_values_videos",
- "video_grid_thw",
- ]
-
- def __init__(
- self,
- do_resize: bool = True,
- resample: int = 3,
- do_rescale: bool = True,
- rescale_factor: Union[int, float] = 1 / 255,
- do_normalize: bool = True,
- image_mean: Optional[Union[float, List[float]]] = None,
- image_std: Optional[Union[float, List[float]]] = None,
- do_convert_rgb: bool = True,
- min_pixels: int = 28 * 28 * 130,
- max_pixels: int = 28 * 28 * 1280,
- patch_size: int = 14,
- temporal_patch_size: int = 1,
- merge_size: int = 2,
- **kwargs,
- ) -> None:
- super().__init__()
- self.do_resize = do_resize
- self.resample = resample
- self.do_rescale = do_rescale
- self.rescale_factor = rescale_factor
- self.do_normalize = do_normalize
- self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
- self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
- self.min_pixels = min_pixels
- self.max_pixels = max_pixels
- self.patch_size = patch_size
- self.temporal_patch_size = temporal_patch_size
- self.merge_size = merge_size
- self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used
- self.do_convert_rgb = do_convert_rgb
-
- @classmethod
- def from_pretrained(cls, pretrained_model_dir):
- pretrained_model_dir = Path(pretrained_model_dir)
- image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
- with open(image_processor_config_path, "r", encoding="utf-8") as f:
- image_processor_config = json.load(f)
- return cls(**image_processor_config)
-
- def _preprocess(
- self,
- images,
- do_resize: Optional[bool] = None,
- do_rescale: Optional[bool] = None,
- rescale_factor: Optional[float] = None,
- do_normalize: Optional[bool] = None,
- image_mean: Optional[Union[float, List[float]]] = None,
- image_std: Optional[Union[float, List[float]]] = None,
- do_convert_rgb: Optional[bool] = None,
- ):
- images = make_list_of_images(images)
-
- if do_convert_rgb:
- images = [image.convert("RGB") for image in images]
-
- width, height = images[0].size
- resized_height, resized_width = height, width
- processed_images = []
-
- for image in images:
- if do_resize:
- resized_height, resized_width = smart_resize(
- height,
- width,
- factor=self.patch_size * self.merge_size,
- min_pixels=self.min_pixels,
- max_pixels=self.max_pixels,
- )
-
- image = image.resize((resized_width, resized_height), resample=self.resample)
-
- image = to_numpy_array(image)
-
- if do_rescale:
- image = (image * rescale_factor).astype(np.float32)
-
- if do_normalize:
- image = image.astype(np.float32)
- image -= np.array(image_mean, dtype=np.float32)
- image /= np.array(image_std, dtype=np.float32)
-
- processed_images.append(image)
-
- patches = np.array(processed_images)
- patches = patches.transpose(0, 3, 1, 2)
- if patches.shape[0] == 1:
- patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
- channel = patches.shape[1]
- grid_t = patches.shape[0] // self.temporal_patch_size
- grid_h, grid_w = (
- resized_height // self.patch_size,
- resized_width // self.patch_size,
- )
-
- patches = patches.reshape(
- grid_t,
- self.temporal_patch_size,
- channel,
- grid_h,
- self.patch_size,
- grid_w,
- self.patch_size,
- )
- patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
- assert self.temporal_patch_size == 1
- flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
- return flatten_patches, np.array([grid_t, grid_h, grid_w])
-
- def preprocess(
- self,
- images,
- videos=None,
- do_resize: Optional[bool] = None,
- size: Optional[Dict[str, int]] = None,
- do_rescale: Optional[bool] = None,
- rescale_factor: Optional[float] = None,
- do_normalize: Optional[bool] = None,
- image_mean: Optional[Union[float, List[float]]] = None,
- image_std: Optional[Union[float, List[float]]] = None,
- do_convert_rgb: Optional[bool] = None,
- return_tensors=None,
- ):
- do_resize = do_resize if do_resize is not None else self.do_resize
- size = size if size is not None else self.size
- do_rescale = do_rescale if do_rescale is not None else self.do_rescale
- rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
- do_normalize = do_normalize if do_normalize is not None else self.do_normalize
- image_mean = image_mean if image_mean is not None else self.image_mean
- image_std = image_std if image_std is not None else self.image_std
- do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
-
- if videos is not None:
- raise NotImplementedError("Videos are not yet supported")
-
- patches, image_grid_thw = self._preprocess(
- images,
- do_resize=do_resize,
- do_rescale=do_rescale,
- rescale_factor=rescale_factor,
- do_normalize=do_normalize,
- image_mean=image_mean,
- image_std=image_std,
- do_convert_rgb=do_convert_rgb,
- )
- pixel_values = np.array(patches)
- data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
- return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/paddleocr_vl_processor.py b/fastdeploy/input/v1/paddleocr_vl_processor/paddleocr_vl_processor.py
deleted file mode 100644
index f7d7cfbd2b1..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/paddleocr_vl_processor.py
+++ /dev/null
@@ -1,322 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import numpy as np
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.text_processor import DataProcessor as TextProcessor
-from fastdeploy.utils import data_processor_logger
-
-from .process import DataProcessor
-
-_SAMPLING_EPS = 1e-5
-from fastdeploy.input.utils import process_stop_token_ids
-
-
-class PaddleOCRVLProcessor(TextProcessor):
- """
- PaddleOCR Vision-Language processor for handling multimodal inputs.
-
- This processor extends TextProcessor to support:
- - Image processing
- - Multimodal feature extraction
- - Tokenization and position encoding
- - Request processing and model input generation
-
- Attributes:
- processor (DataProcessor): Underlying data processor instance
- tokenizer: Text tokenizer instance
- limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
- """
-
- def __init__(
- self,
- config,
- model_name_or_path,
- limit_mm_per_prompt=None,
- mm_processor_kwargs=None,
- reasoning_parser_obj=None,
- tool_parser_obj=None,
- enable_processor_cache=False,
- ):
- """
- Initialize PaddleOCRVLProcessor instance.
-
- Args:
- config: Model configuration object
- model_name_or_path (str): Pretrained model name or path
- limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
- mm_processor_kwargs (dict, optional): Multimodal processor arguments
- reasoning_parser_obj: Reasoning parser instance
- tool_parser_obj: Tool parser instance
- """
- super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)
- data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
- processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
- self.processor = DataProcessor(
- model_path=model_name_or_path,
- enable_processor_cache=enable_processor_cache,
- tokens_per_second=config.vision_config.tokens_per_second,
- tokenizer=self.tokenizer,
- **processor_kwargs,
- )
- self.image_patch_id = self.processor.image_patch_id
- self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
-
- def process_request(self, request, max_model_len=None, **kwargs):
- """
- Process incoming request and generate model inputs.
-
- Args:
- request: Input request object
- max_model_len (int, optional): Maximum context length
- **kwargs: Additional processing parameters
-
- Returns:
- Request: Processed request with model inputs
- """
- task = request.to_dict()
- task["enable_thinking"] = kwargs.get("enable_thinking", False)
- self.process_request_dict(task, max_model_len)
- request = Request.from_dict(task)
- request = self._apply_default_parameters(request)
- return request
-
- def _parse_processor_kwargs(self, kwargs):
- """
- Parse and validate multimodal processor arguments.
-
- Args:
- kwargs (dict): Processor configuration arguments
-
- Returns:
- dict: Validated processor arguments
-
- Raises:
- ValueError: If arguments format is invalid
- """
- if not kwargs:
- return {}
-
- try:
- if not isinstance(kwargs, dict):
- raise ValueError("mm-processor-kwargs must be a dictionary")
-
- # Validate kwargs types against expected schema
- data_processor_logger.info(f"Processing kwargs: {kwargs}")
- expected_types = {
- "video_max_frames": int, # Maximum video frames parameter
- "video_min_frames": int, # Minimum video frames parameter
- }
-
- for key, value in kwargs.items():
- if key in expected_types and not isinstance(value, expected_types[key]):
- raise ValueError(
- f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
- )
-
- return kwargs
-
- except Exception as e:
- data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
- return {}
-
- def _parse_limits(self, limits):
- """
- Parse and validate multimodal input limits.
-
- Args:
- limits (dict): Input limits configuration
-
- Returns:
- dict: Validated limits with defaults
-
- Raises:
- ValueError: If limits format is invalid
- """
- DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}
-
- if not limits:
- return DEFAULT_LIMITS
-
- try:
- if not isinstance(limits, dict):
- raise ValueError("limit-mm-per-prompt must be a dictionary")
- data_processor_logger.info(f"_parse_limits:{limits}")
- return {**DEFAULT_LIMITS, **limits}
- except Exception as e:
- data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
- return DEFAULT_LIMITS
-
- def _check_mm_limits(self, item):
- """
- Validate multimodal inputs against configured limits.
-
- Args:
- item: Input request item to validate
-
- Raises:
- ValueError: If input exceeds configured limits
- """
- if isinstance(item, dict):
- # 请求包含prompt和multi_modal_data
- mm_data = item
- else:
- # 请求包含messages
- mm_data = {"image": [], "video": []}
-
- for message in item:
- if isinstance(message.get("content"), list):
- for part in message["content"]:
- if part.get("type") in ["image_url", "image"]:
- mm_data["image"].append(part)
- elif part.get("type") in ["video_url", "video"]:
- mm_data["video"].append(part)
-
- for modality, data in mm_data.items():
- if modality in self.limit_mm_per_prompt:
- limit = self.limit_mm_per_prompt[modality]
- if len(data) > limit:
- raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
-
- def process_request_dict(self, request, max_model_len=None, **kwargs):
- """
- Process request dictionary into model inputs.
-
- Args:
- request (dict): Input request dictionary
- max_model_len (int, optional): Maximum context length
-
- Returns:
- dict: Processed request with model inputs
-
- Raises:
- ValueError: If request format is invalid
- """
-
- request = self._apply_default_parameters(request)
- if not request.eos_token_ids:
- request.eos_token_ids = self.eos_token_ids
-
- # processing stop_sequences and stop_token_ids
- process_stop_token_ids(request, self.update_stop_seq)
-
- if request.prompt:
- multimodal_data = request.multimodal_data
- if multimodal_data is None:
- multimodal_data = {}
- self._check_mm_limits(multimodal_data)
- images = multimodal_data.get("image", None)
- videos = multimodal_data.get("video", None)
- outputs = self.processor.text2ids(request.prompt, images, videos)
-
- elif request.messages:
- messages = request.messages
- self._check_mm_limits(messages)
- outputs = self.processor.request2ids(request)
-
- else:
- raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
- metadata = request.metadata
- # Handle continuation of previous generation by appending existing tokens
- if metadata and metadata.get("generated_token_ids"):
- self.append_generated_tokens(outputs, metadata["generated_token_ids"])
- outputs = self.pack_outputs(outputs)
-
- request.prompt_token_ids = outputs["input_ids"].tolist()
- request.prompt_token_ids_len = len(request.prompt_token_ids)
- request.multimodal_inputs = outputs
-
- # Handle prompt truncation if exceeds model context length
- if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
- request.prompt_token_ids = request.prompt_token_ids[
- : max_model_len - 1
- ] # Leave space for at least 1 new token
-
- # Set default max_tokens if not specified
- max_tokens = max_model_len - len(request.prompt_token_ids)
- if getattr(request.sampling_params, "max_tokens", None) is None:
- request.sampling_params.max_tokens = max(1, max_tokens)
- else:
- request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
-
- if request.sampling_params.top_p is not None and request.sampling_params.top_p < _SAMPLING_EPS:
- request.sampling_params.top_p = _SAMPLING_EPS
- request.sampling_params.top_k = 1
-
- if self.reasoning_parser:
- model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
- parts = request.request_id.split("_")
- if len(parts) > 1:
- real_req_id = parts[0]
- index = int(parts[1])
- n = request.get("n", 1)
- for idx in range(index * n, (index + 1) * n):
- self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
- else:
- self.model_status_dict[request.request_id] = model_status
- request.enable_thinking = model_status == "think_start"
-
- return request
-
- def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
- """
- Append generated tokens to existing outputs.
-
- Args:
- outputs: Current model outputs
- generated_token_ids: Generated tokens to append
- """
- num_tokens = len(generated_token_ids)
- multimodal_inputs["input_ids"].extend(generated_token_ids)
- multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
-
- pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
- multimodal_inputs["position_ids"].append(pos_ids)
- multimodal_inputs["cur_position"] += num_tokens
-
- def pack_outputs(self, outputs):
- """
- Prepare final output dictionary for model.
-
- Args:
- outputs: Intermediate processing outputs
-
- Returns:
- dict: Packed output dictionary with all required fields
- """
- if not outputs["images"]:
- outputs["images"] = None # No images case
- outputs["grid_thw"] = None # No spatial dimensions
- outputs["image_type_ids"] = None # No type IDs
- else:
- outputs["images"] = np.vstack(outputs["images"]) # Stack image features vertically
- outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) # Stack spatial dimensions
- outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) # Convert to numpy array
-
- # Convert all outputs to numpy arrays with appropriate types
- outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) # Token IDs as int64
- outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) # Type IDs as int64
- outputs["position_ids"] = np.concatenate(
- outputs["position_ids"], axis=1, dtype=np.int64
- ) # Concatenate position ID
-
- outputs["image_patch_id"] = self.processor.image_token_id
- outputs["video_patch_id"] = self.processor.video_token_id
- outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
- outputs["mm_num_token_func"] = self.processor.mm_num_tokens
- return outputs
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/process.py b/fastdeploy/input/v1/paddleocr_vl_processor/process.py
deleted file mode 100644
index ea6b63ee9d8..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/process.py
+++ /dev/null
@@ -1,622 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import pickle
-from typing import Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import paddle
-import zmq
-from paddleformers.transformers import AutoTokenizer
-from PIL import Image
-
-from fastdeploy.engine.request import ImagePosition, Request
-from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
-from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.multimodal.hasher import MultimodalHasher
-from fastdeploy.utils import data_processor_logger
-
-from .image_processor import ImageProcessor
-from .process_video import sample_frames
-
-
-class DataProcessor(MMBaseDataProcessor):
- """
- Processes multimodal inputs (text, images, videos) into model-ready formats.
-
- Handles:
- - Tokenization of text with special tokens for visual content
- - Image and video preprocessing
- - Generation of 3D positional embeddings
- - Conversion of chat messages to model inputs
-
- Attributes:
- tokenizer: Text tokenizer instance
- image_processor: Image/video preprocessor
- image_token: Special token for image placeholders
- video_token: Special token for video placeholders
- vision_start: Token marking start of visual content
- """
-
- def __init__(
- self,
- model_path: str,
- enable_processor_cache: bool = False,
- video_min_frames: int = 4,
- video_max_frames: int = 768,
- video_target_frames: int = -1,
- video_fps: int = -1,
- tokens_per_second: int = 2,
- tokenizer=None,
- **kwargs,
- ) -> None:
- """
- Initialize the data processor.
-
- Args:
- model_path: Path to pretrained model
- video_min_frames: Minimum frames to sample from videos
- video_max_frames: Maximum frames to sample from videos
- tokens_per_second: Temporal resolution for positional embeddings
- **kwargs: Additional configuration
- """
- super().__init__()
- self.min_frames = video_min_frames
- self.max_frames = video_max_frames
- self.target_frames = video_target_frames
- self.fps = video_fps
-
- # Initialize tokenizer with left padding and fast tokenizer
- if tokenizer is None:
- self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True)
- self.tokenizer.ignored_index = -100 # Set ignored index for loss calculation
- else:
- self.tokenizer = tokenizer
- self.image_processor = ImageProcessor.from_pretrained(model_path) # Initialize image processor
- self.enable_processor_cache = enable_processor_cache
-
- # Convolution sizes for patch aggregation
- self.spatial_conv_size = self.image_processor.merge_size
- self.temporal_conv_size = self.image_processor.temporal_patch_size
-
- # Special tokens and IDs
- self.image_token = "<|IMAGE_PLACEHOLDER|>"
- self.video_token = "<|video_pad|>"
-
- self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
- self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
- self.image_patch_id = self.image_token_id
-
- self.vision_start = "<|IMAGE_START|>"
- self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start)
-
- self.tokens_per_second = tokens_per_second
-
- self.role_prefixes = {
- "system": "",
- "user": "User: ",
- "bot": "Assistant: ",
- "assistant": "Assistant: ",
- }
-
- @staticmethod
- def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
- """
- Calculate the number of tokens in the multimodal input.
- """
- if isinstance(grid_thw, paddle.Tensor):
- grid_thw = grid_thw.numpy()
-
- if len(grid_thw) == 0:
- return 0
-
- def calc_one(thw):
- t, h, w = map(int, thw)
- return t * h * w // 4
-
- if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
- return [calc_one(x) for x in grid_thw]
-
- return calc_one(grid_thw)
-
- def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
- """
- Convert text with image/video placeholders into model inputs.
-
- Args:
- text: Input text with <|image@placeholder|> and <|video@placeholder|> markers
- images: List of PIL Images corresponding to image placeholders
- videos: List of video data corresponding to video placeholders
- image_uuid: List of unique identifiers for each image, used for caching or hashing.
- video_uuid: List of unique identifiers for each video, used for caching or hashing.
-
- Returns:
- Dict containing:
- - input_ids: Token IDs
- - token_type_ids: Type identifiers (text/image/video)
- - position_ids: 3D positional embeddings
- - images: Preprocessed visual features
- - grid_thw: Spatial/temporal dimensions
- - image_type_ids: Visual content type (0=image, 1=video)
- """
-
- outputs = {
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "labels": [],
- "cur_position": 0,
- "video_cnt": 0,
- "num_input_image_tokens": 0,
- "num_input_video_tokens": 0,
- "fps": [],
- "mm_positions": [],
- "mm_hashes": [],
- "vit_seqlen": [],
- "vit_position_ids": [],
- }
-
- # Define placeholders and their lengths
- IMAGE_PLACEHOLDER = self.image_token
- VIDEO_PLACEHOLDER = self.video_token
- IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
- VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
-
- # Initialize tracking variables for text parsing
- st, image_idx, video_idx = 0, 0, 0 # Start position, image counter, video counter
- while st < len(text):
- # Find next image or video placeholder in text
- image_pos = text.find(IMAGE_PLACEHOLDER, st)
- image_pos = len(text) if image_pos == -1 else image_pos # Set to end if not found
- video_pos = text.find(VIDEO_PLACEHOLDER, st)
- video_pos = len(text) if video_pos == -1 else video_pos # Set to end if not found
- ed = min(image_pos, video_pos) # End position is first placeholder found
-
- self._add_text(text[st:ed], outputs)
- if ed == len(text):
- break
-
- if ed == image_pos:
- image = images[image_idx]
- uuid = image_uuid[image_idx] if image_uuid else None
- if not isinstance(image, tuple):
- self._add_image(image, outputs, uuid)
- else:
- self._add_processed_image(image, outputs, uuid)
- image_idx += 1
- st = ed + IMAGE_PLACEHOLDER_LEN
- else:
- item = videos[video_idx]
- uuid = video_uuid[video_idx] if video_uuid else None
- if not isinstance(item, tuple):
- if isinstance(item, dict):
- frames, meta = self._load_and_process_video(item["video"], item)
- else:
- frames, meta = self._load_and_process_video(item, {})
- self._add_video(frames, meta, outputs, uuid)
- else:
- # cached frames are already processed
- self._add_processed_video(item, outputs, uuid)
- video_idx += 1
- st = ed + VIDEO_PLACEHOLDER_LEN
-
- return outputs
-
- def request2ids(
- self, request: Request, tgts: List[str] = None
- ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
- """
- Convert chat request with multimodal messages into model inputs.
-
- Args:
- request: Dictionary containing:
- - messages: List of chat messages with text/image/video content
- - request_id: Unique identifier for logging
- tgts: Optional target sequences
-
- Returns:
- Dict with same structure as text2ids() output
- """
-
- # Parse and validate chat messages
- messages = parse_chat_messages(request.messages)
- mm_items = []
- for msg in messages:
- role = msg.get("role")
- assert role in self.role_prefixes, f"Unsupported role: {role}"
-
- # Normalize content to list format
- content = msg.get("content")
- if not isinstance(content, list):
- content = [content]
- # Collect all visual content items
- for item in content:
- if item.get("type") in ["image", "video"]:
- mm_items.append(item)
-
- missing_hashes, missing_idx = [], []
- for idx, item in enumerate(mm_items):
- if not item.get("data"):
- # raw data not provided, should be retrieved from processor cache
- missing_hashes.append(item.get("uuid"))
- missing_idx.append(idx)
-
- if len(missing_hashes) > 0 and not self.enable_processor_cache:
- raise ValueError("Missing items cannot be retrieved without processor cache.")
-
- if self.enable_processor_cache:
- context = zmq.Context()
- dealer = context.socket(zmq.DEALER)
- dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
- missing_items = self.get_processor_cache(dealer, missing_hashes)
- for idx in range(len(missing_items)):
- if not missing_items[idx]:
- raise ValueError(f"Missing item {idx} not found in processor cache")
- mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
- images, videos = [], []
- image_uuid, video_uuid = [], []
- for item in mm_items:
- if item.get("type") == "image":
- images.append(item["data"])
- image_uuid.append(item["uuid"])
- elif item.get("type") == "video":
- videos.append(item["data"])
- video_uuid.append(item["uuid"])
- else:
- raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
-
- if self.tokenizer.chat_template is None:
- raise ValueError("This model does not support chat template.")
-
- chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
- prompt = self.tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True,
- **chat_template_kwargs,
- )
- request.prompt_tokens = prompt
-
- outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
-
- if self.enable_processor_cache:
- missing_idx = set(missing_idx)
- hashes_to_cache, items_to_cache = [], []
- for idx in range(len(mm_items)):
- if idx in missing_idx:
- continue
- meta = {}
- t, h, w = outputs["grid_thw"][idx]
- meta["thw"] = (t, h, w)
- meta["fps"] = outputs["fps"][idx]
- hashes_to_cache.append(outputs["mm_hashes"][idx])
- items_to_cache.append((outputs["images"][idx], meta))
- self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
- return outputs
-
- def _add_text(self, tokens, outputs: Dict) -> None:
- """
- Add text tokens to model inputs dictionary.
-
- Args:
- tokens: Text string or already tokenized IDs
- outputs: Dictionary accumulating model inputs
-
- Note:
- - Handles both raw text and pre-tokenized inputs
- - Updates position IDs for 3D embeddings
- """
- if not tokens:
- return None
-
- if isinstance(tokens, str):
- tokens_str = self.tokenizer.tokenize(tokens)
- tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
-
- num_tokens = len(tokens)
- outputs["input_ids"].extend(tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
-
- pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray:
- """
- Generate 3D positional embeddings for text tokens.
-
- Args:
- start_pos: Starting position index
- num_tokens: Number of tokens to generate positions for
-
- Returns:
- numpy.ndarray: 3D position IDs shaped (3, num_tokens)
- """
- text_array = np.arange(num_tokens).reshape(1, -1)
- text_index = np.broadcast_to(text_array, (3, num_tokens))
- position = text_index + start_pos
- return position
-
- def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
- """
- Add image data to model inputs dictionary.
-
- Args:
- img: PIL Image to process
- outputs: Dictionary accumulating model inputs
-
- Note:
- - Preprocesses image and calculates spatial dimensions
- - Adds image token IDs and type markers
- - Generates appropriate position embeddings
- """
- ret = self.image_processor.preprocess(images=[img.convert("RGB")])
- num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
- grid_thw = ret["grid_thw"].tolist()
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_token_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
- outputs["num_input_image_tokens"] += int(num_tokens)
-
- outputs["images"].append(ret["pixel_values"])
- if not uuid:
- outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
- else:
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(grid_thw)
- outputs["image_type_ids"].append(0)
-
- # position_ids
- t, h, w = grid_thw
- pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
- outputs["fps"].append(0)
- numel = h * w
- outputs["vit_seqlen"].append(numel)
- outputs["vit_position_ids"].append(np.arange(numel) % numel)
-
- def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
- img, meta = img_cache
- num_tokens = img.shape[0] // self.image_processor.merge_size**2
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
- _, h, w = meta["thw"]
- pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["images"].append(img)
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(np.array([[1, h, w]]))
- outputs["image_type_ids"].append(0)
-
- outputs["fps"].append(0)
-
- def _add_video(self, frames, meta: Dict, outputs: Dict, uuid: Optional[str]) -> None:
- """
- Add video data to model inputs dictionary.
-
- Args:
- frames: Video frames as numpy array
- meta: Video metadata containing fps/duration
- outputs: Dictionary accumulating model inputs
-
- Note:
- - Handles temporal dimension in position embeddings
- - Uses video-specific token IDs and type markers
- """
- ret = self.image_processor.preprocess(images=frames)
-
- num_tokens = ret["image_grid_thw"].prod() // self.image_processor.merge_size**2
- grid_thw = ret["image_grid_thw"].tolist()
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.video_token_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
- outputs["num_input_video_tokens"] += int(num_tokens)
-
- outputs["images"].append(ret["pixel_values"])
- if not uuid:
- outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
- else:
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(grid_thw)
- outputs["image_type_ids"].extend([1] * grid_thw[0])
-
- fps = meta["fps"]
- second_per_grid_t = self.temporal_conv_size / fps
- t, h, w = grid_thw
- pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
-
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
- outputs["fps"].append(fps)
- numel = h * w
- outputs["vit_seqlen"].append(numel)
- outputs["vit_position_ids"].append(np.arange(numel) % numel)
-
- def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
- frames, meta = frames_cache
- num_tokens = frames.shape[0] // self.image_processor.merge_size**2
-
- t, h, w = meta["thw"]
- outputs["images"].append(frames)
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(np.array([[t, h, w]]))
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
- outputs["image_type_ids"].extend([1] * t)
-
- fps = meta["fps"]
- second_per_grid_t = self.temporal_conv_size / fps
- pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["fps"].append(fps)
-
- def _compute_vision_positions(
- self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float
- ) -> np.ndarray:
- """
- Generate 3D position IDs for visual inputs.
-
- Args:
- start_pos: Base position in sequence
- t: Temporal patches (1 for images)
- h: Height in patches
- w: Width in patches
- second_per_grid_t: Time per temporal patch
-
- Returns:
- np.ndarray: Position IDs for [t,h,w] dimensions
- """
- h //= self.spatial_conv_size
- w //= self.spatial_conv_size
-
- tn = np.arange(t).reshape(-1, 1)
- tn = np.broadcast_to(tn, (t, h * w))
- tn = tn * int(second_per_grid_t) * self.tokens_per_second
- t_index = tn.flatten()
-
- hn = np.arange(h).reshape(1, -1, 1)
- h_index = np.broadcast_to(hn, (t, h, w)).flatten()
-
- wn = np.arange(w).reshape(1, 1, -1)
- w_index = np.broadcast_to(wn, (t, h, w)).flatten()
-
- position = np.stack([t_index, h_index, w_index]) + start_pos
- return position
-
- def _load_and_process_video(self, url: str, item: Dict) -> Tuple[np.ndarray, Dict]:
- """
- Load and preprocess video into frames.
-
- Args:
- url: Video file path or bytes
- item: Dictionary containing processing parameters
-
- Returns:
- tuple: (frames, metadata) where:
- - frames: Processed video frames as numpy array
- - metadata: Updated video metadata dictionary
- """
- reader, meta, _ = read_video_decord(url, save_to_disk=False)
-
- # Apply frame sampling if fps or target_frames specified
- fps = item.get("fps", self.fps)
- num_frames = item.get("target_frames", self.target_frames)
-
- frame_indices = list(range(meta["num_of_frame"]))
- if fps > 0 or num_frames > 0:
- # Get frame sampling constraints
- min_frames = item.get("min_frames", self.min_frames)
- max_frames = item.get("max_frames", self.max_frames)
-
- # Sample frames according to specifications
- frame_indices = sample_frames(
- frame_factor=self.temporal_conv_size, # Ensure divisible by temporal patch size
- min_frames=min_frames,
- max_frames=max_frames,
- metadata=meta,
- fps=fps,
- num_frames=num_frames,
- )
-
- # Update metadata with new frame count and fps
- meta["num_of_frame"] = len(frame_indices)
- if fps is not None:
- meta["fps"] = fps # Use specified fps
- meta["duration"] = len(frame_indices) / fps
- else:
- meta["fps"] = len(frame_indices) / meta["duration"] # Calculate fps from sampled frames
-
- frames = []
- for idx in frame_indices:
- frame = reader[idx].asnumpy()
- image = Image.fromarray(frame, "RGB")
- frames.append(image)
- frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-
- return frames, meta
-
- def get_processor_cache(self, socket, mm_hashes: list[str]) -> list:
- """
- get cache correspond to given hash values
- """
- req = pickle.dumps(mm_hashes)
- socket.send_multipart([b"", req])
- _, resp = socket.recv_multipart()
- mm_items = pickle.loads(resp)
- data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
-
- return mm_items
-
- def update_processor_cache(self, socket, mm_hashes: list[str], mm_items):
- """
- update cache data
- """
- req = pickle.dumps((mm_hashes, mm_items))
- socket.send_multipart([b"", req])
- data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
-
- def apply_chat_template(self, request):
- """
- Apply chat template to convert messages into token sequence.
-
- Args:
- request: Dictionary containing chat messages
-
- Returns:
- List of token IDs
-
- Raises:
- ValueError: If model doesn't support chat templates
- """
- if self.tokenizer.chat_template is None:
- raise ValueError("This model does not support chat_template.")
-
- raw_prompt = self.tokenizer.apply_chat_template(
- request["messages"],
- tokenize=False,
- add_generation_prompt=request.get("add_generation_prompt", True),
- chat_template=request.get("chat_template", None),
- )
- prompt_token_str = raw_prompt.replace(self.image_token, "").replace(self.video_token, "")
- request["text_after_process"] = raw_prompt
-
- tokens = self.tokenizer.tokenize(prompt_token_str)
- token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
- data_processor_logger.info(
- f"req_id:{request.get('request_id', ''), } prompt: {raw_prompt} tokens: {tokens}, token_ids: {token_ids}"
- )
- return token_ids
diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/process_video.py b/fastdeploy/input/v1/paddleocr_vl_processor/process_video.py
deleted file mode 100644
index c7089d26dc2..00000000000
--- a/fastdeploy/input/v1/paddleocr_vl_processor/process_video.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import math
-from typing import Optional, Union
-
-import numpy as np
-
-
-def sample_frames(
- frame_factor: int,
- min_frames: int,
- max_frames: int,
- metadata: Optional[dict] = None,
- fps: Optional[Union[int, float]] = None,
- num_frames: Optional[int] = None,
-):
- """
- Sample frames from video according to specified criteria.
-
- Args:
- frame_factor: Ensure sampled frames are multiples of this factor
- min_frames: Minimum number of frames to sample
- max_frames: Maximum number of frames to sample
- metadata: Video metadata containing fps information
- fps: Target frames per second for sampling
- num_frames: Exact number of frames to sample
-
- Returns:
- np.ndarray: Sampled video frames
-
- Raises:
- ValueError: If both fps and num_frames are specified,
- or if required metadata is missing,
- or if requested frames exceed available frames
- """
- if fps > 0 and num_frames > 0:
- raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
-
- total_num_frames = metadata["num_of_frame"]
-
- # If num_frames is not given but fps is, calculate num_frames from fps
- if num_frames > 0:
- num_frames = round(num_frames / frame_factor) * frame_factor
- elif fps > 0:
- if metadata is None:
- raise ValueError(
- "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
- "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
- )
- max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
- num_frames = total_num_frames / metadata["fps"] * fps
- num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
- num_frames = math.floor(num_frames / frame_factor) * frame_factor
- if num_frames > total_num_frames:
- raise ValueError(
- f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
- "Decrease `num_frames` or `fps` for sampling."
- )
-
- # Calculate frame indices based on sampling strategy
- if num_frames > 0:
- # Evenly spaced sampling for target frame count
- indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
- else:
- # Keep all frames if no sampling requested
- indices = np.arange(0, total_num_frames).astype(np.int32)
-
- return indices
diff --git a/fastdeploy/input/v1/qwen3_vl_processor/__init__.py b/fastdeploy/input/v1/qwen3_vl_processor/__init__.py
deleted file mode 100644
index 9f959610c8d..00000000000
--- a/fastdeploy/input/v1/qwen3_vl_processor/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .process import DataProcessor
-from .qwen3_vl_processor import Qwen3VLProcessor
-
-__all__ = [
- "DataProcessor",
- "Qwen3VLProcessor",
-]
diff --git a/fastdeploy/input/v1/qwen3_vl_processor/image_processor.py b/fastdeploy/input/v1/qwen3_vl_processor/image_processor.py
deleted file mode 100644
index 167f3e340db..00000000000
--- a/fastdeploy/input/v1/qwen3_vl_processor/image_processor.py
+++ /dev/null
@@ -1,413 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import math
-from typing import List, Optional, Union
-
-import numpy as np
-import paddle
-import PIL
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_transforms import (
- normalize,
- rescale,
- resize,
- to_channel_dimension_format,
-)
-from paddleformers.transformers.image_utils import (
- ChannelDimension,
- ImageInput,
- PILImageResampling,
- get_image_size,
- infer_channel_dimension_format,
- make_list_of_images,
- to_numpy_array,
- valid_images,
-)
-from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
-from PIL import Image
-
-from fastdeploy.utils import data_processor_logger
-
-IMAGE_MEAN = [0.5, 0.5, 0.5]
-IMAGE_STD = [0.5, 0.5, 0.5]
-
-MIN_PIXELS = 65536
-MAX_PIXELS = 16777216
-
-
-VideoInput = Union[
- List["PIL.Image.Image"],
- "np.ndarray",
- "paddle.Tensor",
- List["np.ndarray"],
- List["paddle.Tensor"],
- List[List["PIL.Image.Image"]],
- List[List["np.ndarray"]],
- List[List["paddle.Tensor"]],
-]
-
-
-def round_by_factor(number: int, factor: int) -> int:
- return round(number / factor) * factor
-
-
-def ceil_by_factor(number: int, factor: int) -> int:
- return math.ceil(number / factor) * factor
-
-
-def floor_by_factor(number: int, factor: int) -> int:
- return math.floor(number / factor) * factor
-
-
-def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
- """
- Smart image resizing that maintains aspect ratio and respects constraints.
-
- Args:
- height: Original image height
- width: Original image width
- factor: Patch size factor
- min_pixels: Minimum allowed pixels
- max_pixels: Maximum allowed pixels
- max_ratio: Maximum allowed aspect ratio
-
- Returns:
- tuple: (new_height, new_width)
-
- Raises:
- ValueError: If calculated dimensions are invalid
- """
- if max(height, width) / min(height, width) > max_ratio:
- if height > width:
- new_width = max(factor, round_by_factor(width, factor))
- new_height = floor_by_factor(new_width * max_ratio, factor)
- else:
- new_height = max(factor, round_by_factor(height, factor))
- new_width = floor_by_factor(new_height * max_ratio, factor)
-
- data_processor_logger.info(
- f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
- resize to {max(new_height, new_width) / min(new_height, new_width)}"
- )
-
- height = new_height
- width = new_width
-
- h_bar = max(factor, round_by_factor(height, factor))
- w_bar = max(factor, round_by_factor(width, factor))
- if h_bar * w_bar > max_pixels:
- beta = math.sqrt((height * width) / max_pixels)
- h_bar = floor_by_factor(height / beta, factor)
- w_bar = floor_by_factor(width / beta, factor)
- elif h_bar * w_bar < min_pixels:
- beta = math.sqrt(min_pixels / (height * width))
- h_bar = ceil_by_factor(height * beta, factor)
- w_bar = ceil_by_factor(width * beta, factor)
-
- if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
- raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
-
- return h_bar, w_bar
-
-
-def is_scaled_image(image: np.ndarray) -> bool:
- """
- Check if image pixel values are already normalized to [0, 1] range.
-
- Args:
- image: Input image array
-
- Returns:
- bool: True if image is already scaled
- """
- if image.dtype == np.uint8:
- return False
-
- # It's possible the image has pixel values in [0, 255] but is of floating type
- return np.min(image) >= 0 and np.max(image) <= 1
-
-
-class ImageProcessor(BaseImageProcessor):
- """
- Adaptive image processor for dynamic image resizing and preprocessing.
-
- This processor handles image resizing, rescaling, normalization and format conversion.
- It dynamically adjusts image dimensions based on original size and specified constraints.
- """
-
- def __init__(
- self,
- patch_size: int = 16,
- merge_size: int = 2,
- temporal_patch_size: int = 2,
- min_pixels: int = MIN_PIXELS,
- max_pixels: int = MAX_PIXELS,
- image_mean: Union[float, List[float]] = IMAGE_MEAN,
- image_std: Union[float, List[float]] = IMAGE_STD,
- rescale_factor: float = 1 / 255,
- do_rescale: bool = True,
- do_normalize: bool = True,
- resample: PILImageResampling = PILImageResampling.BICUBIC,
- **kwargs,
- ) -> None:
- """
- Initialize image processor with configuration parameters.
-
- Args:
- patch_size (int): Spatial patch size for vision encoder
- merge_size (int): Merge size between vision and LLM encoders
- temporal_patch_size (int): Temporal patch size for video processing
- min_pixels (int): Minimum allowed pixels in resized image
- max_pixels (int): Maximum allowed pixels in resized image
- image_mean (float/list): Mean values for normalization per channel
- image_std (float/list): Std values for normalization per channel
- rescale_factor (float): Scaling factor for pixel values (default 1/255)
- do_rescale (bool): Whether to rescale images
- do_normalize (bool): Whether to normalize images
- resample: Resampling method for image resizing
- **kwargs: Additional base class arguments
- """
- super().__init__(**kwargs)
- self.patch_size = patch_size
- self.merge_size = merge_size
- self.temporal_patch_size = temporal_patch_size
-
- self.min_pixels = min_pixels
- self.max_pixels = max_pixels
-
- self.image_mean = image_mean
- self.image_std = image_std
- self.rescale_factor = rescale_factor
- self.do_rescale = do_rescale
- self.do_normalize = do_normalize
-
- self.resample = resample
-
- def _preprocess(
- self,
- images: Union[ImageInput, VideoInput],
- min_pixels: int,
- max_pixels: int,
- image_mean: Optional[Union[float, List[float]]],
- image_std: Optional[Union[float, List[float]]],
- rescale_factor: float,
- do_rescale: bool,
- do_normalize: bool,
- resample: PILImageResampling,
- data_format: Optional[ChannelDimension],
- input_data_format: Optional[Union[str, ChannelDimension]],
- ):
- """
- Internal method for image preprocessing pipeline.
-
- Args:
- images: Input image or batch of images
- min_pixels: Minimum allowed pixels in output
- max_pixels: Maximum allowed pixels in output
- image_mean: Normalization mean values
- image_std: Normalization std values
- rescale_factor: Pixel value scaling factor
- do_rescale: Whether to rescale pixel values
- do_normalize: Whether to normalize pixel values
- resample: Resampling method
- data_format: Output channel format
- input_data_format: Input channel format
-
- Returns:
- tuple: (flatten_patches, grid_dimensions)
- - flatten_patches: Flattened image patches
- - grid_dimensions: Grid dimensions [t, h, w]
- """
- images = make_list_of_images(images)
-
- # All transformations expect numpy arrays.
- images = [to_numpy_array(image) for image in images]
-
- if is_scaled_image(images[0]) and do_rescale:
- data_processor_logger.warning(
- "It looks like you are trying to rescale already rescaled images. If the input"
- " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
- )
- if input_data_format is None:
- # We assume that all images have the same channel dimension format.
- input_data_format = infer_channel_dimension_format(images[0])
-
- # Get original dimensions and calculate optimal resize dimensions
- height, width = get_image_size(images[0], channel_dim=input_data_format)
- resized_height, resized_width = smart_resize(
- height,
- width,
- factor=self.patch_size * self.merge_size, # Combine patch and merge factors
- min_pixels=min_pixels,
- max_pixels=max_pixels,
- )
-
- processed_images = []
- for image in images:
- if height != resized_height or width != resized_width:
- # Convert to uint8 before resizing to avoid double scaling
- image = image.astype("uint8")
- # Convert to PIL Image and resize
- image = Image.fromarray(image)
- image = resize(
- image,
- size=(resized_height, resized_width),
- resample=resample,
- data_format=input_data_format,
- )
-
- if do_rescale and do_normalize:
- # Adjust mean and std for combined rescale+normalize
- image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
- image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
- do_rescale = False # Skip separate rescale step
-
- # mutual exclusion and upper branch
- if do_rescale:
- image = image.astype(np.float32)
- image = rescale(image, scale=rescale_factor, data_format=input_data_format)
-
- if do_normalize:
- image = image.astype(np.float32)
- image = normalize(
- image=image,
- mean=image_mean,
- std=image_std,
- data_format=input_data_format,
- )
-
- image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
- processed_images.append(image)
-
- # Convert processed images to numpy array
- patches = np.array(processed_images)
-
- # Pad temporal dimension if needed
- if patches.shape[0] % self.temporal_patch_size != 0:
- repeats = np.repeat(
- patches[-1][np.newaxis],
- self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
- axis=0,
- )
- patches = np.concatenate([patches, repeats], axis=0)
-
- # Convert to channels-first format if needed
- if data_format == ChannelDimension.LAST:
- patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W]
-
- grid_t, channel = patches.shape[:2]
- grid_t = grid_t // self.temporal_patch_size
-
- grid_h, grid_w = (
- resized_height // self.patch_size,
- resized_width // self.patch_size,
- )
- # Reshape into hierarchical patch structure
- patches = patches.reshape(
- [
- grid_t,
- self.temporal_patch_size,
- channel,
- grid_h // self.merge_size,
- self.merge_size,
- self.patch_size,
- grid_w // self.merge_size,
- self.merge_size,
- self.patch_size,
- ]
- )
- # Reorder dimensions for better memory access pattern
- # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
- patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
-
- flatten_patches = patches.reshape(
- [
- grid_t * grid_h * grid_w,
- channel * self.temporal_patch_size * self.patch_size * self.patch_size,
- ]
- )
-
- return flatten_patches, np.array([grid_t, grid_h, grid_w])
-
- def preprocess(
- self,
- images: Union[ImageInput, VideoInput],
- min_pixels: Optional[int] = None,
- max_pixels: Optional[int] = None,
- image_mean: Optional[Union[float, List[float]]] = None,
- image_std: Optional[Union[float, List[float]]] = None,
- rescale_factor: Optional[float] = None,
- do_rescale: Optional[bool] = None,
- do_normalize: Optional[bool] = None,
- resample: Optional[PILImageResampling] = None,
- return_tensors: Optional[Union[str, TensorType]] = None,
- data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
- input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
- ):
- """
- Main preprocessing method for images/videos.
-
- Args:
- images: Input image/video data
- min_pixels: Override for minimum pixels
- max_pixels: Override for maximum pixels
- image_mean: Override for normalization mean
- image_std: Override for normalization std
- rescale_factor: Override for rescaling factor
- do_rescale: Override for rescaling flag
- do_normalize: Override for normalization flag
- resample: Override for resampling method
- return_tensors: Desired output tensor format
- data_format: Output channel dimension format
- input_data_format: Input channel dimension format
-
- Returns:
- BatchFeature: Processed features containing:
- - pixel_values: Preprocessed pixel data
- - grid_thw: Grid dimensions [temporal, height, width]
-
- Raises:
- ValueError: For invalid image types or dimensions
- """
- min_pixels = min_pixels if min_pixels is not None else self.min_pixels
- max_pixels = max_pixels if max_pixels is not None else self.max_pixels
- image_mean = image_mean if image_mean is not None else self.image_mean
- image_std = image_std if image_std is not None else self.image_std
- rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
- do_rescale = do_rescale if do_rescale is not None else self.do_rescale
- do_normalize = do_normalize if do_normalize is not None else self.do_normalize
- resample = resample if resample is not None else self.resample
-
- if images is not None and not valid_images(images):
- raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
-
- pixel_values, grid_thw = self._preprocess(
- images,
- min_pixels=min_pixels,
- max_pixels=max_pixels,
- image_mean=image_mean,
- image_std=image_std,
- rescale_factor=rescale_factor,
- do_rescale=do_rescale,
- do_normalize=do_normalize,
- resample=resample,
- data_format=data_format,
- input_data_format=input_data_format,
- )
- data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
- return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/v1/qwen3_vl_processor/process.py b/fastdeploy/input/v1/qwen3_vl_processor/process.py
deleted file mode 100644
index 0731d04972d..00000000000
--- a/fastdeploy/input/v1/qwen3_vl_processor/process.py
+++ /dev/null
@@ -1,814 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import pickle
-from typing import Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import paddle
-import zmq
-from paddleformers.transformers import AutoTokenizer
-from PIL import Image
-
-from fastdeploy.engine.request import ImagePosition, Request
-from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
-from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.multimodal.hasher import MultimodalHasher
-from fastdeploy.utils import data_processor_logger
-
-from .image_processor import ImageProcessor, ceil_by_factor, floor_by_factor
-
-VIDEO_MIN_PIXELS = 128 * 28 * 28
-VIDEO_MAX_PIXELS = 768 * 28 * 28
-FRAME_FACTOR = 2
-FPS = 2.0
-FPS_MIN_FRAMES = 4
-FPS_MAX_FRAMES = 768
-
-
-def sample_frames(
- frame_factor: int,
- min_frames: int,
- max_frames: int,
- metadata: Optional[dict] = None,
- fps: Optional[Union[int, float]] = -1,
- num_frames: Optional[int] = -1,
-):
- """
- Sample frames from video according to specified criteria.
-
- Args:
- frame_factor: Ensure sampled frames are multiples of this factor
- min_frames: Minimum number of frames to sample
- max_frames: Maximum number of frames to sample
- metadata: Video metadata containing fps information
- fps: Target frames per second for sampling
- num_frames: Exact number of frames to sample
-
- Returns:
- np.ndarray: Sampled video frames
-
- Raises:
- ValueError: If both fps and num_frames are specified,
- or if required metadata is missing,
- or if requested frames exceed available frames
- """
- if fps > 0 and num_frames > 0:
- raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
-
- total_num_frames = metadata["num_of_frame"]
-
- # If num_frames is not given but fps is, calculate num_frames from fps
- if num_frames > 0:
- num_frames = round(num_frames / frame_factor) * frame_factor
- elif fps > 0:
- if metadata is None:
- raise ValueError(
- "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
- "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
- )
- # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
- min_frames = ceil_by_factor(min_frames, frame_factor)
- max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
-
- num_frames = total_num_frames / metadata["fps"] * fps
-
- if num_frames > total_num_frames:
- data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
-
- num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
- num_frames = floor_by_factor(num_frames, frame_factor)
-
- if num_frames > total_num_frames:
- raise ValueError(
- f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
- "Decrease `num_frames` or `fps` for sampling."
- )
-
- # Hack code ensures that num_frames can always be divided by 4
- # due to sched/resource_manager_v1.py 中 grid_thw.extend([[2, h, w]] * (t // 2))
- if num_frames > 2 and num_frames % 4 != 0:
- num_frames = (num_frames // 4) * 4 # 向下取整到 4 的倍数
- total_num_frames = (total_num_frames // 4) * 4
- num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
-
- # Calculate frame indices based on sampling strategy
- if num_frames > 0:
- # Evenly spaced sampling for target frame count
- indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
- else:
- # Keep all frames if no sampling requested
- indices = np.arange(0, total_num_frames).astype(np.int32)
-
- return indices
-
-
-class DataProcessor(MMBaseDataProcessor):
- """
- Processes multimodal inputs (text, images, videos) into model-ready formats.
-
- Handles:
- - Tokenization of text with special tokens for visual content
- - Image and video preprocessing
- - Generation of 3D positional embeddings
- - Conversion of chat messages to model inputs
-
- Attributes:
- tokenizer: Text tokenizer instance
- image_processor: Image/video preprocessor
- image_token: Special token for image placeholders
- video_token: Special token for video placeholders
- vision_start: Token marking start of visual content
- """
-
- def __init__(
- self,
- model_path: str,
- enable_processor_cache: bool = False,
- video_min_frames: int = FPS_MIN_FRAMES,
- video_max_frames: int = FPS_MAX_FRAMES,
- video_target_frames: int = -1,
- video_fps: int = FPS,
- tokens_per_second: int = 2,
- tokenizer=None,
- **kwargs,
- ) -> None:
- """
- Initialize the data processor.
-
- Args:
- model_path: Path to pretrained model
- video_min_frames: Minimum frames to sample from videos
- video_max_frames: Maximum frames to sample from videos
- tokens_per_second: Temporal resolution for positional embeddings
- **kwargs: Additional configuration
- """
- super().__init__()
- self.min_frames = video_min_frames
- self.max_frames = video_max_frames
- self.target_frames = video_target_frames
- self.fps = video_fps
- self.frame_factor = FRAME_FACTOR
-
- # Initialize tokenizer with left padding and fast tokenizer
- if tokenizer is None:
- self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True)
- self.tokenizer.ignored_index = -100 # Set ignored index for loss calculation
- else:
- self.tokenizer = tokenizer
-
- self.image_processor = ImageProcessor.from_pretrained(model_path) # Initialize image processor
- self.enable_processor_cache = enable_processor_cache
-
- # Convolution sizes for patch aggregation
- self.spatial_conv_size = self.image_processor.merge_size
- self.temporal_conv_size = self.image_processor.temporal_patch_size
-
- # Special tokens and IDs
- self.image_token = "<|image_pad|>"
- self.video_token = "<|video_pad|>"
-
- self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
- self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
-
- self.vision_start = "<|vision_start|>"
- self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start)
-
- self.tokens_per_second = tokens_per_second
-
- self.role_prefixes = {
- "system": "",
- "user": "User: ",
- "bot": "Assistant: ",
- "assistant": "Assistant: ",
- }
-
- @staticmethod
- def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
- """
- Calculate the number of tokens in the multimodal input.
- """
- if isinstance(grid_thw, paddle.Tensor):
- grid_thw = grid_thw.numpy()
-
- if len(grid_thw) == 0:
- return 0
-
- def calc_one(thw):
- t, h, w = map(int, thw)
- return t * h * w // 4
-
- if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
- return [calc_one(x) for x in grid_thw]
-
- return calc_one(grid_thw)
-
- def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
- """
- Convert text with image/video placeholders into model inputs.
-
- Args:
- text: Input text with <|image@placeholder|> and <|video@placeholder|> markers
- images: List of PIL Images corresponding to image placeholders
- videos: List of video data corresponding to video placeholders
- image_uuid: List of unique identifiers for each image, used for caching or hashing.
- video_uuid: List of unique identifiers for each video, used for caching or hashing.
-
- Returns:
- Dict containing:
- - input_ids: Token IDs
- - token_type_ids: Type identifiers (text/image/video)
- - position_ids: 3D positional embeddings
- - images: Preprocessed visual features
- - grid_thw: Spatial/temporal dimensions
- - image_type_ids: Visual content type (0=image, 1=video)
- """
-
- outputs = {
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "labels": [],
- "cur_position": 0,
- "video_cnt": 0,
- "num_input_image_tokens": 0,
- "num_input_video_tokens": 0,
- "fps": [],
- "mm_positions": [],
- "mm_hashes": [],
- }
-
- # Define placeholders and their lengths
- IMAGE_PLACEHOLDER = "<|image_pad|>"
- VIDEO_PLACEHOLDER = "<|video_pad|>"
- IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
- VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
-
- # Initialize tracking variables for text parsing
- st, image_idx, video_idx = 0, 0, 0 # Start position, image counter, video counter
- while st < len(text):
- # Find next image or video placeholder in text
- image_pos = text.find(IMAGE_PLACEHOLDER, st)
- image_pos = len(text) if image_pos == -1 else image_pos # Set to end if not found
- video_pos = text.find(VIDEO_PLACEHOLDER, st)
- video_pos = len(text) if video_pos == -1 else video_pos # Set to end if not found
- ed = min(image_pos, video_pos) # End position is first placeholder found
-
- self._add_text(text[st:ed], outputs)
- if ed == len(text):
- break
-
- if ed == image_pos:
- image = images[image_idx]
- uuid = image_uuid[image_idx] if image_uuid else None
- if not isinstance(image, tuple):
- self._add_image(image, outputs, uuid)
- else:
- self._add_processed_image(image, outputs, uuid)
- image_idx += 1
- st = ed + IMAGE_PLACEHOLDER_LEN
- else:
- item = videos[video_idx]
- uuid = video_uuid[video_idx] if video_uuid else None
- if not isinstance(item, tuple):
- if isinstance(item, dict):
- frames, meta = self._load_and_process_video(item["video"], item)
- else:
- frames, meta = self._load_and_process_video(item, {})
- self._add_video(frames, meta, outputs, uuid)
- else:
- # cached frames are already processed
- self._add_processed_video(item, outputs, uuid)
- video_idx += 1
- st = ed + VIDEO_PLACEHOLDER_LEN
-
- return outputs
-
- def prompt_token_ids2outputs(
- self, request: Request, tgts: List[str] = None
- ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
- outputs = {
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "labels": [],
- "cur_position": 0,
- "video_cnt": 0,
- "num_input_image_tokens": 0,
- "num_input_video_tokens": 0,
- "fps": [],
- "mm_positions": [],
- "mm_hashes": [],
- }
- prompt_token_ids = request.prompt_token_ids if request.prompt_token_ids else []
- prompt_token_ids_len = len(prompt_token_ids)
-
- if not request.messages:
- self._add_text(prompt_token_ids, outputs)
- return outputs
-
- messages = parse_chat_messages(request.messages)
- mm_items = []
- for msg in messages:
- role = msg.get("role")
- assert role in self.role_prefixes, f"Unsupported role: {role}"
-
- content = msg.get("content")
- if not isinstance(content, list):
- content = [content]
- for item in content:
- if item.get("type") in ["image", "video"]:
- mm_items.append(item)
-
- missing_hashes, missing_idx = [], []
- for idx, item in enumerate(mm_items):
- if not item.get("data"):
- missing_hashes.append(item.get("uuid"))
- missing_idx.append(idx)
-
- if len(missing_hashes) > 0 and not self.enable_processor_cache:
- raise ValueError("Missing items cannot be retrieved without processor cache.")
-
- dealer = None
- if self.enable_processor_cache:
- context = zmq.Context()
- dealer = context.socket(zmq.DEALER)
- dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
- missing_items = self.get_processor_cache(dealer, missing_hashes)
- for idx in range(len(missing_items)):
- if not missing_items[idx]:
- raise ValueError(f"Missing item {idx} not found in processor cache")
- mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
- st, mm_idx = 0, 0
- while st < prompt_token_ids_len:
- if prompt_token_ids[st] != self.image_token_id:
- cur_idx = st
- while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_token_id:
- cur_idx += 1
- self._add_text(prompt_token_ids[st:cur_idx], outputs)
- st = cur_idx
- continue
-
- if mm_idx >= len(mm_items):
- raise ValueError("prompt token ids has more multimodal placeholder than in messages")
-
- cur_idx = st
- while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] == self.image_token_id:
- cur_idx += 1
-
- item = mm_items[mm_idx]
- uuid = item.get("uuid")
- token_len = cur_idx - st
- if item.get("type") == "image":
- image = item.get("data")
- if not isinstance(image, tuple):
- self._add_image(image, outputs, uuid, token_len)
- else:
- self._add_processed_image(image, outputs, uuid, token_len)
- elif item.get("type") == "video":
- video = item.get("data")
- if not isinstance(video, tuple):
- if isinstance(video, dict):
- frames, meta = self._load_and_process_video(video["video"], video)
- else:
- frames, meta = self._load_and_process_video(video, {})
- self._add_video(frames, meta, outputs, uuid, token_len)
- else:
- self._add_processed_video(video, outputs, uuid, token_len)
- else:
- raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
- mm_idx += 1
- st = cur_idx
-
- if mm_idx != len(mm_items):
- raise ValueError("number of multimodal items does not match prompt token ids")
-
- if self.enable_processor_cache:
- missing_idx = set(missing_idx)
- hashes_to_cache, items_to_cache = [], []
- for idx in range(len(mm_items)):
- if idx in missing_idx:
- continue
- meta = {}
- grid_thw = np.asarray(outputs["grid_thw"][idx])
- if grid_thw.ndim > 1:
- t, h, w = grid_thw[0]
- else:
- t, h, w = grid_thw
- meta["thw"] = (int(t), int(h), int(w))
- meta["fps"] = outputs["fps"][idx]
- hashes_to_cache.append(outputs["mm_hashes"][idx])
- items_to_cache.append((outputs["images"][idx], meta))
- if hashes_to_cache:
- self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
- return outputs
-
- def request2ids(
- self, request: Request, tgts: List[str] = None
- ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
- """
- Convert chat request with multimodal messages into model inputs.
-
- Args:
- request: Request containing:
- - messages: List of chat messages with text/image/video content
- - request_id: Unique identifier for logging
- tgts: Optional target sequences
-
- Returns:
- Dict with same structure as text2ids() output
- """
-
- messages = parse_chat_messages(request.messages)
- mm_items = []
- for msg in messages:
- role = msg.get("role")
- assert role in self.role_prefixes, f"Unsupported role: {role}"
-
- content = msg.get("content")
- if not isinstance(content, list):
- content = [content]
- for item in content:
- if item.get("type") in ["image", "video"]:
- mm_items.append(item)
-
- missing_hashes, missing_idx = [], []
- for idx, item in enumerate(mm_items):
- if not item.get("data"):
- missing_hashes.append(item.get("uuid"))
- missing_idx.append(idx)
-
- if len(missing_hashes) > 0 and not self.enable_processor_cache:
- raise ValueError("Missing items cannot be retrieved without processor cache.")
-
- if self.enable_processor_cache:
- context = zmq.Context()
- dealer = context.socket(zmq.DEALER)
- dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
- missing_items = self.get_processor_cache(dealer, missing_hashes)
- for idx in range(len(missing_items)):
- if not missing_items[idx]:
- raise ValueError(f"Missing item {idx} not found in processor cache")
- mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
- images, videos = [], []
- image_uuid, video_uuid = [], []
- for item in mm_items:
- if item.get("type") == "image":
- images.append(item["data"])
- image_uuid.append(item["uuid"])
- elif item.get("type") == "video":
- videos.append(item["data"])
- video_uuid.append(item["uuid"])
- else:
- raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
-
- if self.tokenizer.chat_template is None:
- raise ValueError("This model does not support chat template.")
-
- chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
- prompt = self.tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True,
- **chat_template_kwargs,
- )
- request.prompt_tokens = prompt
-
- outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
-
- if self.enable_processor_cache:
- missing_idx = set(missing_idx)
- hashes_to_cache, items_to_cache = [], []
- for idx in range(len(mm_items)):
- if idx in missing_idx:
- continue
- meta = {}
- grid_thw = np.asarray(outputs["grid_thw"][idx])
- if grid_thw.ndim > 1:
- t, h, w = grid_thw[0]
- else:
- t, h, w = grid_thw
- meta["thw"] = (int(t), int(h), int(w))
- meta["fps"] = outputs["fps"][idx]
- hashes_to_cache.append(outputs["mm_hashes"][idx])
- items_to_cache.append((outputs["images"][idx], meta))
- if hashes_to_cache:
- self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
- return outputs
-
- def _add_text(self, tokens, outputs: Dict) -> None:
- """
- Add text tokens to model inputs dictionary.
-
- Args:
- tokens: Text string or already tokenized IDs
- outputs: Dictionary accumulating model inputs
-
- Note:
- - Handles both raw text and pre-tokenized inputs
- - Updates position IDs for 3D embeddings
- """
- if not tokens:
- return None
-
- if isinstance(tokens, str):
- tokens_str = self.tokenizer.tokenize(tokens)
- tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
-
- num_tokens = len(tokens)
- outputs["input_ids"].extend(tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
-
- pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray:
- """
- Generate 3D positional embeddings for text tokens.
-
- Args:
- start_pos: Starting position index
- num_tokens: Number of tokens to generate positions for
-
- Returns:
- numpy.ndarray: 3D position IDs shaped (3, num_tokens)
- """
- text_array = np.arange(num_tokens).reshape(1, -1)
- text_index = np.broadcast_to(text_array, (3, num_tokens))
- position = text_index + start_pos
- return position
-
- def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len: Optional[int] = None) -> None:
- """
- Add image data to model inputs dictionary.
-
- Args:
- img: PIL Image to process
- outputs: Dictionary accumulating model inputs
-
- Note:
- - Preprocesses image and calculates spatial dimensions
- - Adds image token IDs and type markers
- - Generates appropriate position embeddings
- """
- ret = self.image_processor.preprocess(images=[img.convert("RGB")])
- num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
- grid_thw = ret["grid_thw"].tolist()
- if token_len is not None and token_len != num_tokens:
- raise ValueError("image tokens num not match the size")
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_token_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
- outputs["num_input_image_tokens"] += int(num_tokens)
-
- outputs["images"].append(ret["pixel_values"])
- if not uuid:
- outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
- else:
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(grid_thw)
- outputs["image_type_ids"].append(0)
-
- t, h, w = grid_thw
- pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
-
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["fps"].append(0)
-
- def _add_processed_image(
- self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: Optional[int] = None
- ) -> None:
- img, meta = img_cache
- num_tokens = img.shape[0] // self.image_processor.merge_size**2
- if token_len is not None and token_len != num_tokens:
- raise ValueError("image tokens num not match the size")
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_token_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
- _, h, w = meta["thw"]
- pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["images"].append(img)
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(np.array([[1, h, w]]))
- outputs["image_type_ids"].append(0)
-
- outputs["fps"].append(0)
-
- def _add_video(
- self, frames, meta: Dict, outputs: Dict, uuid: Optional[str], token_len: Optional[int] = None
- ) -> None:
- """
- Add video data to model inputs dictionary.
-
- Args:
- frames: Video frames as numpy array
- meta: Video metadata containing fps/duration
- outputs: Dictionary accumulating model inputs
-
- Note:
- - Handles temporal dimension in position embeddings
- - Uses video-specific token IDs and type markers
- """
- ret = self.image_processor.preprocess(
- images=frames,
- min_pixels=VIDEO_MIN_PIXELS,
- max_pixels=VIDEO_MAX_PIXELS,
- )
-
- num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
- grid_thw = ret["grid_thw"].tolist()
- if token_len is not None and token_len != num_tokens:
- raise ValueError("video tokens num not match the size")
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- # Hack code. In order to adapt to the framework, only image_token can be passed
- # The correct way should be to use [self.video_token_id] * num_tokens
- outputs["input_ids"].extend([self.image_token_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
- outputs["num_input_video_tokens"] += int(num_tokens)
-
- outputs["images"].append(ret["pixel_values"])
- if not uuid:
- outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
- else:
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(grid_thw)
- outputs["image_type_ids"].extend([1] * grid_thw[0])
-
- fps = meta["fps"]
- second_per_grid_t = self.temporal_conv_size / fps
- t, h, w = grid_thw
- pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
-
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["fps"].append(fps)
-
- def _add_processed_video(
- self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: Optional[int] = None
- ) -> None:
- frames, meta = frames_cache
- num_tokens = frames.shape[0] // self.image_processor.merge_size**2
- if token_len is not None and token_len != num_tokens:
- raise ValueError("video tokens num not match the size")
-
- t, h, w = meta["thw"]
- outputs["images"].append(frames)
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(np.array([[t, h, w]]))
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_token_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
- outputs["image_type_ids"].extend([1] * t)
-
- fps = meta["fps"]
- second_per_grid_t = self.temporal_conv_size / fps
- pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["fps"].append(fps)
-
- def _compute_vision_positions(
- self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float
- ) -> np.ndarray:
- """
- Generate 3D position IDs for visual inputs.
-
- Args:
- start_pos: Base position in sequence
- t: Temporal patches (1 for images)
- h: Height in patches
- w: Width in patches
- second_per_grid_t: Time per temporal patch
-
- Returns:
- np.ndarray: Position IDs for [t,h,w] dimensions
- """
- h //= self.spatial_conv_size
- w //= self.spatial_conv_size
-
- tn = np.arange(t).reshape(-1, 1)
- tn = np.broadcast_to(tn, (t, h * w))
- tn = tn * int(second_per_grid_t) * self.tokens_per_second
- t_index = tn.flatten()
-
- hn = np.arange(h).reshape(1, -1, 1)
- h_index = np.broadcast_to(hn, (t, h, w)).flatten()
-
- wn = np.arange(w).reshape(1, 1, -1)
- w_index = np.broadcast_to(wn, (t, h, w)).flatten()
-
- position = np.stack([t_index, h_index, w_index]) + start_pos
- return position
-
- def _load_and_process_video(self, url: str, item: Dict) -> Tuple[np.ndarray, Dict]:
- """
- Load and preprocess video into frames.
-
- Args:
- url: Video file path or bytes
- item: Dictionary containing processing parameters
-
- Returns:
- tuple: (frames, metadata) where:
- - frames: Processed video frames as numpy array
- - metadata: Updated video metadata dictionary
- """
- reader, meta, _ = read_video_decord(url, save_to_disk=False)
-
- # Apply frame sampling if fps or target_frames specified
- fps = item.get("fps", self.fps)
- num_frames = item.get("target_frames", self.target_frames)
-
- frame_indices = list(range(meta["num_of_frame"]))
- if fps > 0 or num_frames > 0:
- # Get frame sampling constraints
- min_frames = item.get("min_frames", self.min_frames)
- max_frames = item.get("max_frames", self.max_frames)
-
- # Sample frames according to specifications
- frame_indices = sample_frames(
- frame_factor=self.frame_factor, # Ensure divisible by temporal patch size
- min_frames=min_frames,
- max_frames=max_frames,
- metadata=meta,
- fps=fps,
- num_frames=num_frames,
- )
-
- # Update metadata with new frame count and fps
- meta["num_of_frame"] = len(frame_indices)
- if fps is not None:
- meta["fps"] = fps # Use specified fps
- meta["duration"] = len(frame_indices) / fps
- else:
- meta["fps"] = len(frame_indices) / meta["duration"] # Calculate fps from sampled frames
-
- frames = []
- for idx in frame_indices:
- frame = reader[idx].asnumpy()
- image = Image.fromarray(frame, "RGB")
- frames.append(image)
- frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-
- return frames, meta
-
- def get_processor_cache(self, socket, mm_hashes: list[str]) -> list:
- """
- get cache correspond to given hash values
- """
- req = pickle.dumps(mm_hashes)
- socket.send_multipart([b"", req])
- _, resp = socket.recv_multipart()
- mm_items = pickle.loads(resp)
- data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
-
- return mm_items
-
- def update_processor_cache(self, socket, mm_hashes: list[str], mm_items):
- """
- update cache data
- """
- req = pickle.dumps((mm_hashes, mm_items))
- socket.send_multipart([b"", req])
- data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
diff --git a/fastdeploy/input/v1/qwen3_vl_processor/qwen3_vl_processor.py b/fastdeploy/input/v1/qwen3_vl_processor/qwen3_vl_processor.py
deleted file mode 100644
index c72de49bcc3..00000000000
--- a/fastdeploy/input/v1/qwen3_vl_processor/qwen3_vl_processor.py
+++ /dev/null
@@ -1,341 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import numpy as np
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.text_processor import DataProcessor as TextProcessor
-from fastdeploy.utils import data_processor_logger
-
-from .process import DataProcessor
-
-
-class Qwen3VLProcessor(TextProcessor):
- """
- Qwen Vision-Language processor for handling multimodal inputs.
-
- This processor extends TextProcessor to support:
- - Image and video processing
- - Multimodal feature extraction
- - Tokenization and position encoding
- - Request processing and model input generation
-
- Attributes:
- processor (DataProcessor): Underlying data processor instance
- tokenizer: Text tokenizer instance
- limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
- """
-
- def __init__(
- self,
- config,
- model_name_or_path,
- limit_mm_per_prompt=None,
- mm_processor_kwargs=None,
- reasoning_parser_obj=None,
- tool_parser_obj=None,
- enable_processor_cache=False,
- ):
- """
- Initialize QwenVLProcessor instance.
-
- Args:
- config: Model configuration object
- model_name_or_path (str): Pretrained model name or path
- limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
- mm_processor_kwargs (dict, optional): Multimodal processor arguments
- reasoning_parser_obj: Reasoning parser instance
- tool_parser_obj: Tool parser instance
- """
- super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)
-
- data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
- processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
- self.processor = DataProcessor(
- model_path=model_name_or_path,
- enable_processor_cache=enable_processor_cache,
- # tokens_per_second=config.vision_config.tokens_per_second,
- tokenizer=self.tokenizer,
- **processor_kwargs,
- )
- self.image_patch_id = self.processor.image_token_id
- self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
-
- def _parse_processor_kwargs(self, kwargs):
- """
- Parse and validate multimodal processor arguments.
-
- Args:
- kwargs (dict): Processor configuration arguments
-
- Returns:
- dict: Validated processor arguments
-
- Raises:
- ValueError: If arguments format is invalid
- """
- if not kwargs:
- return {}
-
- try:
- if not isinstance(kwargs, dict):
- raise ValueError("mm-processor-kwargs must be a dictionary")
-
- # Validate kwargs types against expected schema
- data_processor_logger.info(f"Processing kwargs: {kwargs}")
- expected_types = {
- "video_max_frames": int, # Maximum video frames parameter
- "video_min_frames": int, # Minimum video frames parameter
- }
-
- for key, value in kwargs.items():
- if key in expected_types and not isinstance(value, expected_types[key]):
- raise ValueError(
- f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
- )
-
- return kwargs
-
- except Exception as e:
- data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
- return {}
-
- def _parse_limits(self, limits):
- """
- Parse and validate multimodal input limits.
-
- Args:
- limits (dict): Input limits configuration
-
- Returns:
- dict: Validated limits with defaults
-
- Raises:
- ValueError: If limits format is invalid
- """
- DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}
-
- if not limits:
- return DEFAULT_LIMITS
-
- try:
- if not isinstance(limits, dict):
- raise ValueError("limit-mm-per-prompt must be a dictionary")
- data_processor_logger.info(f"_parse_limits:{limits}")
- return {**DEFAULT_LIMITS, **limits}
- except Exception as e:
- data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
- return DEFAULT_LIMITS
-
- def _check_mm_limits(self, item):
- """
- Validate multimodal inputs against configured limits.
-
- Args:
- item: Input request item to validate
-
- Raises:
- ValueError: If input exceeds configured limits
- """
- if isinstance(item, dict):
- # 请求包含prompt和multi_modal_data
- mm_data = item
- else:
- # 请求包含messages
- mm_data = {"image": [], "video": []}
-
- for message in item:
- if isinstance(message.get("content"), list):
- for part in message["content"]:
- if part.get("type") in ["image_url", "image"]:
- mm_data["image"].append(part)
- elif part.get("type") in ["video_url", "video"]:
- mm_data["video"].append(part)
-
- for modality, data in mm_data.items():
- if modality in self.limit_mm_per_prompt:
- limit = self.limit_mm_per_prompt[modality]
- if len(data) > limit:
- raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
-
- def process_request(self, request, max_model_len=None, **kwargs):
- """
- Process incoming request and generate model inputs.
-
- Args:
- request: Input request object
- max_model_len (int, optional): Maximum context length
- **kwargs: Additional processing parameters
-
- Returns:
- Request: Processed request with model inputs
- """
- task = request.to_dict()
- task["enable_thinking"] = kwargs.get("enable_thinking", False)
- self.process_request_dict(task, max_model_len)
- request = Request.from_dict(task)
- request = self._apply_default_parameters(request)
- return request
-
- def process_request_dict(self, request, max_model_len=None, **kwargs):
- """
- Process request dictionary into model inputs.
-
- Args:
- request Request: Input request dictionary
- max_model_len (int, optional): Maximum context length
-
- Returns:
- Request: Processed request with model inputs
-
- Raises:
- ValueError: If request format is invalid
- """
-
- request = self._apply_default_parameters(request)
- if not request.eos_token_ids:
- request.eos_token_ids = self.eos_token_ids
-
- stop_sequences = request.sampling_params.stop
- if stop_sequences:
- stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
- request.sampling_params.stop_token_ids = stop_seqs
- request.sampling_params.stop_seqs_len = stop_seqs_len
-
- bad_words = request.sampling_params.bad_words
- bad_words_token_ids = request.sampling_params.bad_words_token_ids
- if bad_words:
- bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
- request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
- if request.prompt_token_ids:
- messages = request.messages
- if messages:
- self._check_mm_limits(messages)
- if request.enable_thinking is None:
- request.enable_thinking = kwargs.get("enable_thinking", False)
- outputs = self.processor.prompt_token_ids2outputs(request)
-
- elif request.prompt:
- multimodal_data = request.multimodal_data
- if multimodal_data is None:
- multimodal_data = {}
- self._check_mm_limits(multimodal_data)
- images = multimodal_data.get("image", None)
- videos = multimodal_data.get("video", None)
- outputs = self.processor.text2ids(request.prompt, images, videos)
-
- elif request.messages:
- messages = request.messages
- self._check_mm_limits(messages)
- chat_template_kwargs = request.chat_template_kwargs
- if chat_template_kwargs:
- if isinstance(chat_template_kwargs, dict):
- for k, v in chat_template_kwargs.items():
- if getattr(request, k, None) is None:
- setattr(request, k, v)
- else:
- raise ValueError("Invalid input: chat_template_kwargs must be a dict")
- if request.enable_thinking is None:
- request.enable_thinking = kwargs.get("enable_thinking", False)
- outputs = self.processor.request2ids(request)
- delattr(request, "chat_template_kwargs")
- else:
- raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
- # Handle continuation of previous generation by appending existing tokens
- if request.completion_token_ids:
- self.append_completion_tokens(outputs, request.completion_token_ids)
-
- # qwen25_vl not support thinking
- request.enable_thinking = False
-
- outputs = self.pack_outputs(outputs)
-
- request.prompt_token_ids = (
- outputs["input_ids"].tolist()
- if not getattr(request, "prompt_token_ids", None)
- else request.prompt_token_ids
- )
- request.prompt_token_ids_len = len(request.prompt_token_ids)
- request.multimodal_inputs = outputs
-
- # Handle prompt truncation if exceeds model context length
- if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
- request.prompt_token_ids = request.prompt_token_ids[
- : max_model_len - 1
- ] # Leave space for at least 1 new token
-
- # Set default max_tokens if not specified
- max_tokens = max_model_len - len(request.prompt_token_ids)
- if getattr(request.sampling_params, "max_tokens", None) is None:
- request.sampling_params.max_tokens = max(1, max_tokens)
- else:
- request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
- data_processor_logger.info(f"Processed request {request}")
-
- return request
-
- def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
- """
- Append completion tokens to existing outputs.
-
- Args:
- outputs: Current model outputs
- completion_token_ids: completion tokens to append
- """
-
- num_tokens = len(completion_token_ids)
- multimodal_inputs["input_ids"].extend(completion_token_ids)
- multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
-
- pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
- multimodal_inputs["position_ids"].append(pos_ids)
- multimodal_inputs["cur_position"] += num_tokens
-
- def pack_outputs(self, outputs):
- """
- Prepare final output dictionary for model.
-
- Args:
- outputs: Intermediate processing outputs
-
- Returns:
- dict: Packed output dictionary with all required fields
- """
- if not outputs["images"]:
- outputs["images"] = None # No images case
- outputs["grid_thw"] = None # No spatial dimensions
- outputs["image_type_ids"] = None # No type IDs
- else:
- outputs["images"] = np.vstack(outputs["images"]) # Stack image features vertically
- outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) # Stack spatial dimensions
- outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) # Convert to numpy array
-
- # Convert all outputs to numpy arrays with appropriate types
- outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) # Token IDs as int64
- outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) # Type IDs as int64
- outputs["position_ids"] = np.concatenate(
- outputs["position_ids"], axis=1, dtype=np.int64
- ) # Concatenate position ID
-
- outputs["image_patch_id"] = self.processor.image_token_id
- outputs["video_patch_id"] = self.processor.video_token_id
- outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
-
- outputs["mm_num_token_func"] = self.processor.mm_num_tokens
-
- return outputs
diff --git a/fastdeploy/input/v1/qwen_vl_processor/__init__.py b/fastdeploy/input/v1/qwen_vl_processor/__init__.py
deleted file mode 100644
index c876cde7125..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from .process import DataProcessor
-from .qwen_vl_processor import QwenVLProcessor
-
-__all__ = [
- "DataProcessor",
- "QwenVLProcessor",
-]
diff --git a/fastdeploy/input/v1/qwen_vl_processor/image_processor.py b/fastdeploy/input/v1/qwen_vl_processor/image_processor.py
deleted file mode 100644
index b6a1db19bc5..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/image_processor.py
+++ /dev/null
@@ -1,442 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import math
-from typing import List, Optional, Union
-
-import numpy as np
-import paddle
-import PIL
-from paddleformers.transformers.feature_extraction_utils import BatchFeature
-from paddleformers.transformers.image_processing_utils import BaseImageProcessor
-from paddleformers.transformers.image_transforms import (
- normalize,
- rescale,
- resize,
- to_channel_dimension_format,
-)
-from paddleformers.transformers.image_utils import (
- ChannelDimension,
- ImageInput,
- PILImageResampling,
- get_image_size,
- infer_channel_dimension_format,
- make_list_of_images,
- to_numpy_array,
- valid_images,
-)
-from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
-from PIL import Image
-
-from fastdeploy.utils import data_processor_logger
-
-OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
-OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
-
-MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
-
-
-VideoInput = Union[
- List["PIL.Image.Image"],
- "np.ndarray",
- "paddle.Tensor",
- List["np.ndarray"],
- List["paddle.Tensor"],
- List[List["PIL.Image.Image"]],
- List[List["np.ndarray"]],
- List[List["paddle.Tensor"]],
-]
-
-
-def round_by_factor(number: int, factor: int) -> int:
- """
- Round number to nearest multiple of factor.
-
- Args:
- number: Input number to round
- factor: Rounding factor
-
- Returns:
- int: Rounded number
- """
- return round(number / factor) * factor
-
-
-def ceil_by_factor(number: int, factor: int) -> int:
- """
- Round number up to nearest multiple of factor.
-
- Args:
- number: Input number to round
- factor: Rounding factor
-
- Returns:
- int: Rounded number
- """
- return math.ceil(number / factor) * factor
-
-
-def floor_by_factor(number: int, factor: int) -> int:
- """
- Round number down to nearest multiple of factor.
-
- Args:
- number: Input number to round
- factor: Rounding factor
-
- Returns:
- int: Rounded number
- """
- return math.floor(number / factor) * factor
-
-
-def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
- """
- Smart image resizing that maintains aspect ratio and respects constraints.
-
- Args:
- height: Original image height
- width: Original image width
- factor: Patch size factor
- min_pixels: Minimum allowed pixels
- max_pixels: Maximum allowed pixels
- max_ratio: Maximum allowed aspect ratio
-
- Returns:
- tuple: (new_height, new_width)
-
- Raises:
- ValueError: If calculated dimensions are invalid
- """
- if max(height, width) / min(height, width) > max_ratio:
- if height > width:
- new_width = max(factor, round_by_factor(width, factor))
- new_height = floor_by_factor(new_width * max_ratio, factor)
- else:
- new_height = max(factor, round_by_factor(height, factor))
- new_width = floor_by_factor(new_height * max_ratio, factor)
-
- data_processor_logger.info(
- f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
- resize to {max(new_height, new_width) / min(new_height, new_width)}"
- )
-
- height = new_height
- width = new_width
-
- h_bar = max(factor, round_by_factor(height, factor))
- w_bar = max(factor, round_by_factor(width, factor))
- if h_bar * w_bar > max_pixels:
- beta = math.sqrt((height * width) / max_pixels)
- h_bar = floor_by_factor(height / beta, factor)
- w_bar = floor_by_factor(width / beta, factor)
- elif h_bar * w_bar < min_pixels:
- beta = math.sqrt(min_pixels / (height * width))
- h_bar = ceil_by_factor(height * beta, factor)
- w_bar = ceil_by_factor(width * beta, factor)
-
- if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
- raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
-
- return h_bar, w_bar
-
-
-def is_scaled_image(image: np.ndarray) -> bool:
- """
- Check if image pixel values are already normalized to [0, 1] range.
-
- Args:
- image: Input image array
-
- Returns:
- bool: True if image is already scaled
- """
- if image.dtype == np.uint8:
- return False
-
- # It's possible the image has pixel values in [0, 255] but is of floating type
- return np.min(image) >= 0 and np.max(image) <= 1
-
-
-class ImageProcessor(BaseImageProcessor):
- """
- Adaptive image processor for dynamic image resizing and preprocessing.
-
- This processor handles image resizing, rescaling, normalization and format conversion.
- It dynamically adjusts image dimensions based on original size and specified constraints.
- """
-
- def __init__(
- self,
- patch_size: int = 14,
- merge_size: int = 2,
- temporal_patch_size: int = 2,
- min_pixels: int = MIN_PIXELS,
- max_pixels: int = MAX_PIXELS,
- image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
- image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
- rescale_factor: float = 1 / 255,
- do_rescale: bool = True,
- do_normalize: bool = True,
- resample: PILImageResampling = PILImageResampling.BICUBIC,
- **kwargs,
- ) -> None:
- """
- Initialize image processor with configuration parameters.
-
- Args:
- patch_size (int): Spatial patch size for vision encoder
- merge_size (int): Merge size between vision and LLM encoders
- temporal_patch_size (int): Temporal patch size for video processing
- min_pixels (int): Minimum allowed pixels in resized image
- max_pixels (int): Maximum allowed pixels in resized image
- image_mean (float/list): Mean values for normalization per channel
- image_std (float/list): Std values for normalization per channel
- rescale_factor (float): Scaling factor for pixel values (default 1/255)
- do_rescale (bool): Whether to rescale images
- do_normalize (bool): Whether to normalize images
- resample: Resampling method for image resizing
- **kwargs: Additional base class arguments
- """
- super().__init__(**kwargs)
- self.patch_size = patch_size
- self.merge_size = merge_size
- self.temporal_patch_size = temporal_patch_size
-
- self.min_pixels = min_pixels
- self.max_pixels = max_pixels
-
- self.image_mean = image_mean
- self.image_std = image_std
- self.rescale_factor = rescale_factor
- self.do_rescale = do_rescale
- self.do_normalize = do_normalize
-
- self.resample = resample
-
- def _preprocess(
- self,
- images: Union[ImageInput, VideoInput],
- min_pixels: int,
- max_pixels: int,
- image_mean: Optional[Union[float, List[float]]],
- image_std: Optional[Union[float, List[float]]],
- rescale_factor: float,
- do_rescale: bool,
- do_normalize: bool,
- resample: PILImageResampling,
- data_format: Optional[ChannelDimension],
- input_data_format: Optional[Union[str, ChannelDimension]],
- ):
- """
- Internal method for image preprocessing pipeline.
-
- Args:
- images: Input image or batch of images
- min_pixels: Minimum allowed pixels in output
- max_pixels: Maximum allowed pixels in output
- image_mean: Normalization mean values
- image_std: Normalization std values
- rescale_factor: Pixel value scaling factor
- do_rescale: Whether to rescale pixel values
- do_normalize: Whether to normalize pixel values
- resample: Resampling method
- data_format: Output channel format
- input_data_format: Input channel format
-
- Returns:
- tuple: (flatten_patches, grid_dimensions)
- - flatten_patches: Flattened image patches
- - grid_dimensions: Grid dimensions [t, h, w]
- """
- images = make_list_of_images(images)
-
- # All transformations expect numpy arrays.
- images = [to_numpy_array(image) for image in images]
-
- if is_scaled_image(images[0]) and do_rescale:
- data_processor_logger.warning(
- "It looks like you are trying to rescale already rescaled images. If the input"
- " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
- )
- if input_data_format is None:
- # We assume that all images have the same channel dimension format.
- input_data_format = infer_channel_dimension_format(images[0])
-
- # Get original dimensions and calculate optimal resize dimensions
- height, width = get_image_size(images[0], channel_dim=input_data_format)
- resized_height, resized_width = smart_resize(
- height,
- width,
- factor=self.patch_size * self.merge_size, # Combine patch and merge factors
- min_pixels=min_pixels,
- max_pixels=max_pixels,
- )
-
- processed_images = []
- for image in images:
- if height != resized_height or width != resized_width:
- # Convert to uint8 before resizing to avoid double scaling
- image = image.astype("uint8")
- # Convert to PIL Image and resize
- image = Image.fromarray(image)
- image = resize(
- image,
- size=(resized_height, resized_width),
- resample=resample,
- data_format=input_data_format,
- )
-
- if do_rescale and do_normalize:
- # Adjust mean and std for combined rescale+normalize
- image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
- image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
- do_rescale = False # Skip separate rescale step
-
- if do_rescale:
- image = image.astype(np.float32)
- image = rescale(image, scale=rescale_factor, data_format=input_data_format)
-
- if do_normalize:
- image = image.astype(np.float32)
- image = normalize(
- image=image,
- mean=image_mean,
- std=image_std,
- data_format=input_data_format,
- )
-
- image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
- processed_images.append(image)
-
- # Convert processed images to numpy array
- patches = np.array(processed_images)
-
- # Pad temporal dimension if needed
- if patches.shape[0] % self.temporal_patch_size != 0:
- repeats = np.repeat(
- patches[-1][np.newaxis],
- self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
- axis=0,
- )
- patches = np.concatenate([patches, repeats], axis=0)
-
- # Convert to channels-first format if needed
- if data_format == ChannelDimension.LAST:
- patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W]
-
- grid_t, channel = patches.shape[:2]
- grid_t = grid_t // self.temporal_patch_size
-
- grid_h, grid_w = (
- resized_height // self.patch_size,
- resized_width // self.patch_size,
- )
- # Reshape into hierarchical patch structure
- patches = patches.reshape(
- [
- grid_t,
- self.temporal_patch_size,
- channel,
- grid_h // self.merge_size,
- self.merge_size,
- self.patch_size,
- grid_w // self.merge_size,
- self.merge_size,
- self.patch_size,
- ]
- )
- # Reorder dimensions for better memory access pattern
- # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
- patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
-
- flatten_patches = patches.reshape(
- [
- grid_t * grid_h * grid_w,
- channel * self.temporal_patch_size * self.patch_size * self.patch_size,
- ]
- )
-
- return flatten_patches, np.array([grid_t, grid_h, grid_w])
-
- def preprocess(
- self,
- images: Union[ImageInput, VideoInput],
- min_pixels: Optional[int] = None,
- max_pixels: Optional[int] = None,
- image_mean: Optional[Union[float, List[float]]] = None,
- image_std: Optional[Union[float, List[float]]] = None,
- rescale_factor: Optional[float] = None,
- do_rescale: Optional[bool] = None,
- do_normalize: Optional[bool] = None,
- resample: Optional[PILImageResampling] = None,
- return_tensors: Optional[Union[str, TensorType]] = None,
- data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
- input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
- ):
- """
- Main preprocessing method for images/videos.
-
- Args:
- images: Input image/video data
- min_pixels: Override for minimum pixels
- max_pixels: Override for maximum pixels
- image_mean: Override for normalization mean
- image_std: Override for normalization std
- rescale_factor: Override for rescaling factor
- do_rescale: Override for rescaling flag
- do_normalize: Override for normalization flag
- resample: Override for resampling method
- return_tensors: Desired output tensor format
- data_format: Output channel dimension format
- input_data_format: Input channel dimension format
-
- Returns:
- BatchFeature: Processed features containing:
- - pixel_values: Preprocessed pixel data
- - grid_thw: Grid dimensions [temporal, height, width]
-
- Raises:
- ValueError: For invalid image types or dimensions
- """
- min_pixels = min_pixels if min_pixels is not None else self.min_pixels
- max_pixels = max_pixels if max_pixels is not None else self.max_pixels
- image_mean = image_mean if image_mean is not None else self.image_mean
- image_std = image_std if image_std is not None else self.image_std
- rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
- do_rescale = do_rescale if do_rescale is not None else self.do_rescale
- do_normalize = do_normalize if do_normalize is not None else self.do_normalize
- resample = resample if resample is not None else self.resample
-
- if images is not None and not valid_images(images):
- raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
-
- pixel_values, grid_thw = self._preprocess(
- images,
- min_pixels=min_pixels,
- max_pixels=max_pixels,
- image_mean=image_mean,
- image_std=image_std,
- rescale_factor=rescale_factor,
- do_rescale=do_rescale,
- do_normalize=do_normalize,
- resample=resample,
- data_format=data_format,
- input_data_format=input_data_format,
- )
- data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
- return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/fastdeploy/input/v1/qwen_vl_processor/process.py b/fastdeploy/input/v1/qwen_vl_processor/process.py
deleted file mode 100644
index 3aaade025cf..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/process.py
+++ /dev/null
@@ -1,591 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import pickle
-from typing import Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import paddle
-import zmq
-from paddleformers.transformers import AutoTokenizer
-from PIL import Image
-
-from fastdeploy.engine.request import ImagePosition, Request
-from fastdeploy.entrypoints.chat_utils import parse_chat_messages
-from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
-from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.multimodal.hasher import MultimodalHasher
-from fastdeploy.utils import data_processor_logger
-
-from .image_processor import ImageProcessor
-from .process_video import sample_frames
-
-FRAME_FACTOR = 2
-FPS = 2.0
-FPS_MIN_FRAMES = 4
-FPS_MAX_FRAMES = 768
-
-
-class DataProcessor(MMBaseDataProcessor):
- """
- Processes multimodal inputs (text, images, videos) into model-ready formats.
-
- Handles:
- - Tokenization of text with special tokens for visual content
- - Image and video preprocessing
- - Generation of 3D positional embeddings
- - Conversion of chat messages to model inputs
-
- Attributes:
- tokenizer: Text tokenizer instance
- image_processor: Image/video preprocessor
- image_token: Special token for image placeholders
- video_token: Special token for video placeholders
- vision_start: Token marking start of visual content
- """
-
- def __init__(
- self,
- model_path: str,
- enable_processor_cache: bool = False,
- video_min_frames: int = FPS_MIN_FRAMES,
- video_max_frames: int = FPS_MAX_FRAMES,
- video_target_frames: int = -1,
- video_fps: int = FPS,
- tokens_per_second: int = 2,
- tokenizer=None,
- **kwargs,
- ) -> None:
- """
- Initialize the data processor.
-
- Args:
- model_path: Path to pretrained model
- video_min_frames: Minimum frames to sample from videos
- video_max_frames: Maximum frames to sample from videos
- tokens_per_second: Temporal resolution for positional embeddings
- **kwargs: Additional configuration
- """
- super().__init__()
- self.min_frames = video_min_frames
- self.max_frames = video_max_frames
- self.target_frames = video_target_frames
- self.fps = video_fps
- self.frame_factor = FRAME_FACTOR
-
- # Initialize tokenizer with left padding and fast tokenizer
- if tokenizer is None:
- self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True)
- self.tokenizer.ignored_index = -100 # Set ignored index for loss calculation
- else:
- self.tokenizer = tokenizer
- self.image_processor = ImageProcessor.from_pretrained(model_path) # Initialize image processor
- self.enable_processor_cache = enable_processor_cache
-
- # Convolution sizes for patch aggregation
- self.spatial_conv_size = self.image_processor.merge_size
- self.temporal_conv_size = self.image_processor.temporal_patch_size
-
- # Special tokens and IDs
- self.image_token = "<|image_pad|>"
- self.video_token = "<|video_pad|>"
-
- self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
- self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
-
- self.vision_start = "<|vision_start|>"
- self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start)
-
- self.tokens_per_second = tokens_per_second
-
- self.role_prefixes = {
- "system": "",
- "user": "User: ",
- "bot": "Assistant: ",
- "assistant": "Assistant: ",
- }
-
- @staticmethod
- def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
- """
- Calculate the number of tokens in the multimodal input.
- """
- if isinstance(grid_thw, paddle.Tensor):
- grid_thw = grid_thw.numpy()
-
- if len(grid_thw) == 0:
- return 0
-
- def calc_one(thw):
- t, h, w = map(int, thw)
- return t * h * w // 4
-
- if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
- return [calc_one(x) for x in grid_thw]
-
- return calc_one(grid_thw)
-
- def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
- """
- Convert text with image/video placeholders into model inputs.
-
- Args:
- text: Input text with <|image@placeholder|> and <|video@placeholder|> markers
- images: List of PIL Images corresponding to image placeholders
- videos: List of video data corresponding to video placeholders
- image_uuid: List of unique identifiers for each image, used for caching or hashing.
- video_uuid: List of unique identifiers for each video, used for caching or hashing.
-
- Returns:
- Dict containing:
- - input_ids: Token IDs
- - token_type_ids: Type identifiers (text/image/video)
- - position_ids: 3D positional embeddings
- - images: Preprocessed visual features
- - grid_thw: Spatial/temporal dimensions
- - image_type_ids: Visual content type (0=image, 1=video)
- """
-
- outputs = {
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "labels": [],
- "cur_position": 0,
- "video_cnt": 0,
- "num_input_image_tokens": 0,
- "num_input_video_tokens": 0,
- "fps": [],
- "mm_positions": [],
- "mm_hashes": [],
- }
-
- # Define placeholders and their lengths
- IMAGE_PLACEHOLDER = "<|image_pad|>"
- VIDEO_PLACEHOLDER = "<|video_pad|>"
- IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
- VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
-
- # Initialize tracking variables for text parsing
- st, image_idx, video_idx = 0, 0, 0 # Start position, image counter, video counter
- while st < len(text):
- # Find next image or video placeholder in text
- image_pos = text.find(IMAGE_PLACEHOLDER, st)
- image_pos = len(text) if image_pos == -1 else image_pos # Set to end if not found
- video_pos = text.find(VIDEO_PLACEHOLDER, st)
- video_pos = len(text) if video_pos == -1 else video_pos # Set to end if not found
- ed = min(image_pos, video_pos) # End position is first placeholder found
-
- self._add_text(text[st:ed], outputs)
- if ed == len(text):
- break
-
- if ed == image_pos:
- image = images[image_idx]
- uuid = image_uuid[image_idx] if image_uuid else None
- if not isinstance(image, tuple):
- self._add_image(image, outputs, uuid)
- else:
- self._add_processed_image(image, outputs, uuid)
- image_idx += 1
- st = ed + IMAGE_PLACEHOLDER_LEN
- else:
- item = videos[video_idx]
- uuid = video_uuid[video_idx] if video_uuid else None
- if not isinstance(item, tuple):
- if isinstance(item, dict):
- frames, meta = self._load_and_process_video(item["video"], item)
- else:
- frames, meta = self._load_and_process_video(item, {})
- self._add_video(frames, meta, outputs, uuid)
- else:
- # cached frames are already processed
- self._add_processed_video(item, outputs, uuid)
- video_idx += 1
- st = ed + VIDEO_PLACEHOLDER_LEN
-
- return outputs
-
- def request2ids(
- self, request: Request, tgts: List[str] = None
- ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
- """
- Convert chat request with multimodal messages into model inputs.
-
- Args:
- request: Dictionary containing:
- - messages: List of chat messages with text/image/video content
- - request_id: Unique identifier for logging
- tgts: Optional target sequences
-
- Returns:
- Dict with same structure as text2ids() output
- """
-
- # Parse and validate chat messages
- messages = parse_chat_messages(request.messages)
- mm_items = []
- for msg in messages:
- role = msg.get("role")
- assert role in self.role_prefixes, f"Unsupported role: {role}"
-
- # Normalize content to list format
- content = msg.get("content")
- if not isinstance(content, list):
- content = [content]
- # Collect all visual content items
- for item in content:
- if item.get("type") in ["image", "video"]:
- mm_items.append(item)
-
- missing_hashes, missing_idx = [], []
- for idx, item in enumerate(mm_items):
- if not item.get("data"):
- # raw data not provided, should be retrieved from processor cache
- missing_hashes.append(item.get("uuid"))
- missing_idx.append(idx)
-
- if len(missing_hashes) > 0 and not self.enable_processor_cache:
- raise ValueError("Missing items cannot be retrieved without processor cache.")
-
- if self.enable_processor_cache:
- context = zmq.Context()
- dealer = context.socket(zmq.DEALER)
- dealer.connect("ipc:///dev/shm/processor_cache.ipc")
-
- missing_items = self.get_processor_cache(dealer, missing_hashes)
- for idx in range(len(missing_items)):
- if not missing_items[idx]:
- raise ValueError(f"Missing item {idx} not found in processor cache")
- mm_items[missing_idx[idx]]["data"] = missing_items[idx]
-
- images, videos = [], []
- image_uuid, video_uuid = [], []
- for item in mm_items:
- if item.get("type") == "image":
- images.append(item["data"])
- image_uuid.append(item["uuid"])
- elif item.get("type") == "video":
- videos.append(item["data"])
- video_uuid.append(item["uuid"])
- else:
- raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
-
- if self.tokenizer.chat_template is None:
- raise ValueError("This model does not support chat template.")
-
- chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
- prompt = self.tokenizer.apply_chat_template(
- messages,
- tokenize=False,
- add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True,
- **chat_template_kwargs,
- )
- request.prompt_tokens = prompt
-
- outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
-
- if self.enable_processor_cache:
- missing_idx = set(missing_idx)
- hashes_to_cache, items_to_cache = [], []
- for idx in range(len(mm_items)):
- if idx in missing_idx:
- continue
- meta = {}
- t, h, w = outputs["grid_thw"][idx]
- meta["thw"] = (t, h, w)
- meta["fps"] = outputs["fps"][idx]
- hashes_to_cache.append(outputs["mm_hashes"][idx])
- items_to_cache.append((outputs["images"][idx], meta))
- self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
-
- return outputs
-
- def _add_text(self, tokens, outputs: Dict) -> None:
- """
- Add text tokens to model inputs dictionary.
-
- Args:
- tokens: Text string or already tokenized IDs
- outputs: Dictionary accumulating model inputs
-
- Note:
- - Handles both raw text and pre-tokenized inputs
- - Updates position IDs for 3D embeddings
- """
- if not tokens:
- return None
-
- if isinstance(tokens, str):
- tokens_str = self.tokenizer.tokenize(tokens)
- tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
-
- num_tokens = len(tokens)
- outputs["input_ids"].extend(tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
-
- pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray:
- """
- Generate 3D positional embeddings for text tokens.
-
- Args:
- start_pos: Starting position index
- num_tokens: Number of tokens to generate positions for
-
- Returns:
- numpy.ndarray: 3D position IDs shaped (3, num_tokens)
- """
- text_array = np.arange(num_tokens).reshape(1, -1)
- text_index = np.broadcast_to(text_array, (3, num_tokens))
- position = text_index + start_pos
- return position
-
- def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None:
- """
- Add image data to model inputs dictionary.
-
- Args:
- img: PIL Image to process
- outputs: Dictionary accumulating model inputs
-
- Note:
- - Preprocesses image and calculates spatial dimensions
- - Adds image token IDs and type markers
- - Generates appropriate position embeddings
- """
- ret = self.image_processor.preprocess(images=[img.convert("RGB")])
- num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
- grid_thw = ret["grid_thw"].tolist()
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_token_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
- outputs["num_input_image_tokens"] += int(num_tokens)
-
- outputs["images"].append(ret["pixel_values"])
- if not uuid:
- outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
- else:
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(grid_thw)
- outputs["image_type_ids"].append(0)
-
- t, h, w = grid_thw
- pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
-
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["fps"].append(0)
-
- def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
- img, meta = img_cache
- num_tokens = img.shape[0] // self.image_processor.merge_size**2
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
-
- _, h, w = meta["thw"]
- pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["images"].append(img)
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(np.array([[1, h, w]]))
- outputs["image_type_ids"].append(0)
-
- outputs["fps"].append(0)
-
- def _add_video(self, frames, meta: Dict, outputs: Dict, uuid: Optional[str]) -> None:
- """
- Add video data to model inputs dictionary.
-
- Args:
- frames: Video frames as numpy array
- meta: Video metadata containing fps/duration
- outputs: Dictionary accumulating model inputs
-
- Note:
- - Handles temporal dimension in position embeddings
- - Uses video-specific token IDs and type markers
- """
- ret = self.image_processor.preprocess(images=frames)
-
- num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
- grid_thw = ret["grid_thw"].tolist()
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- # Hack code. In order to adapt to the framework, only image_token can be passed
- # The correct way should be to use [self.video_token_id] * num_tokens
- outputs["input_ids"].extend([self.image_token_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
- outputs["num_input_video_tokens"] += int(num_tokens)
-
- outputs["images"].append(ret["pixel_values"])
- if not uuid:
- outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
- else:
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(grid_thw)
- outputs["image_type_ids"].extend([1] * grid_thw[0])
-
- fps = meta["fps"]
- second_per_grid_t = self.temporal_conv_size / fps
- t, h, w = grid_thw
- pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
-
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["fps"].append(fps)
-
- def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None:
- frames, meta = frames_cache
- num_tokens = frames.shape[0] // self.image_processor.merge_size**2
-
- t, h, w = meta["thw"]
- outputs["images"].append(frames)
- outputs["mm_hashes"].append(uuid)
- outputs["grid_thw"].append(np.array([[t, h, w]]))
-
- outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
- outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
- outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
- outputs["image_type_ids"].extend([1] * t)
-
- fps = meta["fps"]
- second_per_grid_t = self.temporal_conv_size / fps
- pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
- outputs["position_ids"].append(pos_ids)
- outputs["cur_position"] = pos_ids.max() + 1
-
- outputs["fps"].append(fps)
-
- def _compute_vision_positions(
- self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float
- ) -> np.ndarray:
- """
- Generate 3D position IDs for visual inputs.
-
- Args:
- start_pos: Base position in sequence
- t: Temporal patches (1 for images)
- h: Height in patches
- w: Width in patches
- second_per_grid_t: Time per temporal patch
-
- Returns:
- np.ndarray: Position IDs for [t,h,w] dimensions
- """
- h //= self.spatial_conv_size
- w //= self.spatial_conv_size
-
- tn = np.arange(t).reshape(-1, 1)
- tn = np.broadcast_to(tn, (t, h * w))
- tn = tn * int(second_per_grid_t) * self.tokens_per_second
- t_index = tn.flatten()
-
- hn = np.arange(h).reshape(1, -1, 1)
- h_index = np.broadcast_to(hn, (t, h, w)).flatten()
-
- wn = np.arange(w).reshape(1, 1, -1)
- w_index = np.broadcast_to(wn, (t, h, w)).flatten()
-
- position = np.stack([t_index, h_index, w_index]) + start_pos
- return position
-
- def _load_and_process_video(self, url: str, item: Dict) -> Tuple[np.ndarray, Dict]:
- """
- Load and preprocess video into frames.
-
- Args:
- url: Video file path or bytes
- item: Dictionary containing processing parameters
-
- Returns:
- tuple: (frames, metadata) where:
- - frames: Processed video frames as numpy array
- - metadata: Updated video metadata dictionary
- """
- reader, meta, _ = read_video_decord(url, save_to_disk=False)
-
- # Apply frame sampling if fps or target_frames specified
- fps = item.get("fps", self.fps)
- num_frames = item.get("target_frames", self.target_frames)
-
- frame_indices = list(range(meta["num_of_frame"]))
- if fps > 0 or num_frames > 0:
- # Get frame sampling constraints
- min_frames = item.get("min_frames", self.min_frames)
- max_frames = item.get("max_frames", self.max_frames)
-
- # Sample frames according to specifications
- frame_indices = sample_frames(
- frame_factor=self.frame_factor, # Ensure divisible by temporal patch size
- min_frames=min_frames,
- max_frames=max_frames,
- metadata=meta,
- fps=fps,
- num_frames=num_frames,
- )
-
- # Update metadata with new frame count and fps
- meta["num_of_frame"] = len(frame_indices)
- if fps is not None:
- meta["fps"] = fps # Use specified fps
- meta["duration"] = len(frame_indices) / fps
- else:
- meta["fps"] = len(frame_indices) / meta["duration"] # Calculate fps from sampled frames
-
- frames = []
- for idx in frame_indices:
- frame = reader[idx].asnumpy()
- image = Image.fromarray(frame, "RGB")
- frames.append(image)
- frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
-
- return frames, meta
-
- def get_processor_cache(self, socket, mm_hashes: list[str]) -> list:
- """
- get cache correspond to given hash values
- """
- req = pickle.dumps(mm_hashes)
- socket.send_multipart([b"", req])
- _, resp = socket.recv_multipart()
- mm_items = pickle.loads(resp)
- data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
-
- return mm_items
-
- def update_processor_cache(self, socket, mm_hashes: list[str], mm_items):
- """
- update cache data
- """
- req = pickle.dumps((mm_hashes, mm_items))
- socket.send_multipart([b"", req])
- data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
diff --git a/fastdeploy/input/v1/qwen_vl_processor/process_video.py b/fastdeploy/input/v1/qwen_vl_processor/process_video.py
deleted file mode 100644
index 891f272033b..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/process_video.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from typing import Optional, Union
-
-import numpy as np
-
-from fastdeploy.utils import data_processor_logger
-
-from .image_processor import ceil_by_factor, floor_by_factor
-
-
-def sample_frames(
- frame_factor: int,
- min_frames: int,
- max_frames: int,
- metadata: Optional[dict] = None,
- fps: Optional[Union[int, float]] = -1,
- num_frames: Optional[int] = -1,
-):
- """
- Sample frames from video according to specified criteria.
-
- Args:
- frame_factor: Ensure sampled frames are multiples of this factor
- min_frames: Minimum number of frames to sample
- max_frames: Maximum number of frames to sample
- metadata: Video metadata containing fps information
- fps: Target frames per second for sampling
- num_frames: Exact number of frames to sample
-
- Returns:
- np.ndarray: Sampled video frames
-
- Raises:
- ValueError: If both fps and num_frames are specified,
- or if required metadata is missing,
- or if requested frames exceed available frames
- """
- if fps > 0 and num_frames > 0:
- raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
-
- total_num_frames = metadata["num_of_frame"]
-
- # If num_frames is not given but fps is, calculate num_frames from fps
- if num_frames > 0:
- num_frames = round(num_frames / frame_factor) * frame_factor
- elif fps > 0:
- if metadata is None:
- raise ValueError(
- "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
- "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
- )
- # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
- min_frames = ceil_by_factor(min_frames, frame_factor)
- max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
-
- num_frames = total_num_frames / metadata["fps"] * fps
-
- if num_frames > total_num_frames:
- data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
-
- num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
- num_frames = floor_by_factor(num_frames, frame_factor)
-
- if num_frames > total_num_frames:
- raise ValueError(
- f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
- "Decrease `num_frames` or `fps` for sampling."
- )
-
- # Hack code ensures that num_frames can always be divided by 4
- # due to sched/resource_manager_v1.py 中 grid_thw.extend([[2, h, w]] * (t // 2))
- if num_frames > 2 and num_frames % 4 != 0:
- num_frames = (num_frames // 4) * 4 # 向下取整到 4 的倍数
- total_num_frames = (total_num_frames // 4) * 4
- num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
-
- # Calculate frame indices based on sampling strategy
- if num_frames > 0:
- # Evenly spaced sampling for target frame count
- indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
- else:
- # Keep all frames if no sampling requested
- indices = np.arange(0, total_num_frames).astype(np.int32)
-
- return indices
diff --git a/fastdeploy/input/v1/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/v1/qwen_vl_processor/qwen_vl_processor.py
deleted file mode 100644
index e0d846d53a2..00000000000
--- a/fastdeploy/input/v1/qwen_vl_processor/qwen_vl_processor.py
+++ /dev/null
@@ -1,338 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import numpy as np
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.utils import process_stop_token_ids
-from fastdeploy.input.v1.text_processor import DataProcessor as TextProcessor
-from fastdeploy.utils import data_processor_logger
-
-from .process import DataProcessor
-
-
-class QwenVLProcessor(TextProcessor):
- """
- Qwen Vision-Language processor for handling multimodal inputs.
-
- This processor extends TextProcessor to support:
- - Image and video processing
- - Multimodal feature extraction
- - Tokenization and position encoding
- - Request processing and model input generation
-
- Attributes:
- processor (DataProcessor): Underlying data processor instance
- tokenizer: Text tokenizer instance
- limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt
- """
-
- def __init__(
- self,
- config,
- model_name_or_path,
- limit_mm_per_prompt=None,
- mm_processor_kwargs=None,
- reasoning_parser_obj=None,
- tool_parser_obj=None,
- enable_processor_cache=False,
- ):
- """
- Initialize QwenVLProcessor instance.
-
- Args:
- config: Model configuration object
- model_name_or_path (str): Pretrained model name or path
- limit_mm_per_prompt (dict, optional): Limits for multimodal inputs
- mm_processor_kwargs (dict, optional): Multimodal processor arguments
- reasoning_parser_obj: Reasoning parser instance
- tool_parser_obj: Tool parser instance
- """
- super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj)
-
- data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
- processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
- self.processor = DataProcessor(
- model_path=model_name_or_path,
- enable_processor_cache=enable_processor_cache,
- tokens_per_second=config.vision_config.tokens_per_second,
- tokenizer=self.tokenizer,
- **processor_kwargs,
- )
- self.image_patch_id = self.processor.image_token_id
- self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
-
- def _parse_processor_kwargs(self, kwargs):
- """
- Parse and validate multimodal processor arguments.
-
- Args:
- kwargs (dict): Processor configuration arguments
-
- Returns:
- dict: Validated processor arguments
-
- Raises:
- ValueError: If arguments format is invalid
- """
- if not kwargs:
- return {}
-
- try:
- if not isinstance(kwargs, dict):
- raise ValueError("mm-processor-kwargs must be a dictionary")
-
- # Validate kwargs types against expected schema
- data_processor_logger.info(f"Processing kwargs: {kwargs}")
- expected_types = {
- "video_max_frames": int, # Maximum video frames parameter
- "video_min_frames": int, # Minimum video frames parameter
- }
-
- for key, value in kwargs.items():
- if key in expected_types and not isinstance(value, expected_types[key]):
- raise ValueError(
- f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}"
- )
-
- return kwargs
-
- except Exception as e:
- data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
- return {}
-
- def _parse_limits(self, limits):
- """
- Parse and validate multimodal input limits.
-
- Args:
- limits (dict): Input limits configuration
-
- Returns:
- dict: Validated limits with defaults
-
- Raises:
- ValueError: If limits format is invalid
- """
- DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1}
-
- if not limits:
- return DEFAULT_LIMITS
-
- try:
- if not isinstance(limits, dict):
- raise ValueError("limit-mm-per-prompt must be a dictionary")
- data_processor_logger.info(f"_parse_limits:{limits}")
- return {**DEFAULT_LIMITS, **limits}
- except Exception as e:
- data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
- return DEFAULT_LIMITS
-
- def _check_mm_limits(self, item):
- """
- Validate multimodal inputs against configured limits.
-
- Args:
- item: Input request item to validate
-
- Raises:
- ValueError: If input exceeds configured limits
- """
- if isinstance(item, dict):
- # 请求包含prompt和multi_modal_data
- mm_data = item
- else:
- # 请求包含messages
- mm_data = {"image": [], "video": []}
-
- for message in item:
- if isinstance(message.get("content"), list):
- for part in message["content"]:
- if part.get("type") in ["image_url", "image"]:
- mm_data["image"].append(part)
- elif part.get("type") in ["video_url", "video"]:
- mm_data["video"].append(part)
-
- for modality, data in mm_data.items():
- if modality in self.limit_mm_per_prompt:
- limit = self.limit_mm_per_prompt[modality]
- if len(data) > limit:
- raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
-
- def process_request(self, request, max_model_len=None, **kwargs):
- """
- Process incoming request and generate model inputs.
-
- Args:
- request: Input request object
- max_model_len (int, optional): Maximum context length
- **kwargs: Additional processing parameters
-
- Returns:
- Request: Processed request with model inputs
- """
- task = request.to_dict()
- task["enable_thinking"] = kwargs.get("enable_thinking", False)
- self.process_request_dict(task, max_model_len)
- request = Request.from_dict(task)
- request = self._apply_default_parameters(request)
- return request
-
- def process_request_dict(self, request, max_model_len=None, **kwargs):
- """
- Process request dictionary into model inputs.
-
- Args:
- request (dict): Input request dictionary
- max_model_len (int, optional): Maximum context length
-
- Returns:
- dict: Processed request with model inputs
-
- Raises:
- ValueError: If request format is invalid
- """
-
- request = self._apply_default_parameters(request)
- if not request.eos_token_ids:
- request.eos_token_ids = self.eos_token_ids
-
- # processing stop_sequences and stop_token_ids
- process_stop_token_ids(request, self.update_stop_seq)
-
- bad_words = request.sampling_params.bad_words
- bad_words_token_ids = request.sampling_params.bad_words_token_ids
- if bad_words:
- bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
- request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
- if request.prompt:
- multimodal_data = request.multimodal_data
- if multimodal_data is None:
- multimodal_data = {}
- self._check_mm_limits(multimodal_data)
- images = multimodal_data.get("image", None)
- videos = multimodal_data.get("video", None)
- outputs = self.processor.text2ids(request.prompt, images, videos)
-
- elif request.messages:
- messages = request.messages
- self._check_mm_limits(messages)
- chat_template_kwargs = request.chat_template_kwargs
- if chat_template_kwargs:
- if isinstance(chat_template_kwargs, dict):
- for k, v in chat_template_kwargs.items():
- if getattr(request, k, v):
- setattr(request, k, v)
- else:
- raise ValueError("Invalid input: chat_template_kwargs must be a dict")
- if getattr(request, "enable_thinking") is None:
- setattr(request, "enable_thinking", True)
- outputs = self.processor.request2ids(request)
- delattr(request, "chat_template_kwargs")
- else:
- raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
-
- # Handle continuation of previous generation by appending existing tokens
- if request.completion_token_ids:
- self.append_completion_tokens(outputs, request.completion_token_ids)
-
- # qwen25_vl not support thinking
- request.enable_thinking = False
-
- outputs = self.pack_outputs(outputs)
-
- request.prompt_token_ids = outputs["input_ids"].tolist()
- request.prompt_token_ids_len = len(request.prompt_token_ids)
- request.multimodal_inputs = outputs
-
- # Handle prompt truncation if exceeds model context length
- if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
- request.prompt_token_ids = request.prompt_token_ids[
- : max_model_len - 1
- ] # Leave space for at least 1 new token
-
- # Set default max_tokens if not specified
- max_tokens = max_model_len - len(request.prompt_token_ids)
- if getattr(request.sampling_params, "max_tokens", None) is None:
- request.sampling_params.max_tokens = max(1, max_tokens)
- else:
- request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
- if self.reasoning_parser:
- model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
- parts = request.request_id.split("_")
- if len(parts) > 1:
- real_req_id = parts[0]
- index = int(parts[1])
- n = request.sampling_params.n or 1
- for idx in range(index * n, (index + 1) * n):
- self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
- else:
- self.model_status_dict[request.request_id] = model_status
- request.enable_thinking = model_status == "think_start"
- data_processor_logger.info(f"Processed request {request}")
-
- return request
-
- def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
- """
- Append completion tokens to existing outputs.
-
- Args:
- outputs: Current model outputs
- completion_token_ids: completion tokens to append
- """
-
- num_tokens = len(completion_token_ids)
- multimodal_inputs["input_ids"].extend(completion_token_ids)
- multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
-
- pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
- multimodal_inputs["position_ids"].append(pos_ids)
- multimodal_inputs["cur_position"] += num_tokens
-
- def pack_outputs(self, outputs):
- """
- Prepare final output dictionary for model.
-
- Args:
- outputs: Intermediate processing outputs
-
- Returns:
- dict: Packed output dictionary with all required fields
- """
- if not outputs["images"]:
- outputs["images"] = None # No images case
- outputs["grid_thw"] = None # No spatial dimensions
- outputs["image_type_ids"] = None # No type IDs
- else:
- outputs["images"] = np.vstack(outputs["images"]) # Stack image features vertically
- outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) # Stack spatial dimensions
- outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) # Convert to numpy array
-
- # Convert all outputs to numpy arrays with appropriate types
- outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) # Token IDs as int64
- outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) # Type IDs as int64
- outputs["position_ids"] = np.concatenate(
- outputs["position_ids"], axis=1, dtype=np.int64
- ) # Concatenate position ID
-
- outputs["image_patch_id"] = self.processor.image_token_id
- outputs["video_patch_id"] = self.processor.video_token_id
- outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
-
- outputs["mm_num_token_func"] = self.processor.mm_num_tokens
- return outputs
diff --git a/fastdeploy/input/v1/text_processor.py b/fastdeploy/input/v1/text_processor.py
deleted file mode 100644
index f83a0e0f12e..00000000000
--- a/fastdeploy/input/v1/text_processor.py
+++ /dev/null
@@ -1,925 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from abc import ABC, abstractmethod
-from collections import OrderedDict
-
-import numpy as np
-from paddleformers.generation import GenerationConfig
-from paddleformers.transformers import Llama3Tokenizer, LlamaTokenizer
-
-from fastdeploy import envs
-from fastdeploy.input.utils import process_stop_token_ids
-from fastdeploy.utils import data_processor_logger
-
-_SAMPLING_EPS = 1e-5
-
-
-class BaseDataProcessor(ABC):
- """base class for data processor"""
-
- def __init__(self):
- """
- Returns:
- None
- """
- self.tokenizer = self._load_tokenizer()
- self.tokenizer.bos_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.bos_token)
- self.tokenizer.cls_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.cls_token)
- self.tokenizer.sep_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.sep_token)
- self.tokenizer.eos_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.eos_token)
- self.tokenizer.mask_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.mask_token)
- data_processor_logger.info(
- (
- f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, ",
- f"cls_token is {self.tokenizer.cls_token}, {self.tokenizer.cls_token_id}, "
- f"sep_token is {self.tokenizer.sep_token}, {self.tokenizer.sep_token_id}, "
- f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}, "
- f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}",
- )
- )
- self._tokenize_cache = OrderedDict()
- self._tokenize_cache_capacity = 128
-
- def _apply_default_parameters(self, request):
- """
- Apply default value for parameters in request
- """
-
- def set_value(req, key, value):
- value = getattr(self.generation_config, key, value)
- if getattr(req.sampling_params, key) is None:
- setattr(req.sampling_params, key, value)
-
- set_value(request, "top_p", 0.7)
- set_value(request, "temperature", 1.0)
- set_value(request, "repetition_penalty", 1.0)
- set_value(request, "frequency_penalty", 0.0)
- set_value(request, "presence_penalty", 0.0)
- return request
-
- @abstractmethod
- def process_request_dict(self, request, **kwargs):
- """
- Preprocess the request
-
- Args:
- request Request: may contain text and messages fields
- **kwargs: others
-
- Returns:
- bool: Whether preprocessing is successful
- str: error message
- """
- raise NotImplementedError
-
- @abstractmethod
- def process_response_dict(self, response_obj):
- """
- Preprocess the response
-
- Args:
- response_obj RequestOutput: response for engine, contain ids fields
-
- Returns:
- RequestOutput: response contain text fields
- """
- raise NotImplementedError
-
- def text2ids(self, text, max_model_len=None):
- """
- text to token ids
-
- Args:
- text (str): text
-
- Returns:
- List[int]: token ids list
- """
- raise NotImplementedError
-
- def encode_with_cache(self, text, max_model_len=None, add_special_tokens=False):
- """
- Encode text into token ids with a small LRU cache.
- """
- if not hasattr(self, "_tokenize_cache"):
- self._tokenize_cache = OrderedDict()
- self._tokenize_cache_capacity = getattr(self, "_tokenize_cache_capacity", 128)
- key = (text, bool(add_special_tokens))
- cached = self._tokenize_cache.get(key)
- if cached is not None:
- self._tokenize_cache.move_to_end(key)
- return cached
- token_ids = self.text2ids(text, max_model_len, add_special_tokens=add_special_tokens)
- if hasattr(token_ids, "tolist"):
- token_ids = token_ids.tolist()
- elif not isinstance(token_ids, list):
- token_ids = list(token_ids)
- self._tokenize_cache[key] = token_ids
- if len(self._tokenize_cache) > self._tokenize_cache_capacity:
- self._tokenize_cache.popitem(last=False)
- return token_ids
-
- def _encode_literal_text_with_cache(self, text):
- if not hasattr(self, "_tokenize_cache"):
- self._tokenize_cache = OrderedDict()
- self._tokenize_cache_capacity = getattr(self, "_tokenize_cache_capacity", 128)
- key = ("literal_text", text)
- cached = self._tokenize_cache.get(key)
- if cached is not None:
- self._tokenize_cache.move_to_end(key)
- return cached
- token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
- if hasattr(token_ids, "tolist"):
- token_ids = token_ids.tolist()
- elif not isinstance(token_ids, list):
- token_ids = list(token_ids)
- self._tokenize_cache[key] = token_ids
- if len(self._tokenize_cache) > self._tokenize_cache_capacity:
- self._tokenize_cache.popitem(last=False)
- return token_ids
-
- def messages2ids(self, messages):
- """
- Convert multi-turn messages into ID sequences.
-
- Args:
- messages (List[List[Dict[str, Any]]]): multi-turn messages.
-
- Returns:
- List[int]: ID sequences
- """
- raise NotImplementedError
-
- def _get_think_token_ids(self):
- think_token_ids = getattr(self, "_think_token_ids", None)
- if think_token_ids is not None:
- return think_token_ids
- tokenizer = getattr(self, "tokenizer", None)
- vocab = tokenizer.get_vocab() if tokenizer is not None else {}
- think_start_id = vocab.get("", -1)
- think_end_id = vocab.get("", -1)
- self._think_token_ids = (think_start_id, think_end_id)
- return self._think_token_ids
-
- def _prepare_think_stop_sentence(self, logits_processors_args, max_model_len=None):
- if not isinstance(logits_processors_args, dict):
- return logits_processors_args
- think_stop_sentence = logits_processors_args.get("think_stop_sentence")
- if isinstance(think_stop_sentence, str) and think_stop_sentence:
- sentence_token_ids = self._encode_literal_text_with_cache(think_stop_sentence)
- logits_processors_args["think_stop_sentence_token_ids"] = sentence_token_ids
- logits_processors_args.pop("think_stop_sentence", None)
- return logits_processors_args
-
- def _update_thinking_prompt_state(self, prompt_token_ids, logits_processors_args):
- if not isinstance(logits_processors_args, dict):
- return logits_processors_args
- thinking_budget = logits_processors_args.get("thinking_budget")
- if thinking_budget is None or not isinstance(thinking_budget, int) or thinking_budget < 0:
- return logits_processors_args
- if logits_processors_args.get("think_prompt_checked"):
- return logits_processors_args
- if prompt_token_ids is None:
- return logits_processors_args
- token_len = getattr(prompt_token_ids, "size", None) or len(prompt_token_ids)
- if token_len == 0:
- return logits_processors_args
- think_start_id, think_end_id = self._get_think_token_ids()
- if think_start_id < 0 or think_end_id < 0:
- return logits_processors_args
-
- if hasattr(prompt_token_ids, "tolist"):
- token_list = prompt_token_ids.tolist()
- else:
- token_list = list(prompt_token_ids)
-
- started = False
- ended = False
- tokens_after_start = 0
- last_token_id = None
- in_thinking = False
- for token_id in token_list:
- if token_id == think_start_id:
- started = True
- ended = False
- in_thinking = True
- elif token_id == think_end_id and in_thinking:
- ended = True
- in_thinking = False
- if started and token_list:
- # Align with operator-level reasoning_max_tokens: prompt-side tokens
- # inside do not consume thinking budget.
- last_token_id = int(token_list[-1])
-
- logits_processors_args["think_prompt_checked"] = True
- logits_processors_args["think_prompt_started"] = started
- logits_processors_args["think_prompt_ended"] = ended
- logits_processors_args["think_prompt_tokens_after_start"] = tokens_after_start
- if last_token_id is not None:
- logits_processors_args["think_prompt_last_token_id"] = last_token_id
- else:
- logits_processors_args.pop("think_prompt_last_token_id", None)
- return logits_processors_args
-
- def ids2tokens(self, token_id, task_id=None):
- """
- token ids to strings
-
- Args:
- token_id (List[int]): token id
- task_id (str): task id
-
- Returns:
- List[str]: strings
- """
- raise NotImplementedError
-
- @abstractmethod
- def _load_tokenizer(self):
- """
- load tokenizer
-
- Returns:
- tokenizer (AutoTokenizer)
- """
- raise NotImplementedError
-
-
-class DataProcessor(BaseDataProcessor):
- def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None):
- """
- Initializes the DecodeStatus object.
-
- Args:
- model_name_or_path (str): The name or path of the pre-trained model to be loaded.
- Can also be a path to a directory containing the pre-trained model file.
-
- Returns:
- None.
-
- Raises:
- None.
- """
-
- self.model_name_or_path = model_name_or_path
-
- # Generation config
- try:
- self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path)
- except Exception as e:
- data_processor_logger.warning(
- f"Can't find generation config: {e}, so it will not use generation_config field in the model config"
- )
- self.generation_config = None
-
- self.decode_status = dict()
- self.model_status_dict = dict()
- self.tool_parser_dict = dict()
- self.tokenizer = self._load_tokenizer()
- self._tokenize_cache = OrderedDict()
- self._tokenize_cache_capacity = 128
- data_processor_logger.info(
- f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, \
- eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
- )
-
- try:
- from paddleformers.trl.llm_utils import get_eos_token_id
- except Exception:
- from paddleformers.cli.utils.llm_utils import get_eos_token_id
-
- self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config)
- data_processor_logger.info(
- f"The eos_token_ids obtained by merging tokenizer and generation_config is {self.eos_token_ids}"
- )
- self.eos_token_id_len = len(self.eos_token_ids)
- self.pad_token_id = self.get_pad_id()
- self.reasoning_parser = None
- self.tool_parser_obj = tool_parser_obj
- if reasoning_parser_obj:
- self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
- self.tokenizer.pad_token_id = self.pad_token_id
-
- def process_request(self, request, max_model_len=None, **kwargs):
- """
- Preprocess the request
-
- Args:
- request (Dict): may contain text and messages fields
-
- Returns:
- bool: Whether preprocessing is successful
- str: error message
- """
- data_processor_logger.info(f"Start processing request: {request}")
- request = self._apply_default_parameters(request)
- if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0:
- request.eos_token_ids = self.eos_token_ids
-
- # processing stop_sequences and stop_token_ids
- process_stop_token_ids(request, self.update_stop_seq)
-
- # processing bad_words
- bad_words = request.get("bad_words")
- bad_words_token_ids = request.get("bad_words_token_ids")
- if bad_words:
- bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
- request["bad_words_token_ids"] = bad_words_token_ids
-
- logits_processors_args = self._prepare_think_stop_sentence(
- request.get("logits_processors_args") or {}, max_model_len
- )
- request["logits_processors_args"] = logits_processors_args
-
- # processing prompt_token_ids
- if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
- if request.prompt is not None:
- prompt = request.prompt
- add_special_tokens = request.get("add_special_tokens", False)
- assert isinstance(prompt, str) or (
- isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
- ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
- if isinstance(prompt, list): # if prompt is a token id list
- request.prompt_token_ids = prompt
- else:
- request.prompt_token_ids = self.text2ids(
- request.prompt, max_model_len, add_special_tokens=add_special_tokens
- )
- elif request.messages is not None:
- if self.tokenizer.chat_template is None:
- raise ValueError("This model does not support chat_template.")
- task = request.to_dict()
- chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
- if chat_template_kwargs:
- if isinstance(chat_template_kwargs, dict):
- for k, v in chat_template_kwargs.items():
- if k not in task or task[k] is None:
- task[k] = v
- else:
- raise ValueError("Invalid input: chat_template_kwargs must be a dict")
- task.setdefault("enable_thinking", True)
- request.prompt_token_ids = self.messages2ids(task, **chat_template_kwargs)
- else:
- raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")
-
- if len(request.prompt_token_ids) == 0:
- raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
-
- # truncate prompts that exceed the length limit
- if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
- request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
-
- logits_processors_args = request.get("logits_processors_args") or {}
- logits_processors_args = self._update_thinking_prompt_state(request.prompt_token_ids, logits_processors_args)
- request["logits_processors_args"] = logits_processors_args
-
- max_tokens = max_model_len - len(request.prompt_token_ids)
- if request.get("max_tokens") is None:
- request.set("max_tokens", max(1, max_tokens))
- else:
- request.set("max_tokens", min(max_tokens, request.get("max_tokens")))
- if request.get("temperature") < _SAMPLING_EPS:
- # zero temperature means greedy decoding: set top_k=1 to force argmax
- request.set("temperature", 1)
- request.set("top_k", 1)
- if request.get("top_p") < _SAMPLING_EPS:
- request.set("top_p", _SAMPLING_EPS)
- request.set("top_k", 1)
- if self.reasoning_parser:
- model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
- parts = request.request_id.split("_")
- if len(parts) > 1:
- real_req_id = parts[0]
- index = int(parts[1])
- n = request.get("n", 1)
- for idx in range(index * n, (index + 1) * n):
- self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
- else:
- self.model_status_dict[request.request_id] = model_status
- request.enable_thinking = model_status == "think_start"
-
- data_processor_logger.info(f"Processed request: {request}")
- return request
-
- def process_request_dict(self, request, max_model_len=None, **kwargs):
- """
- Preprocess the request
-
- Args:
- request Request: may contain text and messages fields
-
- Returns:
- bool: Whether preprocessing is successful
- str: error message
- """
- data_processor_logger.info(f"Start processing request: {request}")
- request = self._apply_default_parameters(request)
- if not request.eos_token_ids:
- request.eos_token_ids = self.eos_token_ids
-
- # processing stop_sequences and stop_token_ids
- process_stop_token_ids(request, self.update_stop_seq)
-
- # processing bad_words
- bad_words = request.sampling_params.bad_words
- bad_words_token_ids = request.sampling_params.bad_words_token_ids
- if bad_words:
- bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
- request.sampling_params.bad_words_token_ids = bad_words_token_ids
-
- logits_processors_args = self._prepare_think_stop_sentence(
- getattr(request.sampling_params, "logits_processors_args", None) or {}, max_model_len
- )
- request.sampling_params.logits_processors_args = logits_processors_args
-
- # processing prompt_token_ids
- if not request.prompt_token_ids:
- if request.prompt:
- prompt = request.prompt
- add_special_tokens = getattr(request, "add_special_tokens", None) or False
- assert isinstance(prompt, str) or (
- isinstance(prompt, list) and all([isinstance(t, int) for t in prompt])
- ), f"prompt must be a string or a list of integers, but got {type(prompt)}"
- if isinstance(prompt, list): # if prompt is a token id list
- request.prompt_token_ids = prompt
- else:
- request.prompt_token_ids = self.text2ids(
- request.prompt, max_model_len, add_special_tokens=add_special_tokens
- ).tolist()
- elif request.messages:
- if self.tokenizer.chat_template is None:
- raise ValueError("This model does not support chat_template.")
- chat_template_kwargs = kwargs.get("chat_template_kwargs", {})
- if not chat_template_kwargs:
- chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {}
- if chat_template_kwargs:
- if isinstance(chat_template_kwargs, dict):
- for k, v in chat_template_kwargs.items():
- if not getattr(request, k, None):
- setattr(request, k, v)
- else:
- raise ValueError("Invalid input: chat_template_kwargs must be a dict")
- if getattr(request, "enable_thinking") is None:
- setattr(request, "enable_thinking", True)
- request.prompt_token_ids = self.messages2ids(request, **chat_template_kwargs)
- delattr(request, "chat_template_kwargs")
- else:
- raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
-
- if len(request.prompt_token_ids) == 0:
- raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
-
- # truncate prompts that exceed the length limit
- if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
- request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
- logits_processors_args = getattr(request.sampling_params, "logits_processors_args", None) or {}
- logits_processors_args = self._update_thinking_prompt_state(request.prompt_token_ids, logits_processors_args)
- request.sampling_params.logits_processors_args = logits_processors_args
-
- max_tokens = max_model_len - len(request.prompt_token_ids)
- if getattr(request.sampling_params, "max_tokens", None) is None:
- request.sampling_params.max_tokens = max(1, max_tokens)
- else:
- request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens)
-
- if request.sampling_params.temperature < _SAMPLING_EPS:
- # zero temperature means greedy decoding: set top_k=1 to force argmax
- request.sampling_params.temperature = 1
- request.sampling_params.top_k = 1
- if request.sampling_params.top_p < _SAMPLING_EPS:
- request.sampling_params.top_p = _SAMPLING_EPS
- request.sampling_params.top_k = 1
- if self.reasoning_parser:
- model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
- parts = request.request_id.split("_")
- if len(parts) > 1:
- real_req_id = parts[0]
- index = int(parts[1])
- n = request.sampling_params.n or 1
- for idx in range(index * n, (index + 1) * n):
- self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
- else:
- self.model_status_dict[request.request_id] = model_status
- request.enable_thinking = model_status == "think_start"
-
- data_processor_logger.info(f"Processed request: {request}")
- return request
-
- def process_logprob_response(self, token_ids, **kwargs):
- full_text = self.tokenizer.decode(token_ids, **kwargs)
- return full_text
-
- def process_response(self, response_dict, **kwargs):
- """
- Preprocess the response
-
- Args:
- response_dict (Dict): response for engine, contain ids fields
-
- Returns:
- Dict: response contain text fields
- """
- req_id = response_dict.request_id
- token_ids = response_dict.outputs.token_ids
- if token_ids[-1] == self.tokenizer.eos_token_id:
- token_ids = token_ids[:-1]
- full_text = self.tokenizer.decode(token_ids)
- response_dict.outputs.text = full_text
- if self.reasoning_parser:
- reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
- full_text, response_dict, self.model_status_dict[req_id]
- )
- response_dict.outputs.text = text
- response_dict.outputs.reasoning_content = reasoning_content
- if self.tool_parser_obj:
- tool_parser = self.tool_parser_obj(self.tokenizer)
- tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict)
- if tool_call_info.tools_called:
- response_dict.outputs.tool_calls = tool_call_info.tool_calls
- response_dict.outputs.text = tool_call_info.content
- if req_id in self.model_status_dict:
- del self.model_status_dict[req_id]
- data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
-
- return response_dict
-
- def process_response_obj_normal(self, response_obj, **kwargs):
- """
- Preprocess the response
-
- Args:
- response_obj :response for engine, contain ids fields
-
- Returns:
- RequestOutput: response contain text fields
- """
- output = response_obj.outputs
- token_ids = output.token_ids
- is_end = response_obj.finished
- req_id = response_obj.request_id
- request = kwargs.get("request", None)
- if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"):
- if token_ids[-1] in self.eos_token_ids:
- token_ids = token_ids[:-1]
- delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
- if is_end:
- full_text = previous_texts + delta_text
- response_obj.outputs.completion_tokens = full_text
- response_obj.outputs.text = full_text
- if self.reasoning_parser:
- reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
- full_text,
- request,
- self.model_status_dict[req_id],
- )
- response_obj.outputs.text = text
- response_obj.outputs.reasoning_content = reasoning_content
- reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
- response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
- if self.tool_parser_obj:
- tool_parser = self.tool_parser_obj(self.tokenizer)
- tool_call_info = tool_parser.extract_tool_calls(full_text, request)
- if tool_call_info.tools_called:
- response_obj.outputs.tool_calls = tool_call_info.tool_calls
- response_obj.outputs.text = tool_call_info.content
- data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
- del self.decode_status[req_id]
- if req_id in self.model_status_dict:
- del self.model_status_dict[req_id]
- return response_obj
-
- def process_response_obj_streaming(self, response_obj, **kwargs):
- """
- Preprocess the response
-
- Args:
- response_obj : response for engine, contain ids fields
-
- Returns:
- RequestOutput: response contain text fields
- """
- output = response_obj.outputs
- token_ids = output.token_ids
- is_end = response_obj.finished
- req_id = response_obj.request_id
- request = kwargs.get("request", None)
-
- if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"):
- if token_ids[-1] in self.eos_token_ids:
- token_ids = token_ids[:-1]
- delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id)
- response_obj.outputs.completion_tokens = delta_text
- if self.reasoning_parser:
- reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming(
- previous_texts,
- previous_texts + delta_text,
- delta_text,
- previous_token_ids,
- previous_token_ids + token_ids,
- token_ids,
- self.model_status_dict[req_id],
- )
- response_obj.outputs.delta_message = reasoning_delta_message
- reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None
- reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else []
- response_obj.outputs.reasoning_token_num = len(reasoning_tokens)
- if self.tool_parser_obj:
- if req_id not in self.tool_parser_dict:
- self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer)
- tool_parser = self.tool_parser_dict[req_id]
- tool_call = tool_parser.extract_tool_calls_streaming(
- previous_texts,
- previous_texts + delta_text,
- delta_text,
- previous_token_ids,
- previous_token_ids + token_ids,
- token_ids,
- request,
- )
- if tool_call is None or tool_call.tool_calls:
- response_obj.outputs.delta_message = tool_call
- response_obj.outputs.text = delta_text
- if is_end:
- data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
- del self.decode_status[req_id]
- if req_id in self.tool_parser_dict:
- del self.tool_parser_dict[req_id]
- if req_id in self.model_status_dict:
- del self.model_status_dict[req_id]
- return response_obj
-
- def process_response_dict(self, response_dict, **kwargs):
- """
- Preprocess the response
-
- Args:
- response_obj: response for engine, contain ids fields
-
- Returns:
- Dict: response contain text fields
- """
- stream = kwargs.get("stream", True)
- if stream:
- return self.process_response_obj_streaming(response_dict, **kwargs)
- else:
- return self.process_response_obj_normal(
- response_dict,
- **kwargs,
- )
-
- def text2ids(self, text, max_model_len, **kwargs):
- """
- text to token ids
-
- Args:
- text (str): text
-
- Returns:
- List[int]: token ids list
- """
-
- add_special_tokens = kwargs.get("add_special_tokens")
- if envs.FD_USE_HF_TOKENIZER:
- tokens = self.tokenizer(
- text,
- return_tensors="np",
- padding=True,
- truncation=True,
- )
- else:
- text = [text] if isinstance(text, str) else text
-
- tokens = self.tokenizer(
- text,
- return_tensors="np",
- padding=True,
- truncation=True,
- max_length=max_model_len,
- add_special_tokens=add_special_tokens,
- )
-
- return tokens["input_ids"][0]
-
- def messages2ids(self, request, **kwargs):
- """
- Convert multi-turn messages into ID sequences.
-
- Args:
- messages (List[List[Dict[str, Any]]]): multi-turn messages.
-
- Returns:
- List[int]: ID sequences
- """
- message_dict = {
- key: getattr(request, key, None)
- for key in ["messages", "tools", "documents", "enable_thinking", "system"]
- if getattr(request, key, None) is not None
- }
- if "add_generation_prompt" not in kwargs:
- kwargs["add_generation_prompt"] = (
- request.add_generation_prompt if request.add_generation_prompt is not None else True
- )
- spliced_message = self.tokenizer.apply_chat_template(
- message_dict,
- tokenize=False,
- split_special_tokens=False,
- add_special_tokens=False,
- **kwargs,
- )
- request.prompt_tokens = spliced_message
- tokens = self.tokenizer.tokenize(spliced_message)
- req_id = getattr(request, "request_id", None)
- token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
- data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
- return token_ids
-
- def ids2tokens(self, token_id, task_id):
- """
- token ids to strings
-
- Args:
- token_ids (List[int]): token ids
- task_id (str): task id
-
- Returns:
- List[str]: strings
- """
- if envs.FD_USE_HF_TOKENIZER:
- if task_id not in self.decode_status:
- # history token ids & history token strings & befer decode str
- self.decode_status[task_id] = [[], [], ""]
-
- previous_token_ids = self.decode_status[task_id][0]
- decode_str = self.tokenizer.batch_decode(
- [previous_token_ids + token_id],
- skip_special_tokens=True,
- clean_up_tokenization_spaces=False,
- )
- if isinstance(decode_str, list) and len(decode_str):
- new_str = decode_str[0].replace(self.decode_status[task_id][2], "", 1)
- self.decode_status[task_id][1].append(new_str)
- self.decode_status[task_id][2] = decode_str[0]
- else:
- new_str = ""
- self.decode_status[task_id][0] += token_id
- return new_str
- else:
- if task_id not in self.decode_status:
- # prefix offset & read offset & history token ids & history token strings
- self.decode_status[task_id] = [0, 0, [], ""]
-
- prefix_offset = self.decode_status[task_id][0]
- read_offset = self.decode_status[task_id][1]
- previous_token_ids = self.decode_status[task_id][2]
- previous_texts = self.decode_status[task_id][3]
- decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
- previous_token_ids + token_id, prefix_offset, read_offset
- )
- self.decode_status[task_id][0] = prefix_offset
- self.decode_status[task_id][1] = read_offset
- self.decode_status[task_id][2] += token_id
- self.decode_status[task_id][3] += decode_str
-
- return decode_str, previous_token_ids, previous_texts
-
- def _load_tokenizer(self):
- """
- load tokenizer
-
- Returns:
- tokenizer (AutoTokenizer)
- """
- if envs.FD_USE_HF_TOKENIZER:
- from transformers import AutoTokenizer
-
- return AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=False)
- else:
- from paddleformers.transformers import AutoTokenizer
-
- return AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
-
- def clear_request_status(self, task_id):
- """
- clear request status
-
- Args:
- task_id (str): task id
-
- Returns:
- results_all (str): all token strings
- """
- results_all = ""
- if task_id in self.decode_status:
- if envs.FD_USE_HF_TOKENIZER:
- results_all = self.decode_status[task_id][2]
- else:
- results_all = "".join(self.decode_status[task_id][3])
- del self.decode_status[task_id]
- return results_all
-
- def get_pad_id(self):
- """
- get pad_token_id, if not pad_token_id, use eos_token
-
- Returns:
- int: pad_token_id
- """
- if isinstance(self.tokenizer, (LlamaTokenizer, Llama3Tokenizer)) and not self.tokenizer.pad_token_id:
- return self.tokenizer.eos_token
- return self.tokenizer.pad_token_id
-
- def pad_batch_data(
- self,
- insts,
- pad_id=0,
- return_seq_len=False,
- return_array=True,
- pad_style="right",
- ):
- """Pad the instances to the max sequence length in batch."""
- if len(insts) == 0:
- padded_insts = np.array([[]], dtype=np.int64) if return_array else [[]]
- if return_seq_len:
- seq_len = np.array([], dtype=np.int64) if return_array else []
- return padded_insts, seq_len
- return padded_insts
-
- max_len = max(map(len, insts))
- if pad_style == "left":
- padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts]
- else:
- padded_insts = [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts]
- if return_array:
- padded_insts = np.array(padded_insts, dtype=np.int64).reshape([-1, max_len])
-
- if return_seq_len:
- seq_len = [len(inst) for inst in insts]
- if return_array:
- seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1)
- return padded_insts, seq_len
- return padded_insts
-
- def update_stop_seq(self, stop_sequences):
- """
- Update stop sequences from request.
- """
- stop_seqs = []
- for seq in stop_sequences:
- if seq != self.tokenizer.eos_token_id:
- stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq)))
- stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False)
- data_processor_logger.debug(f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
- return stop_seqs, stop_seqs_len
-
- def update_bad_words(self, bad_words, bad_words_token_ids):
- """Support bad words"""
-
- token_ids = bad_words_token_ids
-
- if token_ids is None:
- token_ids = []
- for bad_word in bad_words:
- # To prohibit words both at the beginning
- # and in the middle of text
- # (related to add_prefix_space tokenizer parameter)
- for add_prefix_space in [False, True]:
- prefix = " " if add_prefix_space else ""
- prompt = prefix + bad_word.lstrip()
- prompt_token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(prompt))
-
- if len(prompt_token_ids) != 1:
- if not add_prefix_space:
- data_processor_logger.warning(
- f"Skip bad_words: <{prompt}>."
- f"Bad words should be a single token."
- f"Got tokens: {prompt_token_ids}."
- )
- continue
-
- if prompt_token_ids[0] > self.tokenizer.vocab_size:
- if not add_prefix_space:
- data_processor_logger.warning(
- f"Skip bad_words: <{prompt}>."
- f"All token id values should be satisfying:"
- f" 0 <= token_id < {self.tokenizer.vocab_size}."
- f"Got token: {prompt_token_ids}."
- )
- continue
-
- if prompt_token_ids not in token_ids:
- token_ids.extend(prompt_token_ids)
- return token_ids
diff --git a/fastdeploy/inter_communicator/zmq_server.py b/fastdeploy/inter_communicator/zmq_server.py
index 7073edb48a5..c88fe96eb08 100644
--- a/fastdeploy/inter_communicator/zmq_server.py
+++ b/fastdeploy/inter_communicator/zmq_server.py
@@ -169,10 +169,7 @@ def pack_aggregated_data(self, data):
if len(data) > 1:
for response in data[1:]:
result.add(response)
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- result = ForkingPickler.dumps([result.to_dict()])
- else:
- result = ForkingPickler.dumps([result])
+ result = ForkingPickler.dumps([result.to_dict()])
return result
def receive_json_once(self, block=False):
@@ -303,10 +300,7 @@ def _send_response_per_query(self, req_id, data):
if self.aggregate_send:
result = self.pack_aggregated_data(new_data)
else:
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- result = ForkingPickler.dumps([response.to_dict() for response in new_data])
- else:
- result = ForkingPickler.dumps(new_data)
+ result = ForkingPickler.dumps([response.to_dict() for response in new_data])
with self.response_token_lock:
_zmq_metrics_stats = ZMQMetricsStats()
@@ -349,13 +343,10 @@ def _send_batch_response(self, batch_data, worker_pid=None):
metrics_address = self.address or self.worker_push_addresses.get(worker_pid, "unknown")
try:
- if not envs.ENABLE_V1_DATA_PROCESSOR:
- result = msgpack.packb(
- [[output.to_dict() for output in outputs] for outputs in batch_data],
- default=_msgpack_default,
- )
- else:
- result = ForkingPickler.dumps(batch_data)
+ result = msgpack.packb(
+ [[output.to_dict() for output in outputs] for outputs in batch_data],
+ default=_msgpack_default,
+ )
result_len = len(result)
# Only hold lock for the actual socket send
diff --git a/tests/engine/test_common_engine.py b/tests/engine/test_common_engine.py
index 5a6241c4433..69c6db2a753 100644
--- a/tests/engine/test_common_engine.py
+++ b/tests/engine/test_common_engine.py
@@ -1267,7 +1267,6 @@ def receive_json_once(self, block):
with (
patch("fastdeploy.engine.common_engine.main_process_metrics", DummyMetrics()),
- patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_DATA_PROCESSOR", False),
patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False),
patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
):
@@ -3277,7 +3276,6 @@ def __init__(self):
with (
patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", True),
patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False),
- patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_DATA_PROCESSOR", False),
patch("fastdeploy.engine.common_engine.main_process_metrics", DummyMetrics()),
patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None),
):
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 8af369c396a..1b33405503f 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -20,7 +20,6 @@
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, Mock, patch
-import numpy as np
import paddle
import fastdeploy.envs as envs
@@ -307,51 +306,29 @@ async def test_create_chat_completion_request_id_and_v1_stream(self):
self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(
side_effect=ParameterError("param", "bad")
)
- with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start") as mock_trace:
- resp = await self.chat_completion_handler.create_chat_completion(
- ChatCompletionRequest(
- messages=[{"role": "user", "content": "Hello"}],
- request_id="abc",
- stream=False,
- )
+ with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start") as mock_trace:
+ resp = await self.chat_completion_handler.create_chat_completion(
+ ChatCompletionRequest(
+ messages=[{"role": "user", "content": "Hello"}],
+ request_id="abc",
+ stream=False,
)
+ )
self.assertEqual(resp.error.param, "param")
self.assertIn("bad", resp.error.message)
self.assertEqual(mock_trace.call_args.kwargs["rid"], "chatcmpl-abc")
self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(side_effect=RuntimeError("boom"))
- with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start"):
- resp = await self.chat_completion_handler.create_chat_completion(
- ChatCompletionRequest(
- messages=[{"role": "user", "content": "Hello"}],
- request_id="err",
- stream=False,
- )
+ with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start"):
+ resp = await self.chat_completion_handler.create_chat_completion(
+ ChatCompletionRequest(
+ messages=[{"role": "user", "content": "Hello"}],
+ request_id="err",
+ stream=False,
)
+ )
self.assertIn("generator error", resp.error.message)
- self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(return_value=np.array([1, 2]))
- stream_mock = Mock(return_value="streamed")
- with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
- with patch(
- "fastdeploy.entrypoints.openai.serving_chat.Request.from_generic_request",
- return_value={"metrics": {}, "prompt_tokens": "pt", "max_tokens": 3},
- ):
- with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start") as mock_trace:
- with patch.object(self.chat_completion_handler, "chat_completion_stream_generator", stream_mock):
- result = await self.chat_completion_handler.create_chat_completion(
- ChatCompletionRequest(
- messages=[{"role": "user", "content": "Hello"}],
- user="user",
- stream=True,
- )
- )
- self.assertEqual(result, "streamed")
- self.assertTrue(mock_trace.call_args.kwargs["rid"].startswith("chatcmpl-user-"))
- self.assertEqual(stream_mock.call_args.args[3], [1, 2])
-
async def test_create_chat_completion_full_and_waiting_errors(self):
"""Test full generator error and waiting error handling."""
self.chat_completion_handler.engine_client.is_master = True
@@ -361,15 +338,14 @@ async def test_create_chat_completion_full_and_waiting_errors(self):
self.chat_completion_handler.engine_client.semaphore.status = Mock(return_value="ok")
self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(return_value=[1, 2])
- with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- with patch.object(
- self.chat_completion_handler,
- "chat_completion_full_generator",
- AsyncMock(side_effect=RuntimeError("boom")),
- ):
- resp = await self.chat_completion_handler.create_chat_completion(
- ChatCompletionRequest(messages=[{"role": "user", "content": "Hello"}], stream=False)
- )
+ with patch.object(
+ self.chat_completion_handler,
+ "chat_completion_full_generator",
+ AsyncMock(side_effect=RuntimeError("boom")),
+ ):
+ resp = await self.chat_completion_handler.create_chat_completion(
+ ChatCompletionRequest(messages=[{"role": "user", "content": "Hello"}], stream=False)
+ )
self.assertIn("full generator error", resp.error.message)
with patch(
diff --git a/tests/entrypoints/test_serving_completion.py b/tests/entrypoints/test_serving_completion.py
index b76d798afc5..9c2beb678df 100644
--- a/tests/entrypoints/test_serving_completion.py
+++ b/tests/entrypoints/test_serving_completion.py
@@ -20,7 +20,6 @@
import numpy as np
import paddle
-import fastdeploy.envs as envs
import fastdeploy.metrics.trace as tracing
from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion
from fastdeploy.utils import ErrorCode, ParameterError
@@ -124,40 +123,21 @@ async def test_create_completion_branches(self):
ec = _make_engine_client()
ec.format_and_add_data = AsyncMock(side_effect=ParameterError("max_tokens", "bad"))
serving = OpenAIServingCompletion(ec, None, "pid", None, -1)
- with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- res = await _assert_error(self, serving, _make_request(prompt_token_ids=[1, 2]), param="max_tokens")
+ res = await _assert_error(self, serving, _make_request(prompt_token_ids=[1, 2]), param="max_tokens")
ec.semaphore.release.assert_called_once()
ec = _make_engine_client()
- ec.format_and_add_data = AsyncMock(side_effect=ValueError("bad"))
- serving = OpenAIServingCompletion(ec, None, "pid", None, -1)
-
- def fake_from_generic_request(_, request_id):
- return {"prompt": "hi", "request_id": request_id, "prompt_tokens": [1], "max_tokens": 2, "metrics": {}}
-
- with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
- with patch(
- "fastdeploy.entrypoints.openai.serving_completion.Request.from_generic_request",
- side_effect=fake_from_generic_request,
- ):
- await _assert_error(self, serving, _make_request(prompt="hi"), code=ErrorCode.INVALID_VALUE)
- ec = _make_engine_client()
ec.format_and_add_data = AsyncMock(return_value=np.array([1, 2]))
serving = OpenAIServingCompletion(ec, None, "pid", None, -1)
- with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- with patch.object(serving, "completion_full_generator", AsyncMock(side_effect=RuntimeError("boom"))):
- await _assert_error(
- self, serving, _make_request(prompt="hi"), contains="completion_full_generator error"
- )
+ with patch.object(serving, "completion_full_generator", AsyncMock(side_effect=RuntimeError("boom"))):
+ await _assert_error(self, serving, _make_request(prompt="hi"), contains="completion_full_generator error")
serving = OpenAIServingCompletion(_make_engine_client(), None, "pid", None, -1)
- with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- with patch.object(serving, "completion_stream_generator", return_value="streamed"):
- res = await serving.create_completion(_make_request(request_id="req123", stream=True))
+ with patch.object(serving, "completion_stream_generator", return_value="streamed"):
+ res = await serving.create_completion(_make_request(request_id="req123", stream=True))
self.assertEqual(res, "streamed")
serving = OpenAIServingCompletion(_make_engine_client(), None, "pid", None, -1)
- with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- await _assert_error(
- self, serving, _StreamRaiser(**_make_request().__dict__), contains="create_completion error"
- )
+ await _assert_error(
+ self, serving, _StreamRaiser(**_make_request().__dict__), contains="create_completion error"
+ )
async def test_completion_full_generator_branches(self):
ec = _make_engine_client()
diff --git a/tests/input/test_preprocess.py b/tests/input/test_preprocess.py
index b4659261a8e..4196b729fbb 100644
--- a/tests/input/test_preprocess.py
+++ b/tests/input/test_preprocess.py
@@ -63,10 +63,8 @@ def test_create_processor_text_normal_path(self):
mock_dp = MagicMock()
with (
patch.dict("sys.modules", {"fastdeploy.plugins": None, "fastdeploy.plugins.input_processor": None}),
- patch("fastdeploy.input.preprocess.envs") as mock_envs,
patch("fastdeploy.input.text_processor.DataProcessor", return_value=mock_dp),
):
- mock_envs.ENABLE_V1_DATA_PROCESSOR = False
pp.create_processor()
self.assertIs(pp.processor, mock_dp)
diff --git a/tests/input/v1/test_ernie4_5_processor.py b/tests/input/v1/test_ernie4_5_processor.py
deleted file mode 100644
index 13cfdb10747..00000000000
--- a/tests/input/v1/test_ernie4_5_processor.py
+++ /dev/null
@@ -1,448 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-
-from fastdeploy.engine.request import Request, RequestOutput
-
-MODULE_PATH = "fastdeploy.input.v1.ernie4_5_processor"
-
-from fastdeploy.input.v1.ernie4_5_processor import _SAMPLING_EPS, Ernie4_5Processor
-
-
-class MockTokenizer:
- """A simple mock tokenizer used to simulate tokenization behavior in unit tests."""
-
- def __init__(self):
- self.bos_token = ""
- self.bos_token_id = 101
- self.eos_token = ""
- self.eos_token_id = 102
- self.pad_token_id = 0
- self.vocab_size = 200
- # Non-None value indicates chat_template support
- self.chat_template = "dummy"
-
- def tokenize(self, text):
- """Return multi-token output for 'multi*' to test branching; otherwise return single-token."""
- if text.startswith("multi"):
- return ["multi", "word"]
- return [text]
-
- def convert_tokens_to_ids(self, tokens):
- """Map tokens to synthetic IDs for branch coverage."""
- mapping = {
- "bad": 5,
- " bad": 6,
- "multi": 7,
- "word": 8,
- "oov": 250,
- " oov": 251,
- "hello": 9,
- "REASON": 42,
- }
- return [mapping.get(t, 1) for t in tokens]
-
- def decode(self, token_ids, **kwargs):
- """Simple decode implementation returning a space-separated string."""
- return " ".join(str(t) for t in token_ids)
-
- def decode_token(self, token_ids, prefix_offset, read_offset):
- """Incremental decode used to test streaming behavior."""
- new_tokens = token_ids[read_offset:]
- decode_str = " ".join(str(t) for t in new_tokens)
- new_read_offset = len(token_ids)
- return decode_str, prefix_offset, new_read_offset
-
- def apply_chat_template(self, request_or_messages, tokenize, split_special_tokens, add_special_tokens, **kwargs):
- """Minimal chat template implementation used by messages2ids."""
- if isinstance(request_or_messages, dict) and "messages" in request_or_messages:
- return " | ".join(m["content"] for m in request_or_messages["messages"])
- return str(request_or_messages)
-
-
-class ErnieX1ReasoningParser:
- """Mock reasoning parser to trigger reasoning-related branches during streaming."""
-
- def __init__(self, tokenizer):
- self.tokenizer = tokenizer
-
- def extract_reasoning_content(self, full_text, response_dict, model_status):
- """Extract reasoning content for non-streaming responses."""
-
- class ReasoningContent:
- def __init__(self):
- self.reasoning_content = "mock_reasoning"
- self.content = "mock_content"
-
- return ReasoningContent()
-
- def extract_reasoning_content_streaming(
- self,
- previous_texts,
- full_text,
- delta_text,
- previous_token_ids,
- all_token_ids,
- delta_token_ids,
- model_status,
- ):
- """Return a simple object with reasoning_content to cover reasoning branch."""
-
- class ReasoningDelta:
- def __init__(self, content):
- self.reasoning_content = content
-
- return ReasoningDelta(delta_text)
-
-
-class MockToolParser:
- """Mock tool parser to cover tool-related branches in both normal and streaming responses."""
-
- def __init__(self, tokenizer):
- self.tokenizer = tokenizer
-
- class ToolDelta:
- """Simple container representing detected tool calls."""
-
- def __init__(self):
- self.tool_calls = [{"name": "fake_tool"}]
- self.tools_called = True
- self.content = "tool_content"
-
- def extract_tool_calls(self, full_text, response_dict):
- """Used in process_response and process_response_obj_normal."""
- return MockToolParser.ToolDelta()
-
- def extract_tool_calls_streaming(
- self,
- previous_texts,
- full_text,
- delta_text,
- previous_token_ids,
- all_token_ids,
- delta_token_ids,
- response_dict,
- ):
- """Used in process_response_obj_streaming."""
- return MockToolParser.ToolDelta()
-
-
-class TestErnie4_5Processor(unittest.TestCase):
- """Unit tests for Ernie4_5Processor focusing on preprocessing and postprocessing logic."""
-
- def setUp(self):
- """Patch external dependencies: tokenizer, generation config, eos token resolution."""
- self.gen_patcher = patch(f"{MODULE_PATH}.GenerationConfig.from_pretrained", return_value=MagicMock())
- self.tokenizer_patcher = patch(
- f"{MODULE_PATH}.Ernie4_5Tokenizer.from_pretrained", side_effect=lambda path: MockTokenizer()
- )
- self.eos_patcher = patch(
- "paddleformers.cli.utils.llm_utils.get_eos_token_id",
- side_effect=lambda tokenizer, cfg: [tokenizer.eos_token_id],
- )
-
- self.gen_patcher.start()
- self.tokenizer_patcher.start()
- self.eos_patcher.start()
-
- def tearDown(self):
- """Stop all patches after each test."""
- self.gen_patcher.stop()
- self.tokenizer_patcher.stop()
- self.eos_patcher.stop()
-
- def _make_processor(self, reasoning=False, tool=False):
- """Helper for creating a processor with optional reasoning/tool parser support."""
- reasoning_cls = ErnieX1ReasoningParser if reasoning else None
- tool_cls = MockToolParser if tool else None
- proc = Ernie4_5Processor("dummy-model", reasoning_parser_obj=reasoning_cls, tool_parser_obj=tool_cls)
- proc._apply_default_parameters = lambda req: req
- proc.model_status_dict = {"req-1": "think_start"}
- return proc
-
- def test_update_bad_words(self):
- """Verify filtering, multi-token skipping, and OOV behavior in update_bad_words."""
- proc = self._make_processor()
-
- bad_words = ["bad", "multi", "oov"]
- token_ids = proc.update_bad_words(bad_words, bad_words_token_ids=None)
-
- self.assertEqual(token_ids, [5, 6, 1])
-
- def test_process_request_dict_with_prompt_string(self):
- """Test prompt-based tokenization, truncation, and temperature/top_p correction."""
- proc = self._make_processor()
- req = {
- "request_id": "test_0",
- "prompt": "hello",
- "temperature": 0.0,
- "top_p": 0.0,
- }
- req = Request.from_dict(req)
-
- processed = proc.process_request_dict(req, max_model_len=10)
-
- self.assertTrue(hasattr(processed, "eos_token_ids"))
- self.assertEqual(processed.eos_token_ids, [proc.tokenizer.eos_token_id])
-
- expected_ids = proc.tokenizer.convert_tokens_to_ids(proc.tokenizer.tokenize("hello"))
- self.assertEqual(processed.prompt_token_ids, expected_ids)
-
- self.assertEqual(processed.sampling_params.max_tokens, max(1, 10 - len(expected_ids)))
- self.assertEqual(processed.sampling_params.temperature, 1)
- self.assertEqual(processed.sampling_params.top_k, 1)
- self.assertAlmostEqual(processed.sampling_params.top_p, _SAMPLING_EPS)
- self.assertEqual(processed.prompt_tokens, "hello")
-
- def test_pad_batch_data_right_and_left_and_empty(self):
- """Test left/right padding and empty input behavior."""
- proc = self._make_processor()
-
- insts = [[1, 2], [3]]
-
- padded, seq_len = proc.pad_batch_data(
- insts, pad_id=0, return_seq_len=True, return_array=True, pad_style="right"
- )
- np.testing.assert_array_equal(padded, np.array([[1, 2], [3, 0]], dtype=np.int64))
- np.testing.assert_array_equal(seq_len, np.array([[2], [1]], dtype=np.int64))
-
- padded_left, seq_len_left = proc.pad_batch_data(
- insts, pad_id=0, return_seq_len=True, return_array=True, pad_style="left"
- )
- np.testing.assert_array_equal(padded_left, np.array([[1, 2], [0, 3]], dtype=np.int64))
- np.testing.assert_array_equal(seq_len_left, np.array([[2], [1]], dtype=np.int64))
-
- padded_empty, seq_len_empty = proc.pad_batch_data(
- [], pad_id=0, return_seq_len=True, return_array=True, pad_style="right"
- )
- np.testing.assert_array_equal(padded_empty, np.array([[]], dtype=np.int64))
- np.testing.assert_array_equal(seq_len_empty, np.array([], dtype=np.int64))
-
- def test_process_response_obj_streaming_with_reasoning_and_tool(self):
- """Ensure streaming mode handles reasoning and tool-call parsing correctly."""
- proc = self._make_processor(reasoning=True, tool=True)
-
- response = {
- "finished": True,
- "request_id": "req-1",
- "outputs": {"token_ids": [10, 11]},
- }
- response = RequestOutput.from_dict(response)
-
- result = proc.process_response_obj_streaming(response, enable_thinking=False, include_stop_str_in_output=False)
-
- outputs = result.outputs
-
- self.assertTrue(hasattr(outputs, "completion_tokens"))
- self.assertTrue(hasattr(outputs, "text"))
- self.assertEqual(outputs.completion_tokens, outputs.reasoning_content)
-
- self.assertTrue(hasattr(outputs, "reasoning_token_num"))
- self.assertGreaterEqual(outputs.reasoning_token_num, 0)
-
- self.assertTrue(hasattr(outputs, "delta_message"))
- delta_msg = outputs.delta_message
- self.assertTrue(hasattr(delta_msg, "tool_calls"))
-
- self.assertNotIn("req-1", proc.decode_status)
- self.assertNotIn("req-1", proc.tool_parser_dict)
-
- def test_update_stop_seq(self):
- """Test stop sequence tokenization and padding."""
- proc = self._make_processor()
-
- stop_seqs, stop_lens = proc.update_stop_seq("stop")
- self.assertIsInstance(stop_seqs, list)
- self.assertIsInstance(stop_lens, list)
-
- stop_seqs2, stop_lens2 = proc.update_stop_seq(["stop", "hello"])
- self.assertEqual(len(stop_seqs2), 2)
- self.assertEqual(len(stop_lens2), 2)
-
- def test_process_request_chat_template_kwargs(self):
- """Test chat_template_kwargs application inside process_request_dict."""
-
- proc = self._make_processor()
-
- request = {
- "request_id": "test_0",
- "messages": [{"role": "user", "content": "hello"}],
- "temperature": 0.5,
- "top_p": 0.5,
- }
- request = Request.from_dict(request)
-
- processed = proc.process_request_dict(request, max_model_len=20, chat_template_kwargs={"extra": "VALUE"})
-
- self.assertEqual(processed.eos_token_ids, [proc.tokenizer.eos_token_id])
-
- expected_ids = proc.tokenizer.convert_tokens_to_ids(proc.tokenizer.tokenize("hello"))
- self.assertIsNotNone(processed.prompt_token_ids)
- self.assertEqual(processed.prompt_token_ids, expected_ids)
-
- self.assertTrue(hasattr(processed.sampling_params, "max_tokens"))
- self.assertEqual(processed.sampling_params.max_tokens, max(1, 20 - len(expected_ids)))
-
- def test_process_request_dict_chat_template_kwargs(self):
- """Test chat_template_kwargs insertion in process_request_dict."""
- proc = self._make_processor()
-
- req = {
- "request_id": "test_0",
- "messages": [{"role": "user", "content": "hey"}],
- "chat_template_kwargs": {"A": "B"},
- "temperature": 0.5,
- "top_p": 0.5,
- }
- req = Request.from_dict(req)
- req.chat_template_kwargs = {"A": "B"}
-
- result = proc.process_request_dict(req, max_model_len=30)
-
- self.assertTrue(hasattr(result, "prompt_token_ids"))
- self.assertEqual(getattr(result, "A"), "B")
-
- def test_init_generation_config_exception(self):
- """Test fallback behavior when GenerationConfig loading fails."""
- with patch(f"{MODULE_PATH}.GenerationConfig.from_pretrained", side_effect=Exception("fail")):
- proc = self._make_processor()
- self.assertIsNone(proc.generation_config)
-
- # def test_process_response_with_tool_parser(self):
- # """Verify tool_call extraction in process_response."""
- # proc = self._make_processor(tool=True)
-
- # class RespObj:
- # """Mock response carrying token_ids and index for testing."""
-
- # def __init__(self):
- # self.request_id = "reqx"
- # self.outputs = MagicMock()
- # self.outputs.token_ids = [9, proc.tokenizer.eos_token_id]
- # self.outputs.index = 0
-
- # resp = RespObj()
- # result = proc.process_response(resp)
-
- # self.assertTrue(hasattr(result.outputs, "tool_calls"))
- # self.assertEqual(result.outputs.tool_calls[0]["name"], "fake_tool")
-
- def test_process_response_obj_normal_with_tool(self):
- """Verify tool_call extraction in normal (non-streaming) response mode."""
- proc = self._make_processor(tool=True)
-
- resp = {
- "finished": True,
- "request_id": "task-99",
- "outputs": {"token_ids": [10, 11], "text": ""},
- }
- resp = RequestOutput.from_dict(resp)
-
- result = proc.process_response_obj_normal(resp, enable_thinking=False, include_stop_str_in_output=False)
-
- self.assertTrue(hasattr(result.outputs, "tool_calls"))
- self.assertEqual(result.outputs.tool_calls[0]["name"], "fake_tool")
-
- def test_process_request_greedy_sets_top_k(self):
- """process_request with temperature=0 should set top_k=1 for greedy decoding."""
- proc = self._make_processor()
- proc.messages2ids = MagicMock(return_value=[9])
-
- request = Request.from_dict(
- {
- "request_id": "test_greedy",
- "prompt": "hello",
- "temperature": 0.0,
- "top_p": 0.5,
- }
- )
- result = proc.process_request(request, max_model_len=10)
- self.assertEqual(result.get("temperature"), 1)
- self.assertEqual(result.get("top_k"), 1)
-
- def test_process_request(self):
- """Test process_request method with various input types."""
- proc = self._make_processor()
-
- # Test with prompt string
- request = Request.from_dict(
- {
- "request_id": "test_1",
- "prompt": "hello",
- "temperature": 0.5,
- "top_p": 0.5,
- }
- )
- result = proc.process_request(request, max_model_len=10)
- self.assertEqual(result.prompt_token_ids, [9])
-
- # Test with prompt token ids
- request = Request.from_dict(
- {
- "request_id": "test_2",
- "prompt_token_ids": [1, 2, 3],
- "temperature": 0.5,
- "top_p": 0.5,
- }
- )
- result = proc.process_request(request, max_model_len=5)
- self.assertEqual(result.prompt_token_ids, [1, 2, 3])
-
- # Test with messages - mock the messages2ids method to avoid dict attribute error
- proc.messages2ids = MagicMock(return_value=[9])
- request = Request.from_dict(
- {
- "request_id": "test_3",
- "messages": [{"role": "user", "content": "hello"}],
- "temperature": 0.5,
- "top_p": 0.5,
- }
- )
- result = proc.process_request(request, max_model_len=10)
- self.assertEqual(result.prompt_token_ids, [9])
- proc.messages2ids.assert_called_once()
-
- def test_process_response(self):
- """Test process_response method with various scenarios."""
- # Test without reasoning parser to avoid model_status_dict dependency
- proc = self._make_processor(reasoning=False)
-
- # Test basic response
- response = RequestOutput.from_dict(
- {"request_id": "test_4", "outputs": {"token_ids": [10, 11, proc.tokenizer.eos_token_id], "index": 0}}
- )
- result = proc.process_response(response)
- self.assertEqual(result.outputs.text, "10 11")
-
- # Test another response to ensure consistency
- response = RequestOutput.from_dict(
- {"request_id": "test_5", "outputs": {"token_ids": [20, 21, proc.tokenizer.eos_token_id], "index": 0}}
- )
- result = proc.process_response(response)
- self.assertEqual(result.outputs.text, "20 21")
-
- # Test response without eos_token at the end
- response = RequestOutput.from_dict({"request_id": "test_6", "outputs": {"token_ids": [30, 31], "index": 0}})
- result = proc.process_response(response)
- self.assertEqual(result.outputs.text, "30 31")
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/input/v1/test_ernie_processor.py b/tests/input/v1/test_ernie_processor.py
deleted file mode 100644
index 437e4029a5d..00000000000
--- a/tests/input/v1/test_ernie_processor.py
+++ /dev/null
@@ -1,162 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from unittest.mock import MagicMock, patch
-
-from fastdeploy.engine.request import Request, RequestOutput
-from fastdeploy.input.v1.ernie4_5_processor import Ernie4_5Processor
-
-
-class MockReasoningParser:
- def get_model_status(self, prompt_token_ids):
- return "think_start"
-
-
-class TestErnie4_5ProcessorProcessResponseDictStreaming(unittest.TestCase):
- def setUp(self):
- # 创建 Ernie4_5Processor 实例的模拟对象
- with patch.object(Ernie4_5Processor, "__init__", return_value=None) as mock_init:
- self.processor = Ernie4_5Processor("model_path")
- mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}")
-
- # 设置必要的属性
- self.processor.tokenizer = MagicMock()
- self.processor.tokenizer.eos_token_id = 1
- self.processor.decode_status = {"test": []}
- self.processor.reasoning_end_dict = {}
- self.processor.tool_parser_dict = {}
- self.processor.generation_config = MagicMock()
- self.processor.eos_token_ids = [1]
- self.processor.reasoning_parser = MockReasoningParser()
- self.processor.model_status_dict = {"request-id_0": "think_start", "test": "think_start"}
-
- # 模拟 ids2tokens 方法
- def mock_ids2tokens(token_ids, task_id):
- self.processor.decode_status[task_id] = "mock_decode_status"
- return "delta_text", [2, 3], "previous_texts"
-
- self.processor.ids2tokens = mock_ids2tokens
-
- def mock_messages2ids(request, **kwargs):
- if "chat_template" in kwargs:
- return [1]
- else:
- return [0]
-
- def mock_apply_default_parameters(request):
- return request
-
- self.processor.messages2ids = mock_messages2ids
- self.processor._apply_default_parameters = mock_apply_default_parameters
-
- # 模拟推理解析器
- self.mock_reasoning_parser = MagicMock()
- self.mock_reasoning_parser.__class__.__name__ = "ErnieX1ReasoningParser"
- # self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = ("reasoning", "text")
- self.processor.reasoning_parser = self.mock_reasoning_parser
-
- # 模拟工具解析器
- self.mock_tool_parser = MagicMock()
- self.mock_tool_parser.extract_tool_calls_streaming.return_value = None
- self.mock_tool_parser_obj = MagicMock()
- self.mock_tool_parser_obj.return_value = self.mock_tool_parser
- self.processor.tool_parser_obj = self.mock_tool_parser_obj
-
- def test_process_response_obj_streaming_normal_case(self):
- """测试正常情况下的流式响应处理"""
- # 准备输入
- response_dict = {"finished": False, "request_id": "test", "outputs": {"token_ids": [4, 5]}}
- kwargs = {"enable_thinking": True}
- response = RequestOutput.from_dict(response_dict)
-
- # 调用方法
- result = self.processor.process_response_obj_streaming(response, **kwargs)
-
- # 验证结果
- self.assertEqual(result.outputs.completion_tokens, "delta_text")
-
- def test_process_request_dict(self):
- request_dict = {
- "request_id": "123",
- "messages": [{"role": "user", "content": "Hello!"}],
- "chat_template_kwargs": {"chat_template": "Hello!"},
- "eos_token_ids": [1],
- "temperature": 1,
- "top_p": 1,
- }
- request = Request.from_dict(request_dict)
- request.chat_template_kwargs = {"chat_template": "Hello!"}
- result = self.processor.process_request_dict(request, 100)
- self.assertEqual(result.prompt_token_ids, [1])
-
- def test_process_response_obj_normal(self):
- mock_tokens = ["reasoning", "token", "list"]
- self.processor.tokenizer.tokenize = MagicMock(return_value=mock_tokens)
- self.processor.reasoning_parser.extract_reasoning_content = MagicMock(
- return_value=("Mock reasoning content", "Mock final text")
- )
-
- self.processor.tool_parser_obj = None
-
- response_dict = {
- "request_id": "request-id_0",
- "outputs": {"token_ids": [2, 3, 4, 5, 1], "text": "Initial text", "top_logprobs": []},
- # "finish_reason": "stop",
- "finished": True,
- }
- response = RequestOutput.from_dict(response_dict)
- kwargs = {"enable_thinking": True}
-
- with patch("fastdeploy.input.ernie4_5_processor.data_processor_logger"):
- result = self.processor.process_response_obj_normal(response, **kwargs)
-
- self.mock_reasoning_parser.extract_reasoning_content.assert_called_once()
- self.assertEqual(result.outputs.reasoning_content, "Mock reasoning content")
- self.assertEqual(result.outputs.reasoning_token_num, len(mock_tokens))
- self.assertEqual(result.outputs.text, "Mock final text")
- self.assertTrue(hasattr(result.outputs, "completion_tokens"))
-
- def test_think_status(self):
- """测试 思考机制"""
- request = {
- "prompt": "hello",
- "request_id": "test_1",
- "prompt_token_ids": [1, 2, 3],
- "temperature": 0.7,
- "top_p": 0.9,
- }
- request = Request.from_dict(request)
- self.processor.reasoning_parser = MagicMock()
- self.processor.reasoning_parser.get_model_status.return_value = "think_start"
- self.processor.model_status_dict = {}
- self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(request.enable_thinking, True)
-
- request = {
- "prompt": "hello",
- "request_id": "test",
- "prompt_token_ids": [1, 2, 3],
- "temperature": 0.7,
- "top_p": 0.9,
- }
- request = Request.from_dict(request)
- self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(request.enable_thinking, True)
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/input/v1/test_ernie_vl_processor.py b/tests/input/v1/test_ernie_vl_processor.py
deleted file mode 100644
index 132b423f3cb..00000000000
--- a/tests/input/v1/test_ernie_vl_processor.py
+++ /dev/null
@@ -1,1460 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from types import SimpleNamespace
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.engine.request import CompletionOutput, Request, RequestOutput
-from fastdeploy.engine.sampling_params import SamplingParams
-from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
-from fastdeploy.input.utils import IDS_TYPE_FLAG
-from fastdeploy.input.v1.ernie4_5_vl_processor import Ernie4_5_VLProcessor
-from fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import (
- AdaptiveImageProcessor,
-)
-from fastdeploy.input.v1.ernie4_5_vl_processor.process import DataProcessor
-
-
-class MockReasoningParser:
- def get_model_status(self, prompt_token_ids):
- return "think_start"
-
-
-class TestErnie4_5VLProcessorProcessResponseDictStreaming(unittest.TestCase):
- def setUp(self):
- # Create mock object for Ernie4_5Processor instance
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None) as mock_init:
- self.processor = Ernie4_5_VLProcessor("model_path")
- mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}")
-
- # Set necessary attributes
- self.processor.tokenizer = MagicMock()
- self.processor.tokenizer.eos_token_id = 1
- self.processor.decode_status = {"test": []}
- self.processor.reasoning_end_dict = {}
- self.processor.tool_parser_dict = {}
- self.processor.generation_config = MagicMock()
- self.processor.eos_token_ids = [1]
- self.processor.reasoning_parser = MockReasoningParser()
- self.processor.model_status_dict = {"test": "think_start"}
- self.processor.ernie4_5_processor = MagicMock()
-
- # Mock ids2tokens method
- def mock_ids2tokens(token_ids, task_id):
- return "delta_text", [2, 3], "previous_texts"
-
- self.processor.ids2tokens = mock_ids2tokens
-
- def mock_request2ids(request, **kwargs):
- return {"input_ids": np.array([1, 2, 3]), "prompt_token_ids": [0]}
-
- def mock_check_mm_limits(item):
- pass
-
- def mock_apply_default_parameters(request):
- return request
-
- def mock_pack_outputs(outputs):
- # Ensure input_ids is numpy array if it exists
- result = outputs.copy() if isinstance(outputs, dict) else outputs
- if isinstance(result, dict):
- if "input_ids" in result and isinstance(result["input_ids"], list):
- result["input_ids"] = np.array(result["input_ids"])
- if "token_type_ids" in result and isinstance(result["token_type_ids"], list):
- result["token_type_ids"] = np.array(result["token_type_ids"])
- if "position_ids" in result and isinstance(result["position_ids"], list):
- result["position_ids"] = np.array(result["position_ids"])
- return result
-
- def mock_prompt_token_ids2outputs(request):
- return {
- "input_ids": np.array([1, 1, 1]),
- "token_type_ids": np.array([0, 0, 0]),
- "position_ids": np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]),
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "cur_position": 3,
- }
-
- self.processor._apply_default_parameters = mock_apply_default_parameters
- self.processor._check_mm_limits = mock_check_mm_limits
- self.processor.ernie4_5_processor.request2ids = mock_request2ids
- self.processor.ernie4_5_processor.prompt_token_ids2outputs = mock_prompt_token_ids2outputs
- self.processor.pack_outputs = mock_pack_outputs
-
- # Mock reasoning parser
- self.mock_reasoning_parser = MagicMock()
- self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = None
- self.processor.reasoning_parser = self.mock_reasoning_parser
-
- # Mock tool parser
- self.mock_tool_parser = MagicMock()
- self.mock_tool_parser.extract_tool_calls_streaming.return_value = None
- self.mock_tool_parser_obj = MagicMock()
- self.mock_tool_parser_obj.return_value = self.mock_tool_parser
- self.processor.tool_parser_obj = self.mock_tool_parser_obj
-
- def test_think_status(self):
- """测试 思考机制"""
- request = {
- "prompt": "hello",
- "request_id": "test_1",
- "prompt_token_ids": [1, 2, 3],
- }
- request = Request.from_dict(request)
- self.processor.reasoning_parser = MagicMock()
- self.processor.reasoning_parser.get_model_status.return_value = "think_start"
- self.processor.model_status_dict = {}
- self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(request.enable_thinking, True)
-
- request = {
- "prompt": "hello",
- "request_id": "test",
- "prompt_token_ids": [1, 2, 3],
- }
- request = Request.from_dict(request)
- self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(request.enable_thinking, True)
-
- def test_init(self):
- """Test __init__ method"""
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.data_processor_logger"):
- mock_dp = MagicMock()
- mock_dp.image_patch_id = 1001
- mock_dp.spatial_conv_size = 14
- mock_dp.tokenizer = MagicMock()
- mock_dp.tokenizer.pad_token_id = 0
- mock_dp.eval = MagicMock()
-
- with patch(
- "fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.DataProcessor"
- ) as mock_dp_class:
- mock_dp_class.return_value = mock_dp
- with patch(
- "fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.GenerationConfig"
- ) as mock_gen_config:
- mock_gen_config.from_pretrained.return_value = MagicMock()
- with patch("paddleformers.cli.utils.llm_utils.get_eos_token_id") as mock_get_eos:
- mock_get_eos.return_value = [1, 2]
-
- # Test normal initialization
- mock_reasoning_parser_class = MagicMock()
- processor = Ernie4_5_VLProcessor(
- "model_path",
- limit_mm_per_prompt={"image": 2, "video": 1},
- mm_processor_kwargs={"spatial_conv_size": 14},
- reasoning_parser_obj=lambda tokenizer: mock_reasoning_parser_class,
- tool_parser_obj=MagicMock(),
- enable_processor_cache=True,
- )
-
- self.assertEqual(processor.image_patch_id, 1001)
- self.assertEqual(processor.spatial_conv_size, 14)
- self.assertIsNotNone(processor.tokenizer)
- self.assertIsNotNone(processor.generation_config)
- self.assertEqual(processor.eos_token_ids, [1, 2])
- self.assertEqual(processor.limit_mm_per_prompt["image"], 2)
- self.assertEqual(processor.limit_mm_per_prompt["video"], 1)
- mock_dp.eval.assert_called_once()
-
- # Test with generation config exception
- mock_gen_config.from_pretrained.side_effect = Exception("Config not found")
- processor2 = Ernie4_5_VLProcessor("model_path")
- self.assertIsNone(processor2.generation_config)
-
- # Test with reasoning_parser_obj
- mock_reasoning_parser = MagicMock()
- processor3 = Ernie4_5_VLProcessor(
- "model_path", reasoning_parser_obj=lambda tokenizer: mock_reasoning_parser
- )
- self.assertIsNotNone(processor3.reasoning_parser)
-
- def test_parse_processor_kwargs(self):
- """Test _parse_processor_kwargs with various inputs"""
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
- processor = Ernie4_5_VLProcessor("model_path")
- processor._parse_processor_kwargs = Ernie4_5_VLProcessor._parse_processor_kwargs.__get__(
- processor, Ernie4_5_VLProcessor
- )
-
- # Test with valid kwargs
- valid_kwargs = {
- "spatial_conv_size": 14,
- "temporal_conv_size": 2,
- "image_min_pixels": 1000,
- "image_max_pixels": 10000,
- }
- result = processor._parse_processor_kwargs(valid_kwargs)
- self.assertEqual(result, valid_kwargs)
-
- # Test with invalid type (implementation catches exception and returns empty dict)
- invalid_kwargs = {"spatial_conv_size": "invalid"} # Should be int
- result = Ernie4_5_VLProcessor._parse_processor_kwargs(processor, invalid_kwargs)
- self.assertEqual(result, {})
-
- # Test with non-dict input (implementation catches exception and returns empty dict)
- result = Ernie4_5_VLProcessor._parse_processor_kwargs(processor, "not a dict")
- self.assertEqual(result, {})
-
- # Test exception handling with None
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.data_processor_logger"):
- result = processor._parse_processor_kwargs(None)
- self.assertEqual(result, {})
-
- def test_parse_limits(self):
- """Test _parse_limits with various inputs"""
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
- processor = Ernie4_5_VLProcessor("model_path")
- processor._parse_limits = Ernie4_5_VLProcessor._parse_limits.__get__(processor, Ernie4_5_VLProcessor)
-
- # Test with valid limits
- valid_limits = {"image": 5, "video": 3}
- result = processor._parse_limits(valid_limits)
- self.assertEqual(result["image"], 5)
- self.assertEqual(result["video"], 3)
- self.assertEqual(result["audio"], 1) # Default value
-
- # Test with empty input (None)
- result = processor._parse_limits(None)
- self.assertEqual(result["image"], 1)
- self.assertEqual(result["video"], 1)
- self.assertEqual(result["audio"], 1)
-
- # Test with invalid type (implementation catches exception and returns default limits)
- result = Ernie4_5_VLProcessor._parse_limits(processor, "not a dict")
- self.assertEqual(result["image"], 1)
- self.assertEqual(result["video"], 1)
- self.assertEqual(result["audio"], 1)
-
- def test_check_mm_limits(self):
- """Test _check_mm_limits with various inputs"""
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
- processor = Ernie4_5_VLProcessor("model_path")
- processor._check_mm_limits = Ernie4_5_VLProcessor._check_mm_limits.__get__(processor, Ernie4_5_VLProcessor)
-
- # Test with dict input (should not raise)
- processor.limit_mm_per_prompt = {"image": 2, "video": 1}
- mm_data = {"image": [1, 2], "video": [1]}
- processor._check_mm_limits(mm_data)
-
- # Test with messages input (should not raise)
- messages = [
- {"role": "user", "content": [{"type": "image", "data": "img1"}]},
- {"role": "user", "content": [{"type": "video", "data": "vid1"}]},
- ]
- processor._check_mm_limits(messages)
-
- # Test when limit is exceeded (should raise ValueError)
- processor.limit_mm_per_prompt = {"image": 1, "video": 1}
- mm_data = {"image": [1, 2, 3], "video": []} # 3 images, limit is 1
- with self.assertRaises(ValueError) as context:
- processor._check_mm_limits(mm_data)
- self.assertIn("Too many image items", str(context.exception))
-
- def test_process_request_dict(self):
- """Test process_request_dict method"""
- # from fastdeploy.engine.request import Request
-
- # Mock the process_request_dict method
- self.processor.process_request_dict = MagicMock()
-
- # Create a mock Request object
- mock_request = MagicMock(spec=Request)
- mock_request.to_dict.return_value = {"messages": [{"role": "user", "content": "Hello"}]}
-
- # Mock Request.from_dict to return a mock request
- with patch.object(Request, "from_dict") as mock_from_dict:
- mock_result_request = MagicMock(spec=Request)
- mock_from_dict.return_value = mock_result_request
-
- self.processor.process_request_dict(mock_request, max_model_len=100, chat_template_kwargs={"key": "value"})
-
- # Verify process_request_dict was called
- self.processor.process_request_dict.assert_called_once()
-
- def test_get_pad_id(self):
- """Test get_pad_id method"""
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
- processor = Ernie4_5_VLProcessor("model_path")
- processor.tokenizer = MagicMock()
- processor.tokenizer.pad_token_id = 100
- processor.get_pad_id = Ernie4_5_VLProcessor.get_pad_id.__get__(processor, Ernie4_5_VLProcessor)
-
- result = processor.get_pad_id()
- self.assertEqual(result, 100)
-
- def test_load_tokenizer(self):
- """Test _load_tokenizer method"""
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
- processor = Ernie4_5_VLProcessor("model_path")
- mock_tokenizer = MagicMock()
- processor.ernie4_5_processor = MagicMock()
- processor.ernie4_5_processor.tokenizer = mock_tokenizer
- processor._load_tokenizer = Ernie4_5_VLProcessor._load_tokenizer.__get__(processor, Ernie4_5_VLProcessor)
-
- processor._load_tokenizer()
- self.assertEqual(processor.tokenizer, mock_tokenizer)
-
- def test_append_completion_tokens(self):
- """Test append_completion_tokens method"""
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
- processor = Ernie4_5_VLProcessor("model_path")
- processor.append_completion_tokens = Ernie4_5_VLProcessor.append_completion_tokens.__get__(
- processor, Ernie4_5_VLProcessor
- )
-
- multimodal_inputs = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
- "cur_position": 3,
- }
- completion_token_ids = [10, 11, 12]
-
- processor.append_completion_tokens(multimodal_inputs, completion_token_ids)
-
- self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 10, 11, 12])
- self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0, 0])
- self.assertEqual(len(multimodal_inputs["position_ids"]), 6)
- self.assertEqual(multimodal_inputs["cur_position"], 6)
-
- def test_pack_outputs(self):
- """Test pack_outputs with and without images"""
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
- processor = Ernie4_5_VLProcessor("model_path")
- processor.image_patch_id = 1001
- processor.ernie4_5_processor = SimpleNamespace(mm_num_tokens=lambda **kwargs: 123)
- processor.pack_outputs = Ernie4_5_VLProcessor.pack_outputs.__get__(processor, Ernie4_5_VLProcessor)
- # Test with images
- outs_with_images = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
- "images": [np.array([[1, 2], [3, 4]])],
- "grid_thw": [np.array([[1, 2, 2]])],
- "image_type_ids": [0],
- }
-
- result = processor.pack_outputs(outs_with_images)
- self.assertIsNotNone(result["images"])
- self.assertIsNotNone(result["grid_thw"])
- self.assertIsNotNone(result["image_type_ids"])
- self.assertEqual(result["image_patch_id"], 1001)
- self.assertIsInstance(result["input_ids"], np.ndarray)
- self.assertIsInstance(result["token_type_ids"], np.ndarray)
- self.assertIsInstance(result["position_ids"], np.ndarray)
-
- # Test without images
- outs_without_images = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- }
-
- result = processor.pack_outputs(outs_without_images)
- self.assertIsNone(result["images"])
- self.assertIsNone(result["grid_thw"])
- self.assertIsNone(result["image_type_ids"])
-
- def test_process_response_dict(self):
- """Test process_response_dict with different parameters"""
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
- processor = Ernie4_5_VLProcessor("model_path")
- processor.process_response_dict = Ernie4_5_VLProcessor.process_response_dict.__get__(
- processor, Ernie4_5_VLProcessor
- )
-
- response = RequestOutput(
- request_id="test_0",
- outputs=CompletionOutput(text="response", index=0, send_idx=0, token_ids=[1, 2, 3]),
- )
- # Test with stream=True
- processor.process_response_obj_streaming = MagicMock(return_value=response)
- response_obj = RequestOutput(request_id="test_0")
- result = processor.process_response_dict(response_obj, stream=True)
- processor.process_response_obj_streaming.assert_called_once()
- self.assertEqual(result, response)
-
- # Test with stream=False
- processor.process_response_obj_normal = MagicMock(return_value=response)
- response_obj = RequestOutput(request_id="test_0")
- result = processor.process_response_dict(response_obj, stream=False)
- processor.process_response_obj_normal.assert_called_once()
- self.assertEqual(result, response)
-
- def test_apply_default_parameters(self):
- """Test _apply_default_parameters with dict and object request"""
- with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None):
- processor = Ernie4_5_VLProcessor("model_path")
- processor.generation_config = MagicMock()
- processor.generation_config.top_p = 0.8
- processor.generation_config.temperature = 0.9
- processor._apply_default_parameters = Ernie4_5_VLProcessor._apply_default_parameters.__get__(
- processor, Ernie4_5_VLProcessor
- )
-
- # Test with dict request
- request = Request(request_id="test_0")
- request.sampling_params = SamplingParams()
- result = processor._apply_default_parameters(request)
- self.assertEqual(result.sampling_params.top_p, 0.8)
- self.assertEqual(result.sampling_params.temperature, 0.9)
-
- # Test with object request
- class MockRequest:
- def __init__(self):
- self.sampling_params = SamplingParams()
- self.sampling_params.top_p = None
- self.sampling_params.temperature = None
-
- def get(self, key):
- return getattr(self.sampling_params, key, None)
-
- def set(self, key, value):
- setattr(self.sampling_params, key, value)
-
- request = MockRequest()
- result = processor._apply_default_parameters(request)
- self.assertEqual(result.sampling_params.top_p, 0.8)
-
-
-class TestDataProcessorTargetMethods(unittest.TestCase):
- def setUp(self):
- self.mock_tokenizer = MagicMock(spec=Ernie4_5Tokenizer)
- self.mock_tokenizer.ignored_index = -100
- self.mock_tokenizer.convert_tokens_to_ids.side_effect = self._mock_convert_tokens_to_ids
- self.mock_tokenizer.chat_template = "mock_template"
- self.mock_tokenizer.apply_chat_template.return_value = "User: Hello<|image@placeholder|>"
- # Mock encode method for _add_text
- self.mock_tokenizer.encode = MagicMock(return_value={"input_ids": [1, 2, 3]})
-
- def mock_load_tokenizer(dp_instance):
- dp_instance.tokenizer = self.mock_tokenizer
-
- with patch.object(DataProcessor, "_load_tokenizer", side_effect=mock_load_tokenizer, autospec=True):
- with patch.object(AdaptiveImageProcessor, "from_pretrained") as mock_image_preprocessor:
- mock_image_preprocessor.return_value = MagicMock()
- self.data_processor = DataProcessor(
- tokenizer_name="mock_tokenizer",
- image_preprocessor_name="mock_image_preprocessor",
- enable_processor_cache=False,
- )
- self.data_processor.image_patch_id = 1001
- self.data_processor.image_start_id = 1002
- self.data_processor.image_end_id = 1003
- self.data_processor.video_start_id = 1004
- self.data_processor.video_end_id = 1005
- self.data_processor.role_prefixes = {"user": "User: ", "assistant": "Assistant: "}
- self.data_processor.enable_processor_cache = False
- # Note: extract_mm_items is not mocked by default, only when needed
- self.data_processor.extract_mm_items = MagicMock(return_value=([], [], [], [], None, [], []))
-
- def _restore_real_extract_mm_items(self):
- """Helper method to restore real extract_mm_items method for testing"""
- from fastdeploy.input.v1.ernie4_5_vl_processor.process import DataProcessor
-
- original_extract_mm_items = DataProcessor.extract_mm_items
- self.data_processor.extract_mm_items = original_extract_mm_items.__get__(self.data_processor, DataProcessor)
-
- def _mock_convert_tokens_to_ids(self, token):
- token_id_map = {
- "<|begin_of_sentence|>": 101,
- "<|end_of_sentence|>": 102,
- "": 103,
- "<|IMAGE_PLACEHOLDER|>": 1001,
- "<|IMAGE_START|>": 1002,
- "<|IMAGE_END|>": 1003,
- "<|VIDEO_START|>": 1004,
- "<|VIDEO_END|>": 1005,
- }
- return token_id_map.get(token, 999)
-
- def test_prompt_token_ids2outputs_only_prompt_token_ids(self):
- test_prompt_token_ids = [101, 999, 998, 997, 102]
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- }
- request = Request.from_dict(request)
-
- outputs = self.data_processor.prompt_token_ids2outputs(request)
-
- prompt_len = len(test_prompt_token_ids)
-
- self.assertEqual(
- outputs["input_ids"],
- test_prompt_token_ids,
- f"input_ids mismatch: actual {outputs['input_ids']}, expected {test_prompt_token_ids}",
- )
-
- self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len)
-
- expected_position_ids = [[i] * 3 for i in range(prompt_len)]
- self.assertEqual(outputs["position_ids"], expected_position_ids)
-
- self.assertEqual(outputs["cur_position"], prompt_len)
-
- self.assertEqual(len(outputs["images"]), 0)
- self.assertEqual(len(outputs["grid_thw"]), 0)
- self.assertEqual(len(outputs["mm_positions"]), 0)
- self.assertEqual(len(outputs["mm_hashes"]), 0)
- self.assertEqual(outputs["video_cnt"], 0)
- self.assertEqual(outputs["num_input_image_tokens"], 0)
- self.assertEqual(outputs["num_input_video_tokens"], 0)
-
- def test_prompt_token_ids2outputs_with_messages_no_mm(self):
- test_prompt_token_ids = [101, 999, 998, 997, 102]
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- "messages": [{"role": "user", "content": "Hello World"}],
- }
- request = Request.from_dict(request)
-
- self.data_processor.extract_mm_items.return_value = ([], [], [], [], None, [], [])
-
- outputs = self.data_processor.prompt_token_ids2outputs(request)
-
- prompt_len = len(test_prompt_token_ids)
-
- self.assertEqual(outputs["input_ids"], test_prompt_token_ids)
-
- self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len)
-
- expected_position_ids = [[i] * 3 for i in range(prompt_len)]
- self.assertEqual(outputs["position_ids"], expected_position_ids)
-
- self.assertEqual(outputs["cur_position"], prompt_len)
-
- self.assertEqual(len(outputs["images"]), 0)
- self.assertEqual(outputs["video_cnt"], 0)
- self.assertEqual(outputs["num_input_image_tokens"], 0)
-
- def test_prompt_token_ids2outputs_add_image(self):
- test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
- mock_img = MagicMock()
- mock_img.height = 224
- mock_img.width = 224
- mock_img.convert.return_value = mock_img
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- "messages": [
- {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]}
- ],
- }
- request = Request.from_dict(request)
- self.data_processor.extract_mm_items.return_value = (
- [mock_img],
- [],
- ["img_uuid"],
- [],
- None,
- [],
- [{"type": "image", "data": mock_img}],
- )
- mock_resize = (None, (2, 4))
- self.data_processor.image_preprocessor.get_smarted_resize.return_value = mock_resize
- mock_preprocess = {"pixel_values": np.random.randn(1, 16, 16, 3), "image_grid_thw": np.array([[2, 4]])}
- self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
- # self.data_processor._compute_3d_positions = MagicMock(return_value=[[i]*3 for i in range(4)])
- outputs = self.data_processor.prompt_token_ids2outputs(request)
- self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102])
- self.assertEqual(
- outputs["token_type_ids"],
- [
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["image"],
- IDS_TYPE_FLAG["image"],
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["text"],
- ],
- )
- self.assertEqual(len(outputs["position_ids"]), 6)
- self.assertEqual(outputs["cur_position"], 6)
- self.assertEqual(len(outputs["images"]), 1)
- self.assertIsNotNone(outputs["images"][0])
- self.assertEqual(outputs["num_input_image_tokens"], 2)
- self.assertEqual(len(outputs["mm_positions"]), 1)
- self.assertEqual(len(outputs["mm_hashes"]), 1)
- self.assertEqual(len(outputs["grid_thw"]), 1)
- self.assertEqual(len(outputs["image_type_ids"]), 1)
-
- def test_prompt_token_ids2outputs_add_processed_image(self):
- test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
- mock_img_data = np.random.randn(8, 28, 28)
- mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)})
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- "messages": [
- {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]}
- ],
- }
- request = Request.from_dict(request)
- self.data_processor.extract_mm_items.return_value = (
- [mock_img_cache],
- [],
- ["img_uuid"],
- [],
- None,
- [],
- [{"type": "image", "data": mock_img_cache}],
- )
- outputs = self.data_processor.prompt_token_ids2outputs(request)
- self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102])
- self.assertEqual(
- outputs["token_type_ids"],
- [
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["image"],
- IDS_TYPE_FLAG["image"],
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["text"],
- ],
- )
- self.assertEqual(len(outputs["position_ids"]), 20)
- self.assertEqual(outputs["cur_position"], 8)
- self.assertEqual(len(outputs["images"]), 1)
- self.assertIsNotNone(outputs["images"][0])
- self.assertEqual(len(outputs["mm_positions"]), 1)
- self.assertEqual(outputs["mm_hashes"][0], "img_uuid")
- self.assertEqual(len(outputs["grid_thw"]), 1)
- self.assertEqual(len(outputs["image_type_ids"]), 1)
-
- def test_prompt_token_ids2outputs_add_video(self):
- test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]
- mock_frame1 = MagicMock()
- mock_frame1.height = 224
- mock_frame1.width = 224
- mock_frame1.convert.return_value = mock_frame1
- mock_frame2 = MagicMock()
- mock_frame2.height = 224
- mock_frame2.width = 224
- mock_frame2.convert.return_value = mock_frame2
- frames = [mock_frame1, mock_frame2]
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- "messages": [
- {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]}
- ],
- }
- request = Request.from_dict(request)
- self.data_processor.extract_mm_items.return_value = (
- [],
- [frames],
- [],
- ["vid_uuid"],
- None,
- [],
- [{"type": "video", "data": frames}],
- )
- self.data_processor._load_and_process_video = MagicMock(return_value=frames)
- patches_h, patches_w = 4, 4
- self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
- mock_preprocess = {
- "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3),
- "video_grid_thw": np.array([[patches_h, patches_w]] * 2),
- }
- self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
- outputs = self.data_processor.prompt_token_ids2outputs(request)
- self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102])
- self.assertEqual(
- outputs["token_type_ids"],
- [
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["video"],
- IDS_TYPE_FLAG["video"],
- IDS_TYPE_FLAG["video"],
- IDS_TYPE_FLAG["video"],
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["text"],
- ],
- )
- self.assertEqual(len(outputs["position_ids"]), 8)
- self.assertEqual(outputs["cur_position"], 6)
- self.assertEqual(len(outputs["images"]), 1)
- self.assertIsNotNone(outputs["images"][0])
- self.assertEqual(len(outputs["mm_positions"]), 1)
- self.assertEqual(outputs["mm_hashes"][0], "vid_uuid")
- self.assertEqual(len(outputs["grid_thw"]), 1)
- self.assertEqual(len(outputs["image_type_ids"]), 2)
- self.assertEqual(outputs["num_input_video_tokens"], 4)
-
- def test_prompt_token_ids2outputs_add_processed_video(self):
- test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]
- t, h, w = 2, 4, 4
- spatial_conv_size = self.data_processor.spatial_conv_size
- temporal_conv_size = self.data_processor.temporal_conv_size
- token_per_frame = (h // spatial_conv_size) * (w // spatial_conv_size)
- num_tokens = (t // temporal_conv_size) * token_per_frame
- mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28)
- mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)})
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- "messages": [
- {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]}
- ],
- }
- request = Request.from_dict(request)
- self.data_processor.extract_mm_items.return_value = (
- [],
- [mock_frames_cache],
- [],
- ["vid_uuid"],
- None,
- [],
- [{"type": "video", "data": mock_frames_cache}],
- )
- outputs = self.data_processor.prompt_token_ids2outputs(request)
- self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102])
- self.assertEqual(
- outputs["token_type_ids"],
- [
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["video"],
- IDS_TYPE_FLAG["video"],
- IDS_TYPE_FLAG["video"],
- IDS_TYPE_FLAG["video"],
- IDS_TYPE_FLAG["text"],
- IDS_TYPE_FLAG["text"],
- ],
- )
- self.assertEqual(len(outputs["position_ids"]), 8)
- self.assertEqual(outputs["cur_position"], 6)
- self.assertEqual(len(outputs["images"]), 1)
- self.assertIsNotNone(outputs["images"][0])
- self.assertEqual(len(outputs["mm_positions"]), 1)
- self.assertEqual(outputs["mm_hashes"][0], "vid_uuid")
- self.assertEqual(len(outputs["grid_thw"]), 1)
- self.assertEqual(len(outputs["image_type_ids"]), 2)
-
- def test_prompt_token_ids2outputs_add_image_token_len_mismatch(self):
- test_prompt_token_ids = [101, 1002, 1001, 1001, 1001, 1003, 102]
- mock_img = MagicMock()
- mock_img.height = 224
- mock_img.width = 224
- mock_img.convert.return_value = mock_img
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- "messages": [
- {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]}
- ],
- }
- request = Request.from_dict(request)
- self.data_processor.extract_mm_items.return_value = (
- [mock_img],
- [],
- ["img_uuid"],
- [],
- None,
- [],
- [{"type": "image", "data": mock_img}],
- )
- patches_h, patches_w = 8, 8
- self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
- mock_preprocess = {
- "pixel_values": np.random.randn(1, patches_h, patches_w, 3),
- "image_grid_thw": np.array([[patches_h, patches_w]]),
- }
- self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
- with self.assertRaises(ValueError) as ctx:
- self.data_processor.prompt_token_ids2outputs(request)
- self.assertIn("image tokens num not match the size", str(ctx.exception))
-
- def test_prompt_token_ids2outputs_add_processed_image_token_len_mismatch(self):
- test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102]
- spatial_conv_size = self.data_processor.spatial_conv_size
- num_tokens = 4
- mock_img_data = np.random.randn(num_tokens * (spatial_conv_size**2), 28, 28)
- mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)})
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- "messages": [
- {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]}
- ],
- }
- request = Request.from_dict(request)
- self.data_processor.extract_mm_items.return_value = (
- [mock_img_cache],
- [],
- ["img_uuid"],
- [],
- None,
- [],
- [{"type": "image", "data": mock_img_cache}],
- )
- with self.assertRaises(ValueError) as ctx:
- self.data_processor.prompt_token_ids2outputs(request)
- self.assertIn("image tokens num not match the size", str(ctx.exception))
-
- def test_prompt_token_ids2outputs_add_video_token_len_mismatch(self):
- test_prompt_token_ids = [101, 1004, 1001, 1001, 1005, 102]
- mock_frame1 = MagicMock()
- mock_frame1.height = 224
- mock_frame1.width = 224
- mock_frame1.convert.return_value = mock_frame1
- mock_frame2 = MagicMock()
- mock_frame2.height = 224
- mock_frame2.width = 224
- mock_frame2.convert.return_value = mock_frame2
- frames = [mock_frame1, mock_frame2]
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- "messages": [
- {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]}
- ],
- }
- request = Request.from_dict(request)
- self.data_processor.extract_mm_items.return_value = (
- [],
- [frames],
- [],
- ["vid_uuid"],
- None,
- [],
- [{"type": "video", "data": frames}],
- )
- self.data_processor._load_and_process_video = MagicMock(return_value=frames)
- patches_h, patches_w = 8, 8
- self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w))
- mock_preprocess = {
- "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3),
- "video_grid_thw": np.array([[patches_h, patches_w]] * 2),
- }
- self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess
- with self.assertRaises(ValueError) as ctx:
- self.data_processor.prompt_token_ids2outputs(request)
- self.assertIn("video tokens num not match the size", str(ctx.exception))
-
- def test_prompt_token_ids2outputs_add_processed_video_token_len_mismatch(self):
- test_prompt_token_ids = [101, 1004, 1001, 1005, 102]
- t, h, w = 2, 8, 8
- spatial_conv_size = self.data_processor.spatial_conv_size
- temporal_conv_size = self.data_processor.temporal_conv_size
-
- num_tokens = 4
- mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28)
- mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)})
- request = {
- "request_id": "test_0",
- "prompt_token_ids": test_prompt_token_ids,
- "messages": [
- {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]}
- ],
- }
- request = Request.from_dict(request)
- self.data_processor.extract_mm_items.return_value = (
- [],
- [mock_frames_cache],
- [],
- ["vid_uuid"],
- None,
- [],
- [{"type": "video", "data": mock_frames_cache}],
- )
- with self.assertRaises(ValueError) as ctx:
- self.data_processor.prompt_token_ids2outputs(request)
- self.assertIn("video tokens num not match the size", str(ctx.exception))
-
- def test_extract_mm_items(self):
- """Test extract_mm_items with various scenarios: basic items, video, and missing data error"""
- self._restore_real_extract_mm_items()
-
- # Test basic multimodal items (image + video)
- request = {
- "request_id": "test_0",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": "Hello"},
- {"type": "image", "data": Image.new("RGB", (224, 224)), "uuid": "img1"},
- {"type": "video", "data": [Image.new("RGB", (224, 224))], "uuid": "vid1"},
- ],
- }
- ],
- }
- request = Request.from_dict(request)
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
- mock_parse.return_value = request.messages
- images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = (
- self.data_processor.extract_mm_items(request)
- )
- self.assertEqual(len(images), 1)
- self.assertEqual(len(videos), 1)
- self.assertEqual(image_uuid[0], "img1")
- self.assertEqual(video_uuid[0], "vid1")
- self.assertEqual(len(mm_items), 2)
-
- # Test missing data error when cache is disabled
- self.data_processor.enable_processor_cache = False
- request = {
- "request_id": "test_0",
- "messages": [{"role": "user", "content": [{"type": "image", "uuid": "img1"}]}],
- }
- request = Request.from_dict(request)
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
- mock_parse.return_value = request.messages
- with self.assertRaises(ValueError) as ctx:
- self.data_processor.extract_mm_items(request)
- self.assertIn("Missing items cannot be retrieved", str(ctx.exception))
-
-
-class TestDataProcessor(unittest.TestCase):
- def setUp(self):
- """Set up test environment"""
- self.mock_tokenizer = MagicMock()
-
- def mock_convert_tokens_to_ids(x):
- if isinstance(x, list):
- return [hash(str(token)) % 10000 for token in x]
- return hash(str(x)) % 10000
-
- self.mock_tokenizer.convert_tokens_to_ids = MagicMock(side_effect=mock_convert_tokens_to_ids)
- self.mock_tokenizer.encode = MagicMock(return_value={"input_ids": [1, 2, 3]})
- self.mock_tokenizer.decode = MagicMock(return_value="decoded_text")
- self.mock_tokenizer.tokenize = MagicMock(return_value=["token1", "token2"])
- self.mock_tokenizer.ignored_index = -100
- self.mock_tokenizer.chat_template = MagicMock()
- self.mock_tokenizer.apply_chat_template = MagicMock(return_value="formatted_prompt")
-
- self.mock_image_preprocessor = MagicMock()
- self.mock_image_preprocessor.get_smarted_resize = MagicMock(return_value=((224, 224), (16, 16)))
- self.mock_image_preprocessor.preprocess = MagicMock(
- return_value={
- "pixel_values": np.random.rand(256, 3 * 14 * 14).astype(np.float32),
- "image_grid_thw": np.array([[1, 16, 16]]),
- }
- )
- self.mock_image_preprocessor.from_pretrained = MagicMock(return_value=self.mock_image_preprocessor)
-
- with patch(
- "fastdeploy.input.v1.ernie4_5_vl_processor.process.AdaptiveImageProcessor",
- self.mock_image_preprocessor,
- ):
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.Ernie4_5Tokenizer") as mock_tokenizer_class:
- mock_tokenizer_class.from_pretrained = MagicMock(return_value=self.mock_tokenizer)
- mock_tokenizer_class.resource_files_names = {"vocab_file": "tokenizer.model"}
- with patch("os.path.exists", return_value=True):
- self.processor = DataProcessor(
- tokenizer_name="test_model",
- image_preprocessor_name="test_model",
- )
-
- def _create_outputs(self):
- """Helper to create outputs dict"""
- return {
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "mm_positions": [],
- "mm_hashes": [],
- "cur_position": 0,
- "num_input_image_tokens": 0,
- "num_input_video_tokens": 0,
- }
-
- def _mock_video_processing(self, mock_frames=None):
- """Helper to mock video processing"""
- if mock_frames is None:
- mock_frames = [Image.new("RGB", (224, 224)) for _ in range(4)]
- mock_read = patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.read_video_decord")
- mock_frames_read = patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.read_frames_decord")
- mock_render = patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.render_frame_timestamp")
- return mock_read, mock_frames_read, mock_render, mock_frames
-
- def _setup_video_mocks(self, mock_read, mock_frames_read, mock_render, mock_frames):
- """Setup video processing mocks"""
- mock_read.return_value = (None, {"duration": 2.0}, "test_path")
- mock_frames_read.return_value = (
- [np.array(f) for f in mock_frames],
- None,
- [0.0, 0.5, 1.0, 1.5] if len(mock_frames) == 4 else [float(i) * 0.5 for i in range(len(mock_frames))],
- )
- mock_render.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img)
- self.mock_image_preprocessor.preprocess.return_value = {
- "pixel_values_videos": np.random.rand(len(mock_frames), 256, 3 * 14 * 14).astype(np.float32),
- "video_grid_thw": np.array([[len(mock_frames), 16, 16]]),
- }
-
- def test_train_and_eval(self):
- """Test training and evaluation mode switching"""
- self.assertTrue(self.processor.is_training)
- self.processor.eval()
- self.assertFalse(self.processor.is_training)
- self.processor.train()
- self.assertTrue(self.processor.is_training)
-
- def test_build_token_type_mapping(self):
- """Test token type mapping construction"""
- mapping = self.processor._build_token_type_mapping()
- for token in [
- self.processor.IMG_START,
- self.processor.IMG_END,
- self.processor.VID_START,
- self.processor.VID_END,
- ]:
- self.assertEqual(mapping[token], IDS_TYPE_FLAG["image"])
- self.assertEqual(mapping[self.processor.image_patch_id], IDS_TYPE_FLAG["image"])
-
- def test_add_text_and_special_token(self):
- """Test adding text and special tokens"""
- outputs = self._create_outputs()
- self.processor._add_text("hello", outputs)
- self.assertEqual(len(outputs["input_ids"]), 3)
- self.assertEqual(outputs["cur_position"], 3)
-
- outputs2 = self._create_outputs()
- self.processor._add_text([1, 2, 3, 4, 5], outputs2)
- self.assertEqual(len(outputs2["input_ids"]), 5)
-
- outputs3 = self._create_outputs()
- self.processor._add_special_token("<|begin_of_sentence|>", outputs3)
- self.processor._add_special_token(12345, outputs3)
- self.assertEqual(len(outputs3["input_ids"]), 2)
-
- def test_compute_3d_positions(self):
- """Test 3D position computation"""
- pos_ids = self.processor._compute_3d_positions(t=2, h=16, w=16, start_idx=10)
- self.assertIsInstance(pos_ids, list)
- self.assertGreater(len(pos_ids), 0)
- self.assertEqual(len(pos_ids[0]), 3)
-
- pos_ids2 = self.processor._compute_3d_positions(t=1, h=16, w=16, start_idx=0)
- expected_len = 1 * (16 // self.processor.spatial_conv_size) ** 2
- self.assertEqual(len(pos_ids2), expected_len)
-
- def test_set_video_frame_args_comprehensive(self):
- """Test _set_video_frame_args with various scenarios"""
- # Valid cases
- result = self.processor._set_video_frame_args(
- {
- "target_frames": 32,
- "fps": -1,
- "min_frames": 16,
- "max_frames": 64,
- "frames_sample": "leading",
- },
- {"duration": 10.0},
- )
- self.assertEqual(result["target_frames"], 32)
-
- result = self.processor._set_video_frame_args(
- {
- "target_frames": -1,
- "fps": 2,
- "min_frames": 16,
- "max_frames": 64,
- "frames_sample": "leading",
- },
- {"duration": 10.0},
- )
- self.assertIsNotNone(result)
-
- # Error cases
- with self.assertRaises(ValueError):
- self.processor._set_video_frame_args(
- {
- "target_frames": -1,
- "fps": -1,
- "min_frames": 16,
- "max_frames": 64,
- "frames_sample": "leading",
- },
- {"duration": 10.0},
- )
- with self.assertRaises(ValueError):
- self.processor._set_video_frame_args(
- {
- "target_frames": 10,
- "fps": 2,
- "min_frames": 1,
- "max_frames": 100,
- "frames_sample": "leading",
- },
- {"duration": 10.0},
- )
- with self.assertRaises(ValueError):
- self.processor._set_video_frame_args(
- {
- "target_frames": 5,
- "fps": -1,
- "min_frames": 10,
- "max_frames": 100,
- "frames_sample": "leading",
- },
- {"duration": 10.0},
- )
- with self.assertRaises(ValueError):
- self.processor._set_video_frame_args(
- {
- "target_frames": 200,
- "fps": -1,
- "min_frames": 1,
- "max_frames": 100,
- "frames_sample": "leading",
- },
- {"duration": 10.0},
- )
- with self.assertRaises(ValueError):
- self.processor._set_video_frame_args(
- {
- "target_frames": -1,
- "fps": 2,
- "min_frames": 100,
- "max_frames": 10,
- "frames_sample": "leading",
- },
- {"duration": 10.0},
- )
-
- # Adjustment cases
- result = self.processor._set_video_frame_args(
- {
- "target_frames": -1,
- "fps": 1,
- "min_frames": 10,
- "max_frames": 100,
- "frames_sample": "leading",
- },
- {"duration": 1.0},
- )
- self.assertEqual(result["target_frames"], 10)
- self.assertEqual(result["fps"], -1)
-
- result = self.processor._set_video_frame_args(
- {
- "target_frames": -1,
- "fps": 10,
- "min_frames": 1,
- "max_frames": 100,
- "frames_sample": "leading",
- },
- {"duration": 100.0},
- )
- self.assertEqual(result["target_frames"], 100)
- self.assertEqual(result["fps"], -1)
-
- def test_text2ids_comprehensive(self):
- """Test text2ids with various scenarios"""
- # Text only
- outputs = self.processor.text2ids("Hello world")
- self.assertIn("input_ids", outputs)
- self.assertEqual(len(outputs["images"]), 0)
-
- # Empty text
- outputs = self.processor.text2ids("")
- self.assertEqual(len(outputs["input_ids"]), 0)
-
- # With image placeholder
- mock_image = Image.new("RGB", (224, 224))
- outputs = self.processor.text2ids("Hello <|image@placeholder|> world", images=[mock_image])
- self.assertGreater(len(outputs["input_ids"]), 0)
- self.assertGreater(len(outputs["images"]), 0)
-
- # With cached image
- cached_image = (
- np.random.rand(256, 3 * 14 * 14).astype(np.float32),
- {"thw": (1, 16, 16)},
- )
- outputs = self.processor.text2ids(
- "Hello <|image@placeholder|> world",
- images=[cached_image],
- image_uuid=["uuid"],
- )
- self.assertGreater(len(outputs["input_ids"]), 0)
-
- # Multiple images
- outputs = self.processor.text2ids(
- "Hello <|image@placeholder|> world <|image@placeholder|> end",
- images=[mock_image, mock_image],
- )
- self.assertEqual(len(outputs["images"]), 2)
-
- # With video placeholder
- mock_read, mock_frames_read, mock_render, mock_frames = self._mock_video_processing()
- with mock_read as mr, mock_frames_read as mfr, mock_render as mren:
- mr.return_value = (None, {"duration": 2.0}, "test_path")
- mfr.return_value = (
- [np.array(f) for f in mock_frames],
- None,
- [0.0, 0.5, 1.0, 1.5],
- )
- mren.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img)
- self.mock_image_preprocessor.preprocess.return_value = {
- "pixel_values_videos": np.random.rand(4, 256, 3 * 14 * 14).astype(np.float32),
- "video_grid_thw": np.array([[4, 16, 16]]),
- }
- outputs = self.processor.text2ids("Hello <|video@placeholder|> world", videos=["test_video.mp4"])
- self.assertGreater(len(outputs["input_ids"]), 0)
-
- # Cached video
- cached_video = (
- np.random.rand(256, 3 * 14 * 14).astype(np.float32),
- {"thw": (4, 16, 16)},
- )
- outputs = self.processor.text2ids(
- "Hello <|video@placeholder|> world",
- videos=[cached_video],
- video_uuid=["uuid"],
- )
- self.assertGreater(len(outputs["input_ids"]), 0)
-
- # Video dict format
- mock_read, mock_frames_read, mock_render, mock_frames = self._mock_video_processing()
- with mock_read as mr, mock_frames_read as mfr, mock_render as mren:
- mr.return_value = (None, {"duration": 2.0}, "test_path")
- mfr.return_value = (
- [np.array(f) for f in mock_frames],
- None,
- [0.0, 0.5, 1.0, 1.5],
- )
- mren.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img)
- self.mock_image_preprocessor.preprocess.return_value = {
- "pixel_values_videos": np.random.rand(4, 256, 3 * 14 * 14).astype(np.float32),
- "video_grid_thw": np.array([[4, 16, 16]]),
- }
- outputs = self.processor.text2ids(
- "Hello <|video@placeholder|> world",
- videos=[{"video": "test.mp4", "fps": 2}],
- )
- self.assertGreater(len(outputs["input_ids"]), 0)
-
- # Image and video together
- mock_read, mock_frames_read, mock_render, mock_frames = self._mock_video_processing()
- with mock_read as mr, mock_frames_read as mfr, mock_render as mren:
- mr.return_value = (None, {"duration": 2.0}, "test_path")
- mfr.return_value = (
- [np.array(f) for f in mock_frames],
- None,
- [0.0, 0.5, 1.0, 1.5],
- )
- mren.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img)
- self.mock_image_preprocessor.preprocess.side_effect = [
- {
- "pixel_values": np.random.rand(256, 3 * 14 * 14).astype(np.float32),
- "image_grid_thw": np.array([[1, 16, 16]]),
- },
- {
- "pixel_values_videos": np.random.rand(4, 256, 3 * 14 * 14).astype(np.float32),
- "video_grid_thw": np.array([[4, 16, 16]]),
- },
- ]
- outputs = self.processor.text2ids(
- "Hello <|image@placeholder|> world <|video@placeholder|> end",
- images=[mock_image],
- videos=["test_video.mp4"],
- )
- self.assertGreater(len(outputs["input_ids"]), 0)
- self.mock_image_preprocessor.preprocess.side_effect = None
-
- def test_request2ids_comprehensive(self):
- """Test request2ids with various scenarios"""
- self.processor.is_training = False
-
- # Basic request with multimodal content - covers both text and image branches in one call
- mock_image = Image.new("RGB", (224, 224))
- request = {
- "request_id": "test_0",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": "What's in this image?"},
- {"type": "image", "data": mock_image, "uuid": "img1"},
- ],
- }
- ],
- "add_generation_prompt": True,
- }
- request = Request.from_dict(request)
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
- mock_parse.return_value = request.messages
- outputs = self.processor.request2ids(request)
- self.assertIn("input_ids", outputs)
-
- # Error case: missing chat_template
- self.processor.tokenizer.chat_template = None
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
- mock_parse.return_value = [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}]
- with self.assertRaises(ValueError):
- self.processor.request2ids(request)
- self.processor.tokenizer.chat_template = MagicMock()
-
- # Error case: unsupported role
- request = {
- "request_id": "test_0",
- "messages": [{"role": "invalid_role", "content": "Hello"}],
- "add_generation_prompt": True,
- }
- request = Request.from_dict(request)
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
- mock_parse.return_value = [{"role": "invalid_role", "content": [{"type": "text", "text": "Hello"}]}]
- with self.assertRaises(AssertionError):
- self.processor.request2ids(request)
-
- # Error case: missing cache when cache is disabled
- self.processor.enable_processor_cache = False
- request = {
- "request_id": "test_0",
- "messages": [{"role": "user", "content": [{"type": "image", "uuid": "img1"}]}],
- }
- request = Request.from_dict(request)
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
- mock_parse.return_value = request.messages
- with self.assertRaises(ValueError):
- self.processor.request2ids(request)
-
- def test_extract_labels(self):
- """Test label extraction"""
- outputs = {"input_ids": [1, 2, 3, self.processor.sep_token_id, 4, 5], "labels": []}
- self.processor.is_training = True
- self.processor._extract_labels(outputs, ["target text"])
- self.assertEqual(len(outputs["labels"]), len(outputs["input_ids"]))
-
- # Multiple targets
- outputs2 = {
- "input_ids": [1, 2, 3, self.processor.sep_token_id, 4, 5, self.processor.sep_token_id, 6, 7],
- "labels": [],
- }
- self.processor._extract_labels(outputs2, ["target1", "target2"])
- self.assertEqual(len(outputs2["labels"]), len(outputs2["input_ids"]))
-
- # Error case
- outputs3 = {"input_ids": [1, 2, 3, self.processor.sep_token_id], "labels": []}
- with self.assertRaises(AssertionError):
- self.processor._extract_labels(outputs3, ["target1", "target2"])
-
- def test_fancy_print(self):
- """Test fancy_print function"""
- from fastdeploy.input.v1.ernie4_5_vl_processor.process import fancy_print
-
- test_cases = [
- ([1, 2, 3, self.processor.image_patch_id, 4, 5], self.processor.image_patch_id, None),
- (
- [
- 1,
- 2,
- self.processor.image_patch_id,
- self.processor.image_patch_id,
- self.processor.image_patch_id,
- 4,
- 5,
- ],
- self.processor.image_patch_id,
- "<|IMAGE@",
- ),
- ([1, 2, 3, 4, 5], self.processor.image_patch_id, None),
- ]
- for input_ids, image_patch_id, expected_contains in test_cases:
- result = fancy_print(input_ids, self.mock_tokenizer, image_patch_id)
- self.assertIsInstance(result, str)
- if expected_contains:
- self.assertIn(expected_contains, result)
-
- def test_processor_cache_operations(self):
- """Test processor cache get/update and request2ids with cache"""
- # Test get_processor_cache
- mock_socket = MagicMock()
- mock_socket.recv_multipart = MagicMock(return_value=(b"", b"pickled_data"))
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.pickle") as mock_pickle:
- mock_pickle.loads = MagicMock(return_value=[{"data": "cached_item"}])
- result = self.processor.get_processor_cache(mock_socket, ["hash1", "hash2"])
- self.assertEqual(len(result), 1)
-
- # Test update_processor_cache
- mock_socket2 = MagicMock()
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.pickle"):
- self.processor.update_processor_cache(
- mock_socket2,
- ["hash1"],
- [(np.array([1, 2, 3]), {"meta": "data"})],
- )
- mock_socket2.send_multipart.assert_called_once()
-
- # Test request2ids with processor cache update
- self.processor.is_training = False
- self.processor.enable_processor_cache = True
- mock_image = Image.new("RGB", (224, 224))
- request = {
- "request_id": "test_0",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": "Hello"},
- {"type": "image", "data": mock_image, "uuid": "img1"},
- ],
- }
- ],
- "add_generation_prompt": True,
- }
- request = Request.from_dict(request)
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.zmq") as mock_zmq:
- mock_context = MagicMock()
- mock_socket = MagicMock()
- mock_socket.recv_multipart = MagicMock(return_value=(b"", b"pickled_data"))
- mock_context.socket.return_value = mock_socket
- mock_zmq.Context.return_value = mock_context
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse:
- mock_parse.return_value = request.messages
- with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.pickle") as mock_pickle:
- mock_pickle.loads = MagicMock(return_value=[])
- with patch.object(self.processor, "text2ids") as mock_text2ids:
- mock_text2ids.return_value = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0] * 3,
- "position_ids": [[i] * 3 for i in range(3)],
- "images": [np.random.rand(256, 3 * 14 * 14).astype(np.float32)],
- "grid_thw": [np.array([[1, 16, 16]])],
- "image_type_ids": [0],
- "cur_position": 3,
- "video_cnt": 0,
- "num_input_image_tokens": 0,
- "num_input_video_tokens": 0,
- "mm_positions": [],
- "mm_hashes": ["hash1"],
- }
- with patch.object(self.processor, "update_processor_cache") as mock_update:
- self.processor.request2ids(request)
- mock_update.assert_called_once()
- self.processor.enable_processor_cache = False
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/input/v1/test_image_preprocessor_adaptive.py b/tests/input/v1/test_image_preprocessor_adaptive.py
deleted file mode 100644
index 5a15244d1fd..00000000000
--- a/tests/input/v1/test_image_preprocessor_adaptive.py
+++ /dev/null
@@ -1,499 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from unittest.mock import patch
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import (
- AdaptiveImageProcessor,
- ceil_by_factor,
- floor_by_factor,
- is_scaled_image,
- make_batched_images,
- make_batched_videos,
- round_by_factor,
- smart_resize,
-)
-
-
-class TestImagePreprocessorAdaptive(unittest.TestCase):
- def setUp(self):
- """Set up test environment"""
- self.processor = AdaptiveImageProcessor(
- min_pixels=56 * 56,
- max_pixels=28 * 28 * 1280,
- patch_size=14,
- temporal_conv_size=2,
- merge_size=2,
- )
-
- def test_init(self):
- """Test initialization"""
- self.assertEqual(self.processor.min_pixels, 56 * 56)
- self.assertEqual(self.processor.max_pixels, 28 * 28 * 1280)
- self.assertEqual(self.processor.patch_size, 14)
- self.assertEqual(self.processor.temporal_conv_size, 2)
- self.assertEqual(self.processor.merge_size, 2)
-
- def test_set_pixels(self):
- """Test setting pixels with valid and invalid values (lines 205-214)"""
- # Test setting only min_pixels
- self.processor.set_pixels(min_pixels=100, msg="test")
- self.assertEqual(self.processor.min_pixels, 100)
- self.assertEqual(self.processor.size["min_pixels"], 100)
-
- # Test setting only max_pixels
- self.processor.set_pixels(max_pixels=200, msg="test")
- self.assertEqual(self.processor.max_pixels, 200)
- self.assertEqual(self.processor.size["max_pixels"], 200)
-
- # Test setting both
- self.processor.set_pixels(min_pixels=150, max_pixels=250, msg="test")
- self.assertEqual(self.processor.min_pixels, 150)
- self.assertEqual(self.processor.max_pixels, 250)
- self.assertEqual(self.processor.size["min_pixels"], 150)
- self.assertEqual(self.processor.size["max_pixels"], 250)
-
- # Invalid cases
- with self.assertRaises(AssertionError):
- self.processor.set_pixels(min_pixels=-1)
- with self.assertRaises(AssertionError):
- self.processor.set_pixels(max_pixels=0)
-
- def test_get_smarted_resize(self):
- """Test get_smarted_resize with default and custom pixels"""
- height, width = 224, 224
- # Test with default pixels
- (resized_h, resized_w), (patches_h, patches_w) = self.processor.get_smarted_resize(height, width)
- self.assertIsInstance(resized_h, int)
- self.assertIsInstance(resized_w, int)
- self.assertIsInstance(patches_h, int)
- self.assertIsInstance(patches_w, int)
- # Test with custom pixels
- (resized_h, resized_w), (_, _) = self.processor.get_smarted_resize(
- height, width, min_pixels=100, max_pixels=10000
- )
- self.assertIsInstance(resized_h, int)
- self.assertIsInstance(resized_w, int)
-
- def test_round_by_factor(self):
- """Test round_by_factor with various cases"""
- self.assertEqual(round_by_factor(100, 28), 112) # 100/28 ≈ 3.57, round(3.57) = 4, 4*28 = 112
- self.assertEqual(round_by_factor(50, 10), 50)
- self.assertEqual(round_by_factor(55, 10), 60)
- # Edge cases
- self.assertEqual(round_by_factor(0, 14), 0)
- self.assertEqual(round_by_factor(14, 14), 14)
- self.assertEqual(round_by_factor(13, 14), 14) # Round up
- self.assertEqual(round_by_factor(15, 14), 14) # Round down
-
- def test_ceil_by_factor(self):
- """Test ceil_by_factor with various cases"""
- self.assertEqual(ceil_by_factor(100, 28), 112) # ceil(100/28)*28 = ceil(3.57)*28 = 4*28 = 112
- self.assertEqual(ceil_by_factor(50, 10), 50)
- self.assertEqual(ceil_by_factor(55, 10), 60)
- # Edge cases
- self.assertEqual(ceil_by_factor(0, 14), 0)
- self.assertEqual(ceil_by_factor(14, 14), 14)
- self.assertEqual(ceil_by_factor(13, 14), 14) # Ceil up
- self.assertEqual(ceil_by_factor(15, 14), 28) # Ceil up to next multiple
-
- def test_floor_by_factor(self):
- """Test floor_by_factor with various cases"""
- self.assertEqual(floor_by_factor(100, 28), 84) # floor(100/28)*28 = floor(3.57)*28 = 3*28 = 84
- self.assertEqual(floor_by_factor(50, 10), 50)
- self.assertEqual(floor_by_factor(55, 10), 50)
- # Edge cases
- self.assertEqual(floor_by_factor(0, 14), 0)
- self.assertEqual(floor_by_factor(14, 14), 14)
- self.assertEqual(floor_by_factor(13, 14), 0) # Floor down
- self.assertEqual(floor_by_factor(15, 14), 14) # Floor down to multiple
- self.assertEqual(floor_by_factor(28, 14), 28) # Exact multiple
-
- def test_smart_resize(self):
- """Test smart_resize with various scenarios (lines 557-587)"""
- # Basic functionality
- height, width = 224, 224
- new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
- self.assertIsInstance(new_h, int)
- self.assertIsInstance(new_w, int)
- self.assertEqual(new_h % 28, 0)
- self.assertEqual(new_w % 28, 0)
-
- # High aspect ratio (height > width) - tests lines 557-563
- height, width = 10000, 10 # aspect ratio > 200
- new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
- self.assertIsInstance(new_h, int)
- self.assertIsInstance(new_w, int)
- self.assertLessEqual(max(new_h, new_w) / min(new_h, new_w), 200)
-
- # High aspect ratio (width > height) - tests lines 562-563
- height, width = 10, 10000
- new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
- self.assertIsInstance(new_h, int)
- self.assertIsInstance(new_w, int)
- self.assertLessEqual(max(new_h, new_w) / min(new_h, new_w), 200)
-
- # Too large - tests lines 575-578
- height, width = 10000, 10000
- new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
- self.assertLessEqual(new_h * new_w, 28 * 28 * 1280)
-
- # Too small - tests lines 579-582
- height, width = 10, 10
- new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280)
- self.assertGreaterEqual(new_h * new_w, 56 * 56)
-
- # Exceeds max_pixels with custom parameters
- height, width = 10000, 10000
- max_pixels = 10000
- min_pixels = 1000
- new_h, new_w = smart_resize(height, width, factor=14, min_pixels=min_pixels, max_pixels=max_pixels)
- self.assertLessEqual(new_h * new_w, max_pixels)
- self.assertGreaterEqual(new_h * new_w, min_pixels)
-
- # Below min_pixels with custom parameters
- height, width = 10, 10
- min_pixels = 10000
- max_pixels = 100000
- new_h, new_w = smart_resize(height, width, factor=14, min_pixels=min_pixels, max_pixels=max_pixels)
- self.assertGreaterEqual(new_h * new_w, min_pixels)
- self.assertLessEqual(new_h * new_w, max_pixels)
-
- # Invalid result (extreme parameters) - tests lines 584-585
- with self.assertRaises(ValueError):
- smart_resize(1, 1, factor=100000, min_pixels=100, max_pixels=1000)
-
- def test_is_scaled_image(self):
- """Test is_scaled_image with various image types"""
- # uint8 image
- image = np.array([[0, 255], [128, 200]], dtype=np.uint8)
- self.assertFalse(is_scaled_image(image))
- image = np.random.rand(224, 224, 3).astype(np.uint8) * 255
- self.assertFalse(is_scaled_image(image))
-
- # Scaled float image (values in [0, 1])
- image = np.array([[0.0, 0.5], [0.3, 1.0]], dtype=np.float32)
- self.assertTrue(is_scaled_image(image))
- image = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
- self.assertTrue(is_scaled_image(image))
-
- # Unscaled float image (values > 1)
- image = np.array([[0.0, 255.0], [128.0, 300.0]], dtype=np.float32)
- self.assertFalse(is_scaled_image(image))
- image = np.random.rand(224, 224, 3).astype(np.float32) * 255
- self.assertFalse(is_scaled_image(image))
-
- # Edge cases
- image = np.array([[0.0, 1.0]], dtype=np.float32)
- self.assertTrue(is_scaled_image(image))
- image = np.array([[0.0, 1.1]], dtype=np.float32)
- self.assertFalse(is_scaled_image(image))
- image = np.array([[-0.1, 1.0]], dtype=np.float32)
- self.assertFalse(is_scaled_image(image))
-
- def test_make_batched_images(self):
- """Test make_batched_images with various input types"""
- # Single image
- img = Image.new("RGB", (224, 224))
- result = make_batched_images(img)
- self.assertEqual(len(result), 1)
- self.assertEqual(result[0], img)
-
- # List of images
- imgs = [Image.new("RGB", (224, 224)) for _ in range(3)]
- result = make_batched_images(imgs)
- self.assertEqual(len(result), 3)
- self.assertEqual(result, imgs)
-
- # Nested list
- imgs = [[Image.new("RGB", (224, 224)) for _ in range(2)] for _ in range(2)]
- result = make_batched_images(imgs)
- self.assertEqual(len(result), 4) # 2*2 = 4
-
- # Invalid inputs
- with self.assertRaises(ValueError) as context:
- make_batched_images("invalid")
- self.assertIn("Could not make batched images", str(context.exception))
- with self.assertRaises(ValueError) as context:
- make_batched_images([[1, 2, 3], [4, 5, 6]])
- self.assertIn("Could not make batched images", str(context.exception))
-
- def test_make_batched_videos(self):
- """Test make_batched_videos with various input types"""
- # List of images
- imgs = [Image.new("RGB", (224, 224)) for _ in range(3)]
- result = make_batched_videos(imgs)
- self.assertEqual(len(result), 1)
- self.assertEqual(len(result[0]), 3)
-
- # Single image in list
- img = Image.new("RGB", (224, 224))
- result = make_batched_videos([img])
- self.assertEqual(len(result), 1)
- self.assertEqual(len(result[0]), 1)
-
- # Nested list
- imgs = [[Image.new("RGB", (224, 224)) for _ in range(2)] for _ in range(2)]
- result = make_batched_videos(imgs)
- self.assertEqual(len(result), 2)
- self.assertEqual(len(result[0]), 2)
-
- # 4D array (single)
- video = np.random.rand(3, 224, 224, 3).astype(np.uint8)
- result = make_batched_videos(video)
- self.assertEqual(len(result), 1)
- self.assertIsInstance(result[0], list)
-
- # 4D array in list (lines 119-120)
- videos = [np.random.rand(3, 224, 224, 3).astype(np.uint8)]
- result = make_batched_videos(videos)
- self.assertEqual(len(result), 1)
- self.assertIsInstance(result[0], list)
-
- # Invalid input
- with self.assertRaises(ValueError) as context:
- make_batched_videos("invalid")
- self.assertIn("Could not make batched video", str(context.exception))
-
- def test_preprocess_images(self):
- """Test preprocess handling images"""
- img = Image.new("RGB", (224, 224))
- result = self.processor.preprocess(images=img)
- self.assertIn("pixel_values", result)
- self.assertIn("image_grid_thw", result)
- # Verify pixel_values shape
- pixel_values = result["pixel_values"]
- self.assertIsInstance(pixel_values, np.ndarray)
-
- def test_preprocess_videos(self):
- """Test preprocess handling videos"""
- frames = [Image.new("RGB", (224, 224)) for _ in range(4)]
- result = self.processor.preprocess(images=None, videos=frames)
- self.assertIn("pixel_values_videos", result)
- self.assertIn("video_grid_thw", result)
-
- def test_preprocess_invalid_images(self):
- """Test preprocess handling invalid image"""
- with self.assertRaises(ValueError):
- self.processor.preprocess(images="invalid")
-
- def test_preprocess_with_predetermined_grid_thw(self):
- """Test preprocess using predetermined_grid_thw"""
- img = Image.new("RGB", (224, 224))
- # predetermined_grid_thw should be (h, w) format, not [1, h, w]
- predetermined_grid_thw = [(16, 16)] # For single image, should be (h, w) tuple
- result = self.processor.preprocess(images=img, predetermined_grid_thw=predetermined_grid_thw)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_flags(self):
- """Test preprocess with various flags disabled"""
- img = Image.new("RGB", (224, 224))
- # Test without resize
- result = self.processor.preprocess(images=img, do_resize=False)
- self.assertIn("pixel_values", result)
- # Test without rescale
- result = self.processor.preprocess(images=img, do_rescale=False)
- self.assertIn("pixel_values", result)
- # Test without normalize
- result = self.processor.preprocess(images=img, do_normalize=False)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_custom_mean_std(self):
- """Test preprocess using custom mean and std"""
- img = Image.new("RGB", (224, 224))
- # Test with simple custom mean/std
- result = self.processor.preprocess(images=img, image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5])
- self.assertIn("pixel_values", result)
- # Test with ImageNet-style mean/std
- result = self.processor.preprocess(
- images=img, image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225]
- )
- self.assertIn("pixel_values", result)
-
- def test_preprocess_do_convert_rgb(self):
- """Test preprocess with do_convert_rgb=True (line 289)"""
- img = Image.new("L", (224, 224)) # Grayscale image
- result = self.processor.preprocess(images=img, do_convert_rgb=True)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_scaled_image_warning(self):
- """Test warning for scaled image in preprocess (lines 294-298)"""
- # Create a scaled image (values between 0-1)
- img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
- # Use patch to capture warning
- with patch(
- "fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger"
- ) as mock_logger:
- # Directly call _preprocess, pass scaled image
- self.processor._preprocess(
- [img_array], # Pass scaled numpy array
- do_rescale=True,
- do_convert_rgb=False,
- )
- # Verify warning is called when is_scaled_image returns True and do_rescale is True
- mock_logger.warning.assert_called()
-
- def test_preprocess_invalid_images_check(self):
- """Test invalid image check in preprocess (line 464)"""
- # Test invalid image type - need to ensure valid_images returns False
- # Use patch to make valid_images return False, but make_batched_images succeeds
- with patch(
- "fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images"
- ) as mock_valid:
- mock_valid.return_value = False
- valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False
- with self.assertRaises(ValueError) as context:
- self.processor.preprocess(images=valid_images_list)
- self.assertIn("Invalid image type", str(context.exception))
-
- def test_preprocess_predetermined_grid_thw_multiple_images(self):
- """Test preprocess with predetermined_grid_thw for multiple images (lines 307-310)"""
- imgs = [Image.new("RGB", (224, 224)) for _ in range(2)]
- predetermined_grid_thw = [(16, 16), (20, 20)]
- result = self.processor.preprocess(images=imgs, predetermined_grid_thw=predetermined_grid_thw)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_predetermined_grid_thw_length_mismatch(self):
- """Test preprocess with predetermined_grid_thw length mismatch (lines 307-310, 470)"""
- imgs = [Image.new("RGB", (224, 224)) for _ in range(2)]
- predetermined_grid_thw = [(16, 16)] # Length mismatch - only 1 element for 2 images
- # The function raises IndexError when accessing predetermined_grid_thw[img_idx] with img_idx=1
- with self.assertRaises(IndexError):
- self.processor.preprocess(images=imgs, predetermined_grid_thw=predetermined_grid_thw)
-
- def test_preprocess_with_input_data_format(self):
- """Test preprocess with input_data_format parameter (lines 299-301)"""
- img = Image.new("RGB", (224, 224))
- from paddleformers.transformers.image_utils import ChannelDimension
-
- # Test with FIRST
- result = self.processor.preprocess(images=img, input_data_format=ChannelDimension.FIRST)
- self.assertIn("pixel_values", result)
- # Test with None
- result = self.processor.preprocess(images=img, input_data_format=None)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_do_resize_with_predetermined_grid_thw(self):
- """Test preprocess with do_resize=True and predetermined_grid_thw (lines 314-317)"""
- img = Image.new("RGB", (224, 224))
- predetermined_grid_thw = [(16, 16)]
- result = self.processor.preprocess(images=img, predetermined_grid_thw=predetermined_grid_thw, do_resize=True)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_videos_with_predetermined_grid_thw(self):
- """Test preprocess videos with predetermined_grid_thw (lines 511)"""
- frames = [Image.new("RGB", (224, 224)) for _ in range(4)]
- predetermined_grid_thw = [(16, 16)] * 4
- result = self.processor.preprocess(images=None, videos=frames, predetermined_grid_thw=predetermined_grid_thw)
- self.assertIn("pixel_values_videos", result)
-
- def test_preprocess_return_tensors(self):
- """Test preprocess with return_tensors parameter (lines 396, 523)"""
- img = Image.new("RGB", (224, 224))
- # Use string instead of TensorType enum which may not be available
- result = self.processor.preprocess(images=img, return_tensors="np")
- self.assertIn("pixel_values", result)
-
- def test_preprocess_do_rescale_false_with_scaled_image(self):
- """Test preprocess with do_rescale=False and scaled image (line 335)"""
- # Create a scaled image
- img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5 # Values in [0, 0.5]
- img = Image.fromarray((img_array * 255).astype(np.uint8))
- result = self.processor.preprocess(images=img, do_rescale=False)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_custom_resample(self):
- """Test preprocess with custom resample parameter (line 332)"""
- img = Image.new("RGB", (224, 224))
- from PIL import Image as PILImage
-
- result = self.processor.preprocess(images=img, resample=PILImage.BILINEAR)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_custom_rescale_factor(self):
- """Test preprocess with custom rescale_factor (line 336)"""
- img = Image.new("RGB", (224, 224))
- result = self.processor.preprocess(images=img, rescale_factor=1.0 / 128.0)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_data_format(self):
- """Test preprocess with different data_format values"""
- img = Image.new("RGB", (224, 224))
- from paddleformers.transformers.image_utils import ChannelDimension
-
- # Test with FIRST
- result = self.processor.preprocess(images=img, data_format=ChannelDimension.FIRST)
- self.assertIn("pixel_values", result)
- # Test with LAST
- result = self.processor.preprocess(images=img, data_format=ChannelDimension.LAST)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_multiple_images_loop(self):
- """Test preprocess loop with multiple images (lines 312-348, 468-488)"""
- images = [Image.new("RGB", (224, 224)) for _ in range(3)]
- result = self.processor.preprocess(images=images)
- self.assertIn("pixel_values", result)
- self.assertIn("image_grid_thw", result)
- pixel_values = result["pixel_values"]
- self.assertIsInstance(pixel_values, np.ndarray)
- self.assertEqual(len(pixel_values.shape), 2) # Should be [grid_t * grid_h * grid_w, C * psz * psz]
-
- def test_preprocess_videos_loop(self):
- """Test preprocess with videos in loop (lines 496-521)"""
- # Test with multiple videos
- videos = [
- [Image.new("RGB", (224, 224)) for _ in range(4)],
- [Image.new("RGB", (224, 224)) for _ in range(4)],
- ]
- result = self.processor.preprocess(images=None, videos=videos)
- self.assertIn("pixel_values_videos", result)
- self.assertIn("video_grid_thw", result)
- self.assertIsInstance(result["pixel_values_videos"], np.ndarray)
- # Test with nested list format
- videos = [[Image.new("RGB", (224, 224)) for _ in range(4)] for _ in range(2)]
- result = self.processor.preprocess(images=None, videos=videos)
- self.assertIn("pixel_values_videos", result)
- self.assertIn("video_grid_thw", result)
- self.assertIsInstance(result["pixel_values_videos"], np.ndarray)
-
- def test_preprocess_both_images_and_videos(self):
- """Test preprocess with both images and videos (lines 458-523)"""
- images = [Image.new("RGB", (224, 224))]
- videos = [[Image.new("RGB", (224, 224)) for _ in range(4)]]
- result = self.processor.preprocess(images=images, videos=videos)
- # Due to implementation, only video results are returned when both are provided
- self.assertIn("pixel_values_videos", result)
- self.assertIn("video_grid_thw", result)
-
- def test_preprocess_invalid_images_check_list_input(self):
- """Test preprocess with invalid images check (line 464)
-
- Note: The error is raised by make_batched_images before valid_images check,
- so the error message is different.
- """
- invalid_images = ["not an image", "also not an image"]
-
- with self.assertRaises(ValueError) as context:
- self.processor.preprocess(images=invalid_images)
- self.assertIn("Could not make batched images", str(context.exception))
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/input/v1/test_paddleocr_vl_processor.py b/tests/input/v1/test_paddleocr_vl_processor.py
deleted file mode 100644
index 3c1e83b42ff..00000000000
--- a/tests/input/v1/test_paddleocr_vl_processor.py
+++ /dev/null
@@ -1,1182 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import pickle
-import unittest
-from unittest.mock import ANY, MagicMock, patch
-
-import numpy as np
-import zmq
-from PIL import Image
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.paddleocr_vl_processor.image_processor import (
- ImageProcessor,
- smart_resize,
-)
-from fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor import (
- PaddleOCRVLProcessor,
-)
-from fastdeploy.input.v1.paddleocr_vl_processor.process import DataProcessor
-from fastdeploy.input.v1.paddleocr_vl_processor.process_video import sample_frames
-
-MODULE_PATH = "fastdeploy.input.v1.paddleocr_vl_processor.process"
-
-
-class TestProcessVideo(unittest.TestCase):
- def setUp(self):
- self.metadata = {"num_of_frame": 100, "fps": 25}
- self.frame_factor = 4
- self.min_frames = 8
- self.max_frames = 32
-
- def test_sample_with_num_frames(self):
- """测试使用num_frames参数采样(来自用户的原始测试)"""
- num_frames = 16
- indices = sample_frames(
- frame_factor=self.frame_factor,
- min_frames=self.min_frames,
- max_frames=self.max_frames,
- num_frames=num_frames,
- fps=0, # 确保 fps 不>0
- metadata=self.metadata,
- )
- self.assertEqual(len(indices), 16)
- self.assertEqual(indices[0], 0)
- self.assertEqual(indices[-1], 93)
- np.testing.assert_array_equal(indices, np.arange(0, 100, 100 / 16).astype(np.int32))
-
- def test_error_num_frames_exceeds_total(self):
- """测试 num_frames 超过总帧数的异常(来自用户的原始测试)"""
- with self.assertRaises(ValueError) as context:
- sample_frames(
- frame_factor=self.frame_factor,
- min_frames=self.min_frames,
- max_frames=self.max_frames,
- num_frames=200, # 超过总帧数100
- fps=0,
- metadata=self.metadata,
- )
- self.assertIn("exceeds", str(context.exception))
-
- def test_error_mutual_exclusion(self):
- """新增:测试 num_frames 和 fps 互斥"""
- with self.assertRaises(ValueError) as context:
- sample_frames(
- frame_factor=self.frame_factor,
- min_frames=self.min_frames,
- max_frames=self.max_frames,
- num_frames=16, # > 0
- fps=10, # > 0
- metadata=self.metadata,
- )
- self.assertIn("mutually exclusive", str(context.exception))
-
- def test_error_fps_without_metadata(self):
- """新增:测试 fps > 0 但 metadata 为 None"""
- with self.assertRaises(TypeError) as context:
- sample_frames(
- frame_factor=self.frame_factor,
- min_frames=self.min_frames,
- max_frames=self.max_frames,
- num_frames=0,
- fps=10,
- metadata=None, # 缺失
- )
- # 验证是预期的 TypeError
- self.assertIn("'NoneType' object is not subscriptable", str(context.exception))
-
- def test_num_frames_rounding(self):
- """新增:测试 num_frames 向 frame_factor 舍入"""
- num_frames = 17 # 不是 4 的倍数
- # 逻辑: round(17 / 4) * 4 = round(4.25) * 4 = 4 * 4 = 16
- indices = sample_frames(
- frame_factor=self.frame_factor,
- min_frames=self.min_frames,
- max_frames=self.max_frames,
- num_frames=num_frames,
- fps=0,
- metadata=self.metadata,
- )
- # 应舍入到 16
- self.assertEqual(len(indices), 16)
-
- def test_sample_with_fps_basic(self):
- """新增:测试使用 fps 采样(基本路径,被 max_frames 限制)"""
- # 逻辑: num_frames_calc = 100 / 25 * 10 = 40
- # num_frames_clamped = min(max(40, 8), 32) = 32
- # num_frames_factored = floor(32 / 4) * 4 = 32
- indices = sample_frames(
- frame_factor=self.frame_factor,
- min_frames=self.min_frames,
- max_frames=self.max_frames,
- num_frames=0,
- fps=10,
- metadata=self.metadata,
- )
- # 应被 max_frames=32 限制
- self.assertEqual(len(indices), 32)
- self.assertEqual(indices[-1], 96)
-
- def test_sample_with_fps_hits_min_frames(self):
- """新增:测试使用 fps 采样(被 min_frames 限制)"""
- # 逻辑: num_frames_calc = 100 / 25 * 1 = 4
- # num_frames_clamped = min(max(4, 8), 32) = 8
- # num_frames_factored = floor(8 / 4) * 4 = 8
- indices = sample_frames(
- frame_factor=self.frame_factor,
- min_frames=self.min_frames,
- max_frames=self.max_frames,
- num_frames=0,
- fps=1,
- metadata=self.metadata,
- )
- # 应被 min_frames=8 限制
- self.assertEqual(len(indices), 8)
- self.assertEqual(indices[-1], 87)
-
- def test_sample_with_fps_hits_total_frames(self):
- """新增:测试使用 fps 采样(被 total_num_frames 限制)"""
- local_max_frames = 200
-
- # 逻辑: num_frames_calc = 100 / 25 * 50 = 200
- # num_frames_clamped = min(min(max(200, 8), 200), 100) = 100
- # num_frames_factored = floor(100 / 4) * 4 = 100
- indices = sample_frames(
- frame_factor=self.frame_factor,
- min_frames=self.min_frames,
- max_frames=local_max_frames,
- num_frames=0,
- fps=50,
- metadata=self.metadata,
- )
- # 应被 total_num_frames=100 限制
- self.assertEqual(len(indices), 100)
- self.assertEqual(indices[-1], 99) # 采样所有帧
-
- def test_no_sampling(self):
- """新增:测试不采样(fps=0, num_frames=0)"""
- indices = sample_frames(
- frame_factor=self.frame_factor,
- min_frames=self.min_frames,
- max_frames=self.max_frames,
- num_frames=0,
- fps=0,
- metadata=self.metadata,
- )
- # 应返回所有帧
- self.assertEqual(len(indices), self.metadata["num_of_frame"])
- self.assertEqual(len(indices), 100)
- self.assertEqual(indices[-1], 99)
- np.testing.assert_array_equal(indices, np.arange(0, 100).astype(np.int32))
-
-
-class Test_DataProcessor(unittest.TestCase):
- """
- 针对 process.py 中 DataProcessor 类的单元测试。
- """
-
- def setUp(self):
-
- # 1. 手动启动 Patcher
- patcher1 = patch(f"{MODULE_PATH}.AutoTokenizer.from_pretrained")
- patcher2 = patch(f"{MODULE_PATH}.ImageProcessor.from_pretrained")
- patcher_zmq_context = patch(f"{MODULE_PATH}.zmq.Context")
-
- self.mock_auto_tokenizer_constructor = patcher1.start()
- self.mock_image_processor_constructor = patcher2.start()
- self.mock_zmq_context_constructor = patcher_zmq_context.start()
-
- self.addCleanup(patcher1.stop)
- self.addCleanup(patcher2.stop)
- self.addCleanup(patcher_zmq_context.stop)
-
- # 2. 创建模拟对象
- self.mock_tokenizer = MagicMock()
- self.mock_image_processor = MagicMock()
- self.mock_zmq_context = MagicMock()
- self.mock_zmq_socket = MagicMock()
-
- # 3. 配置 from_pretrained 和 zmq
- self.mock_auto_tokenizer_constructor.return_value = self.mock_tokenizer
- self.mock_image_processor_constructor.return_value = self.mock_image_processor
- self.mock_zmq_context_constructor.return_value = self.mock_zmq_context
- self.mock_zmq_context.socket.return_value = self.mock_zmq_socket
-
- # 4. 配置模拟对象的属性和方法
- self._configure_mocks()
-
- # 5. 实例化 DataProcessor (默认不启用 cache)
- self.processor = DataProcessor(model_path="dummy_model_path")
- self._configure_processor_ids()
-
- # 6. 准备测试用的虚拟数据
- self.dummy_image = Image.fromarray(np.uint8(np.random.rand(224, 224, 3) * 255))
- self.dummy_video_frames = np.uint8(np.random.rand(16, 224, 224, 3) * 255)
- self.dummy_video_data = "path/to/dummy_video.mp4"
- self.dummy_processed_image_cache = (
- np.random.rand(64, 3, 14, 14).astype(np.float32),
- {"thw": (1, 8, 8), "fps": 0},
- )
- self.dummy_processed_video_cache = (
- np.random.rand(256, 3, 14, 14).astype(np.float32),
- {"thw": (4, 8, 8), "fps": 30},
- )
-
- def _configure_mocks(self):
- def mock_convert_tokens_to_ids(tokens):
- if tokens == "<|IMAGE_PLACEHOLDER|>":
- return 100
- if tokens == "<|video_pad|>":
- return 101
- if tokens == "<|IMAGE_START|>":
- return 102
- if isinstance(tokens, list):
- if tokens == ["Hello", "world"]:
- return [983, 984]
- if tokens == ["Prompt", "text"]:
- return [606, 511]
- if tokens == ["Prompt", "", "text"]:
- return [606, 511] # 模拟 "Prompt text".split()
- return [hash(t) % 1000 for t in tokens]
- return hash(tokens) % 1000
-
- self.mock_tokenizer.convert_tokens_to_ids.side_effect = mock_convert_tokens_to_ids
- self.mock_tokenizer.tokenize.side_effect = lambda s: s.split()
- self.mock_tokenizer.ignored_index = -100
- self.mock_tokenizer.chat_template = "dummy_template_string"
-
- self.mock_image_processor.merge_size = 2
- self.mock_image_processor.temporal_patch_size = 1
-
- def _configure_processor_ids(self):
- self.processor.image_token_id = 100
- self.processor.video_token_id = 101
- self.processor.image_patch_id = 100
- self.processor.vision_start_id = 102
-
- def _get_init_outputs(self):
- return {
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "labels": [],
- "cur_position": 0,
- "video_cnt": 0,
- "num_input_image_tokens": 0,
- "num_input_video_tokens": 0,
- "fps": [],
- "mm_positions": [],
- "mm_hashes": [],
- "vit_seqlen": [],
- "vit_position_ids": [],
- }
-
- def test_init(self):
- """测试 DataProcessor 的初始化"""
- self.mock_auto_tokenizer_constructor.assert_called_with("dummy_model_path", padding_side="left", use_fast=True)
- self.mock_image_processor_constructor.assert_called_with("dummy_model_path")
- self.assertEqual(self.processor.image_token, "<|IMAGE_PLACEHOLDER|>")
- self.assertEqual(self.processor.video_token_id, 101)
-
- def test_compute_text_positions(self):
- """测试 _compute_text_positions 纯函数"""
- pos_ids = self.processor._compute_text_positions(start_pos=5, num_tokens=3)
- expected = np.array([[5, 6, 7], [5, 6, 7], [5, 6, 7]])
- np.testing.assert_array_equal(pos_ids, expected)
-
- def test_compute_vision_positions(self):
- """测试 _compute_vision_positions 纯函数"""
- pos_ids = self.processor._compute_vision_positions(start_pos=10, t=2, h=4, w=4, second_per_grid_t=1.0)
- self.assertEqual(pos_ids.shape, (3, 8))
- expected_t = np.array([0, 0, 0, 0, 2, 2, 2, 2])
- expected_h = np.array([0, 0, 1, 1, 0, 0, 1, 1])
- expected_w = np.array([0, 1, 0, 1, 0, 1, 0, 1])
- expected = np.stack([expected_t, expected_h, expected_w]) + 10
- np.testing.assert_array_equal(pos_ids, expected)
-
- @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
- def test_add_text(self):
- """测试 _add_text 辅助函数"""
- outputs = self._get_init_outputs()
- self.mock_tokenizer.tokenize.return_value = ["Hello", "world"]
- self.mock_tokenizer.convert_tokens_to_ids.side_effect = None
- self.mock_tokenizer.convert_tokens_to_ids.return_value = [10, 11]
-
- self.processor._add_text("Hello world", outputs)
-
- self.assertEqual(outputs["input_ids"], [10, 11])
- self.assertEqual(outputs["token_type_ids"], [0, 0])
- self.assertEqual(outputs["cur_position"], 2)
-
- @patch(f"{MODULE_PATH}.MultimodalHasher.hash_features", return_value="dummy_hash_123")
- @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
- def test_add_image_autohash(self, mock_hasher):
- """测试 _add_image 辅助函数 (自动哈希)"""
- outputs = self._get_init_outputs()
- outputs["cur_position"] = 5
-
- num_patches_hw = 8 * 8
- num_tokens = 16
- mock_preprocess_return = {
- "pixel_values": np.random.rand(num_patches_hw, 3, 14, 14),
- "grid_thw": np.array([1, 8, 8]),
- }
- self.mock_image_processor.preprocess.return_value = mock_preprocess_return
-
- self.processor._add_image(self.dummy_image, outputs, uuid=None)
-
- self.assertEqual(len(outputs["input_ids"]), num_tokens)
- self.assertEqual(outputs["num_input_image_tokens"], num_tokens)
- mock_hasher.assert_called_once_with(mock_preprocess_return["pixel_values"])
- self.assertEqual(outputs["mm_hashes"][0], "dummy_hash_123")
- self.assertEqual(outputs["cur_position"], 9)
-
- @patch(f"{MODULE_PATH}.MultimodalHasher.hash_features")
- @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
- def test_add_video_with_uuid(self, mock_hasher):
- """测试 _add_video 辅助函数 (使用 uuid)"""
- outputs = self._get_init_outputs()
- outputs["cur_position"] = 10
- meta = {"fps": 30}
-
- num_patches_total = 256
- num_tokens = 64
-
- mock_preprocess_return = {
- "pixel_values": np.random.rand(num_patches_total, 3, 14, 14),
- "image_grid_thw": np.array([4, 8, 8]),
- }
- self.mock_image_processor.preprocess.return_value = mock_preprocess_return
-
- self.processor._add_video(self.dummy_video_frames, meta, outputs, uuid="custom_vid_uuid")
-
- self.assertEqual(len(outputs["input_ids"]), num_tokens)
- self.assertEqual(outputs["token_type_ids"], [2] * num_tokens)
- mock_hasher.assert_not_called()
- self.assertEqual(outputs["mm_hashes"][0], "custom_vid_uuid")
- self.assertEqual(outputs["image_type_ids"], [1, 1, 1, 1])
-
- @patch.object(DataProcessor, "_add_text", MagicMock())
- @patch.object(DataProcessor, "_add_image", MagicMock())
- @patch.object(DataProcessor, "_add_video", MagicMock())
- @patch.object(DataProcessor, "_load_and_process_video")
- def test_text2ids_parsing(self, mock_load_video):
- """测试 text2ids 的解析和分支逻辑"""
- mock_load_video.return_value = (self.dummy_video_frames, {"fps": 30})
- text = "Text1 <|IMAGE_PLACEHOLDER|> Text2 <|video_pad|> Text3"
- images = [self.dummy_image]
- videos = [self.dummy_video_data]
- image_uuid = ["img_uuid_1"]
- video_uuid = ["vid_uuid_1"]
-
- outputs = self.processor.text2ids(text, images, videos, image_uuid, video_uuid)
-
- self.processor._add_text.assert_any_call("Text1 ", outputs)
- self.processor._add_image.assert_called_once_with(self.dummy_image, outputs, "img_uuid_1")
- self.processor._add_video.assert_called_once_with(self.dummy_video_frames, {"fps": 30}, outputs, "vid_uuid_1")
-
- @patch(f"{MODULE_PATH}.parse_chat_messages")
- @patch.object(DataProcessor, "text2ids", return_value="final_output")
- def test_request2ids(self, mock_text2ids, mock_parse_chat):
- """测试 request2ids 的 chat 模板逻辑"""
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": "Hello"},
- {"type": "image", "data": self.dummy_image, "uuid": "img1"},
- ],
- }
- ]
- request = {"request_id": "test_0", "messages": messages, "add_generation_prompt": True}
- request = Request.from_dict(request)
- mock_parse_chat.return_value = messages
- parsed_prompt = "User: Hello <|IMAGE_PLACEHOLDER|> Assistant:"
- self.mock_tokenizer.apply_chat_template.return_value = parsed_prompt
-
- result = self.processor.request2ids(request)
-
- self.mock_tokenizer.apply_chat_template.assert_called_once()
- mock_text2ids.assert_called_once_with(parsed_prompt, [self.dummy_image], [], ["img1"], [])
- self.assertEqual(result, "final_output")
-
- @patch(f"{MODULE_PATH}.sample_frames")
- @patch(f"{MODULE_PATH}.read_video_decord")
- def test_load_and_process_video(self, mock_read_video, mock_sample_frames):
- """测试 _load_and_process_video 的帧采样逻辑"""
- mock_reader = MagicMock()
- mock_reader.__getitem__.return_value.asnumpy.return_value = np.random.randint(
- 0, 255, (100, 100, 3), dtype=np.uint8
- )
- mock_meta = {"num_of_frame": 100, "duration": 10.0, "fps": 10.0}
- mock_read_video.return_value = (mock_reader, mock_meta, None)
- mock_sample_frames.return_value = [0, 10, 20, 30, 40]
- self.processor.fps = 1
-
- frames, meta = self.processor._load_and_process_video("dummy_url", {"min_frames": 2, "max_frames": 10})
-
- mock_sample_frames.assert_called_once_with(
- frame_factor=ANY,
- min_frames=2,
- max_frames=10,
- metadata=mock_meta,
- fps=self.processor.fps,
- num_frames=self.processor.target_frames,
- )
- self.assertEqual(frames.shape, (5, 100, 100, 3))
- self.assertEqual(meta["fps"], 1)
-
- def test_init_with_external_tokenizer(self):
- """新增:测试使用外部传入的 tokenizer 初始化"""
- self.mock_auto_tokenizer_constructor.reset_mock()
-
- external_tokenizer = MagicMock()
- processor = DataProcessor(model_path="dummy", tokenizer=external_tokenizer)
-
- self.mock_auto_tokenizer_constructor.assert_not_called()
- self.assertIs(processor.tokenizer, external_tokenizer)
-
- def test_add_text_empty(self):
- """新增:测试 _add_text 传入空字符串"""
- outputs = self._get_init_outputs()
- self.processor._add_text("", outputs)
- self.assertEqual(outputs["input_ids"], [])
- self.assertEqual(outputs["cur_position"], 0)
-
- @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0})
- def test_add_text_pre_tokenized(self):
- """新增:测试 _add_text 传入已 tokenized 的 IDs"""
- outputs = self._get_init_outputs()
- token_ids = [10, 11, 12]
- self.processor._add_text(token_ids, outputs)
-
- self.mock_tokenizer.tokenize.assert_not_called()
- self.assertEqual(outputs["input_ids"], [10, 11, 12])
- self.assertEqual(outputs["token_type_ids"], [0, 0, 0])
- self.assertEqual(outputs["cur_position"], 3)
-
- @patch(f"{MODULE_PATH}.MultimodalHasher.hash_features", return_value="dummy_hash_456")
- @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
- def test_add_video_no_uuid(self, mock_hasher):
- """新增:测试 _add_video 在 uuid 为 None 时自动哈希"""
- outputs = self._get_init_outputs()
- meta = {"fps": 30}
- mock_preprocess_return = {
- "pixel_values": np.random.rand(256, 3, 14, 14),
- "image_grid_thw": np.array([4, 8, 8]),
- }
- self.mock_image_processor.preprocess.return_value = mock_preprocess_return
-
- self.processor._add_video(self.dummy_video_frames, meta, outputs, uuid=None)
-
- mock_hasher.assert_called_once_with(mock_preprocess_return["pixel_values"])
- self.assertEqual(outputs["mm_hashes"][0], "dummy_hash_456")
-
- @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
- def test_add_processed_image(self):
- """新增:测试 _add_processed_image 处理缓存数据"""
- outputs = self._get_init_outputs()
- outputs["cur_position"] = 3
-
- self.processor._add_processed_image(self.dummy_processed_image_cache, outputs, "cached_img_uuid")
-
- num_tokens = 16
- self.assertEqual(len(outputs["input_ids"]), num_tokens)
- self.assertEqual(outputs["input_ids"][0], self.processor.image_patch_id)
-
- np.testing.assert_array_equal(outputs["images"][0], self.dummy_processed_image_cache[0])
-
- self.assertEqual(outputs["mm_hashes"][0], "cached_img_uuid")
- self.assertEqual(outputs["cur_position"], 7)
-
- @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2})
- def test_add_processed_video(self):
- """新增:测试 _add_processed_video 处理缓存数据"""
- outputs = self._get_init_outputs()
- outputs["cur_position"] = 5
-
- self.processor._add_processed_video(self.dummy_processed_video_cache, outputs, "cached_vid_uuid")
-
- num_tokens = 64
- t, h, w = self.dummy_processed_video_cache[1]["thw"]
-
- self.assertEqual(len(outputs["input_ids"]), num_tokens)
- self.assertEqual(outputs["token_type_ids"], [2] * num_tokens)
-
- np.testing.assert_array_equal(outputs["images"][0], self.dummy_processed_video_cache[0])
-
- self.assertEqual(outputs["mm_hashes"][0], "cached_vid_uuid")
- self.assertEqual(outputs["image_type_ids"], [1] * t)
- self.assertGreater(outputs["cur_position"], 5)
-
- def test_text2ids_with_processed_data(self):
- """新增:测试 text2ids 调用 _add_processed_image 和 _add_processed_video"""
- with (
- patch.object(self.processor, "_add_processed_image") as mock_add_proc_img,
- patch.object(self.processor, "_add_processed_video") as mock_add_proc_vid,
- ):
-
- text = "<|IMAGE_PLACEHOLDER|><|video_pad|>"
- images = [self.dummy_processed_image_cache]
- videos = [self.dummy_processed_video_cache]
- image_uuid = ["img1"]
- video_uuid = ["vid1"]
-
- self.processor.text2ids(text, images, videos, image_uuid, video_uuid)
-
- mock_add_proc_img.assert_called_once_with(self.dummy_processed_image_cache, ANY, "img1")
- mock_add_proc_vid.assert_called_once_with(self.dummy_processed_video_cache, ANY, "vid1")
-
- @patch(f"{MODULE_PATH}.sample_frames")
- @patch(f"{MODULE_PATH}.read_video_decord")
- def test_load_and_process_video_no_sampling(self, mock_read_video, mock_sample_frames):
- """新增:测试 _load_and_process_video 不采样(fps=-1)"""
- mock_reader = MagicMock()
- mock_reader.__getitem__.return_value.asnumpy.return_value = np.random.randint(
- 0, 255, (100, 100, 3), dtype=np.uint8
- )
- mock_meta = {"num_of_frame": 10, "duration": 1.0, "fps": 10.0}
- mock_read_video.return_value = (mock_reader, mock_meta, None)
-
- self.processor.fps = -1
- self.processor.target_frames = -1
-
- frames, meta = self.processor._load_and_process_video("dummy_url", {})
-
- mock_sample_frames.assert_not_called()
- self.assertEqual(frames.shape, (10, 100, 100, 3))
- self.assertEqual(meta["num_of_frame"], 10)
-
- def test_get_processor_cache(self):
- """新增:测试 get_processor_cache (zmq)"""
- hashes = ["hash1", "hash2"]
- expected_items = ["item1", "item2"]
- mock_resp = pickle.dumps(expected_items)
- self.mock_zmq_socket.recv_multipart.return_value = (b"", mock_resp)
-
- items = self.processor.get_processor_cache(self.mock_zmq_socket, hashes)
-
- self.mock_zmq_socket.send_multipart.assert_called_once_with([b"", pickle.dumps(hashes)])
- self.assertEqual(items, expected_items)
-
- def test_update_processor_cache(self):
- """新增:测试 update_processor_cache (zmq)"""
- hashes = ["hash1"]
- items = ["item1"]
-
- self.processor.update_processor_cache(self.mock_zmq_socket, hashes, items)
-
- expected_req = pickle.dumps((hashes, items))
- self.mock_zmq_socket.send_multipart.assert_called_once_with([b"", expected_req])
-
- def test_apply_chat_template(self):
- """新增:测试 apply_chat_template 核心逻辑"""
- request = {"messages": ["msg1"], "add_generation_prompt": True, "request_id": "req123"}
- self.mock_tokenizer.apply_chat_template.return_value = "Prompt <|IMAGE_PLACEHOLDER|> text"
- self.mock_tokenizer.tokenize.return_value = ["Prompt", "text"]
-
- self.mock_tokenizer.convert_tokens_to_ids.side_effect = None
- self.mock_tokenizer.convert_tokens_to_ids.return_value = [10, 11]
-
- token_ids = self.processor.apply_chat_template(request)
-
- self.assertEqual(token_ids, [10, 11])
- self.assertEqual(request["text_after_process"], "Prompt <|IMAGE_PLACEHOLDER|> text")
-
- self.mock_tokenizer.tokenize.assert_called_with("Prompt text")
-
- def test_apply_chat_template_raises_error(self):
- """新增:测试 apply_chat_template 在模板不存在时引发 ValueError"""
- self.mock_tokenizer.chat_template = None
- with self.assertRaises(ValueError) as context:
- self.processor.apply_chat_template({"messages": []})
- self.assertIn("does not support chat_template", str(context.exception))
-
- @patch(f"{MODULE_PATH}.parse_chat_messages")
- def test_request2ids_cache_miss_raises_error(self, mock_parse_chat):
- """新增:测试 request2ids 在缓存关闭时缺少数据引发 ValueError"""
- messages = [{"role": "user", "content": [{"type": "image", "uuid": "img1"}]}]
- request = {"request_id": "test_0", "messages": messages}
- request = Request.from_dict(request)
-
- mock_parse_chat.return_value = messages
-
- with self.assertRaises(ValueError) as context:
- self.processor.request2ids(request)
-
- self.assertIn("Missing items cannot be retrieved without processor cache.", str(context.exception))
-
- @patch(f"{MODULE_PATH}.DataProcessor.get_processor_cache")
- @patch(f"{MODULE_PATH}.DataProcessor.update_processor_cache")
- @patch(f"{MODULE_PATH}.DataProcessor.text2ids")
- @patch(f"{MODULE_PATH}.parse_chat_messages")
- def test_request2ids_cache_hit_and_update(self, mock_parse_chat, mock_text2ids, mock_update_cache, mock_get_cache):
- """新增:测试 request2ids 缓存命中和缓存更新"""
- self.processor = DataProcessor(model_path="dummy_model_path", enable_processor_cache=True)
- self._configure_processor_ids()
-
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "image", "uuid": "img_cache_hit"},
- {"type": "image", "data": self.dummy_image, "uuid": "img_to_update"},
- ],
- }
- ]
- request = {"request_id": "test_0", "messages": messages}
- request = Request.from_dict(request)
-
- mock_parse_chat.return_value = messages
- mock_get_cache.return_value = [self.dummy_processed_image_cache]
-
- mock_text2ids_output = {
- "grid_thw": [(1, 8, 8), (1, 8, 8)],
- "fps": [0, 0],
- "mm_hashes": ["img_cache_hit", "img_to_update"],
- "images": [self.dummy_processed_image_cache[0], self.dummy_processed_image_cache[0]],
- }
- mock_text2ids.return_value = mock_text2ids_output
- self.mock_tokenizer.apply_chat_template.return_value = "<|IMAGE_PLACEHOLDER|><|IMAGE_PLACEHOLDER|>"
-
- self.processor.request2ids(request)
-
- self.mock_zmq_context.socket.assert_called_with(zmq.DEALER)
- mock_get_cache.assert_called_once_with(self.mock_zmq_socket, ["img_cache_hit"])
-
- parsed_images = mock_text2ids.call_args[0][1]
- self.assertIs(parsed_images[0], self.dummy_processed_image_cache)
- self.assertIs(parsed_images[1], self.dummy_image)
-
- expected_hash_to_cache = ["img_to_update"]
- expected_item_to_cache = (self.dummy_processed_image_cache[0], {"thw": (1, 8, 8), "fps": 0})
- mock_update_cache.assert_called_once()
- self.assertEqual(mock_update_cache.call_args[0][1], expected_hash_to_cache)
- self.assertEqual(mock_update_cache.call_args[0][2][0][1], expected_item_to_cache[1])
- np.testing.assert_array_equal(mock_update_cache.call_args[0][2][0][0], expected_item_to_cache[0])
-
- @patch(f"{MODULE_PATH}.DataProcessor.text2ids")
- @patch(f"{MODULE_PATH}.parse_chat_messages")
- def test_request2ids_unsupported_type(self, mock_parse_chat, mock_text2ids):
- """新增:测试 request2ids 静默忽略不支持的类型"""
- messages = [
- {
- "role": "user",
- "content": [{"type": "text", "text": "Hello"}, {"type": "audio", "data": "...", "uuid": "audio1"}],
- }
- ]
- request = {"request_id": "test_0", "messages": messages}
- request = Request.from_dict(request)
-
- mock_parse_chat.return_value = messages
- self.mock_tokenizer.apply_chat_template.return_value = "User: Hello "
-
- self.processor.request2ids(request)
-
- mock_text2ids.assert_called_once()
- call_args = mock_text2ids.call_args[0]
- self.assertEqual(call_args[1], []) # images
- self.assertEqual(call_args[2], []) # videos
- self.assertEqual(call_args[3], []) # image_uuid
- self.assertEqual(call_args[4], []) # video_uuid
-
-
-class TestPaddleOCR_VL_ImageProcessor(unittest.TestCase):
- def setUp(self):
- # 初始化默认参数
- self.default_params = {
- "do_resize": True,
- "resample": 3,
- "do_rescale": True,
- "rescale_factor": 1 / 255,
- "do_normalize": True,
- "image_mean": [0.48145466, 0.4578275, 0.40821073],
- "image_std": [0.26862954, 0.26130258, 0.27577711],
- "do_convert_rgb": True,
- "min_pixels": 28 * 28 * 130,
- "max_pixels": 28 * 28 * 1280,
- "patch_size": 14,
- "temporal_patch_size": 1,
- "merge_size": 2,
- }
-
- # 创建测试图像
- self.test_image = Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8))
-
- def test_initialization(self):
- """测试初始化参数是否正确设置"""
- processor = ImageProcessor(**self.default_params)
-
- for param, value in self.default_params.items():
- self.assertEqual(getattr(processor, param), value)
-
- def test_smart_resize(self):
- """测试智能调整图像大小功能"""
- # 测试正常尺寸调整
- h, w = smart_resize(224, 224, factor=28)
- self.assertEqual(h % 28, 0)
- self.assertEqual(w % 28, 0)
-
- # 测试小尺寸调整
- h, w = smart_resize(20, 20, factor=28)
- self.assertGreaterEqual(h, 28)
- self.assertGreaterEqual(w, 28)
-
- # 测试超大尺寸调整
- h, w = smart_resize(2000, 2000, factor=28)
- self.assertLess(h * w, 28 * 28 * 1280)
-
- def test_preprocess_single_image(self):
- """测试单张图像预处理流程"""
- processor = ImageProcessor(**self.default_params)
-
- # 测试正常预处理
- result = processor.preprocess(self.test_image)
- self.assertIn("pixel_values", result)
- self.assertIn("grid_thw", result)
- self.assertEqual(result["pixel_values"].ndim, 4) # [N, C, H, W]
-
- # 测试关闭某些预处理步骤
- result = processor.preprocess(self.test_image, do_resize=False, do_normalize=False)
- self.assertIn("pixel_values", result)
-
- def test_preprocess_batch_images(self):
- """测试批量图像预处理"""
- processor = ImageProcessor(**self.default_params)
- batch_images = [self.test_image, self.test_image]
-
- result = processor.preprocess(batch_images)
- expected_shape = 1152
- self.assertEqual(result["pixel_values"].shape[0], expected_shape)
-
- def test_invalid_input(self):
- """测试无效输入处理"""
- processor = ImageProcessor(**self.default_params)
-
- # 测试无效图像
- with self.assertRaises(ValueError):
- processor.preprocess("invalid_image")
-
- # 测试视频输入(暂不支持)
- with self.assertRaises(NotImplementedError):
- processor.preprocess(self.test_image, videos=["video"])
-
- def test_from_pretrained(self):
- """测试从预训练模型加载配置"""
- with patch("builtins.open", unittest.mock.mock_open(read_data='{"do_resize": false}')) as mock_file:
- processor = ImageProcessor.from_pretrained("dummy_path")
- self.assertFalse(processor.do_resize)
- mock_file.assert_called_once()
-
-
-class TestPaddleOCRVLProcessor(unittest.TestCase):
- def setUp(self):
- # 创建 PaddleOCRVLProcessor 实例的模拟对象
- with patch.object(PaddleOCRVLProcessor, "__init__", return_value=None):
- self.processor = PaddleOCRVLProcessor("model_path")
-
- # 设置必要的属性
- self.processor.tokenizer = MagicMock()
- self.processor.tokenizer.eos_token_id = 1
- self.processor.processor = MagicMock()
- self.processor.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
- self.processor.eos_token_ids = [1]
- self.processor.reasoning_parser = None
- self.processor.model_status_dict = {}
-
- # 模拟 _apply_default_parameters
- def mock_apply_default_parameters(request_or_dict):
- if isinstance(request_or_dict, dict):
- if "top_p" not in request_or_dict:
- request_or_dict["top_p"] = 0.9
- return request_or_dict
-
- if not hasattr(request_or_dict, "top_p"):
- request_or_dict.top_p = 0.9
- return request_or_dict
-
- self.processor._apply_default_parameters = mock_apply_default_parameters
-
- # 模拟 pack_outputs
- def mock_pack_outputs(outputs):
- # 简化 position_ids 的处理
- position_ids_list = outputs["position_ids"]
- if not position_ids_list:
- position_ids = np.array([], dtype=np.int64)
- elif isinstance(position_ids_list[0], list):
- position_ids = np.array(position_ids_list, dtype=np.int64)
- else:
- position_ids = np.concatenate(position_ids_list, axis=1, dtype=np.int64)
-
- if position_ids.ndim == 1:
- position_ids = position_ids.reshape(1, -1)
-
- # 源码的 pack_outputs 会 transpose
- position_ids = position_ids.transpose(1, 0)
-
- return {
- "input_ids": np.array(outputs["input_ids"], dtype=np.int64),
- "token_type_ids": np.array(outputs["token_type_ids"], dtype=np.int64),
- "position_ids": position_ids,
- "images": np.vstack(outputs["images"]) if outputs.get("images") else None,
- "grid_thw": np.vstack(outputs["grid_thw"]) if outputs.get("grid_thw") else None,
- "image_type_ids": np.array(outputs["image_type_ids"]) if outputs.get("image_type_ids") else None,
- }
-
- self.processor.pack_outputs = mock_pack_outputs
- self.processor.np = np
-
- # 模拟 _SAMPLING_EPS 常量
- self.processor._SAMPLING_EPS = 1e-5
-
- # 模拟 processor 返回 (position_ids 必须是 2D array 的 list)
- self.processor.processor.text2ids.return_value = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)], # 修正
- "images": ["image_feature"],
- "grid_thw": ["grid_feature"],
- "image_type_ids": [0],
- "cur_position": 3,
- }
-
- self.processor.processor.request2ids.return_value = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)], # 修正
- "images": ["image_feature"],
- "grid_thw": ["grid_feature"],
- "image_type_ids": [0],
- "cur_position": 3,
- }
-
- # 模拟 _compute_text_positions 方法 (返回 2D array)
- self.processor.processor._compute_text_positions = lambda pos, num: np.array(
- [list(range(pos, pos + num))], dtype=np.int64
- )
-
- # 模拟 update_stop_seq
- self.processor.update_stop_seq = MagicMock(return_value=([[99, 98]], [2]))
-
- # 模拟 pack_outputs 需要的属性
- self.processor.processor.image_token_id = 100
- self.processor.processor.video_token_id = 101
-
- def test_process_request_dict_basic(self):
- """测试基本请求处理功能"""
- request = {
- "request_id": "test_request",
- "prompt": "test prompt",
- "multimodal_data": {"image": ["image1"]},
- "metadata": {"generated_token_ids": []},
- }
- request = Request.from_dict(request)
-
- result = self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(result.prompt_token_ids, [1, 2, 3])
- self.assertEqual(result.prompt_token_ids_len, 3)
- self.assertTrue(hasattr(result, "multimodal_inputs"))
-
- def test_process_request_dict_with_messages(self):
- """测试 messages 格式的请求处理"""
- request = {
- "request_id": "test_0",
- "messages": [
- {
- "role": "user",
- "content": [{"type": "text", "text": "Hello"}, {"type": "image_url", "url": "image1"}],
- }
- ],
- "metadata": {"generated_token_ids": []},
- }
- request = Request.from_dict(request)
-
- result = self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(result.prompt_token_ids, [1, 2, 3])
- self.assertTrue(hasattr(result, "multimodal_inputs"))
-
- def test_process_request_dict_with_max_len(self):
- """测试最大长度限制功能"""
- request = {
- "request_id": "test_0",
- "prompt": "test prompt",
- "multimodal_data": {"image": ["image1"]},
- "metadata": {"generated_token_ids": []},
- }
- request = Request.from_dict(request)
-
- # 模拟 processor 返回长序列
- self.processor.processor.text2ids.return_value = {
- "input_ids": list(range(100)),
- "token_type_ids": [0] * 100,
- "position_ids": [np.array([list(range(100))], dtype=np.int64)],
- "images": ["image_feature"],
- "grid_thw": ["grid_feature"],
- "image_type_ids": [0],
- "cur_position": 100,
- }
-
- max_model_len = 50
- result = self.processor.process_request_dict(request, max_model_len)
- # 验证是否截断到 max_model_len - 1
- self.assertEqual(len(result.prompt_token_ids), max_model_len - 1)
- self.assertEqual(result.prompt_token_ids, list(range(49)))
- # 验证原始输入长度确实超过了限制
- self.assertGreater(len(self.processor.processor.text2ids.return_value["input_ids"]), max_model_len)
-
- def test_parse_processor_kwargs(self):
- """测试处理器参数解析"""
- valid_kwargs = {"video_max_frames": 10, "video_min_frames": 1}
- result = self.processor._parse_processor_kwargs(valid_kwargs)
- self.assertEqual(result, valid_kwargs)
-
- # 测试无效参数
- invalid_kwargs = {"video_max_frames": "invalid"}
- with patch(
- "fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor.data_processor_logger"
- ) as mock_logger:
- result = self.processor._parse_processor_kwargs(invalid_kwargs)
- self.assertEqual(result, {})
- # 确认警告已被记录
- mock_logger.warning.assert_called()
-
- def test_parse_limits(self):
- """测试输入限制解析"""
- custom_limits = {"image": 2, "video": 3}
- result = self.processor._parse_limits(custom_limits)
- self.assertEqual(result["image"], 2)
- self.assertEqual(result["video"], 3)
- self.assertEqual(result["audio"], 1) # 默认值
-
- def test_check_mm_limits(self):
- """测试多模态输入限制检查 (dict path)"""
- # 测试不超限
- item = {"image": ["image1"], "video": ["video1"]}
- self.processor._check_mm_limits(item)
-
- # 测试超限
- item_exceeded = {"image": ["image1", "image2"], "video": ["video1"]}
- with self.assertRaises(ValueError):
- self.processor._check_mm_limits(item_exceeded)
-
- def test_parse_processor_kwargs_invalid_type(self):
- """测试 _parse_processor_kwargs 传入非字典类型"""
- invalid_input = ["video_max_frames", 10]
- with patch(
- "fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor.data_processor_logger"
- ) as mock_logger:
- result = self.processor._parse_processor_kwargs(invalid_input)
- self.assertEqual(result, {}) # 触发
- mock_logger.warning.assert_called()
-
- def test_parse_limits_invalid_type(self):
- """测试 _parse_limits 传入非字典类型"""
- invalid_input = ["image", 2]
- with patch(
- "fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor.data_processor_logger"
- ) as mock_logger:
- result = self.processor._parse_limits(invalid_input)
- # 应返回默认值
- self.assertEqual(result, {"image": 1, "video": 1, "audio": 1})
- mock_logger.warning.assert_called()
-
- def test_check_mm_limits_messages_path(self):
- """测试 _check_mm_limits (messages path)"""
- messages = [
- {"role": "user", "content": [{"type": "text", "text": "Hello"}, {"type": "image_url", "url": "image1"}]}
- ]
- self.processor._check_mm_limits(messages) # 不应抛出异常
-
- def test_check_mm_limits_messages_exceeded(self):
- """测试 _check_mm_limits (messages path) 超限"""
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": "Hello"},
- {"type": "image_url", "url": "image1"},
- {"type": "image_url", "url": "image2"}, # 超过限制 1
- ],
- }
- ]
- with self.assertRaises(ValueError):
- self.processor._check_mm_limits(messages)
-
- def test_process_request_dict_no_prompt_or_messages(self):
- """测试当请求既没有 prompt 也没有 messages 时抛出异常"""
- request = {"request_id": "test_0", "metadata": {"generated_token_ids": []}}
- request = Request.from_dict(request)
- with self.assertRaises(ValueError):
- self.processor.process_request_dict(request, max_model_len=512)
-
- def test_process_request_dict_with_continuation(self):
- """测试续写逻辑 (metadata 包含 generated_token_ids)"""
- request = {
- "request_id": "test_0",
- "prompt": "test prompt",
- "multimodal_data": {"image": ["image1"]},
- "metadata": {"generated_token_ids": [10, 11, 12]}, # 已生成的 token
- }
- request = Request.from_dict(request)
- setattr(request, "metadata", {"generated_token_ids": [10, 11, 12]})
-
- result = self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(result.prompt_token_ids, [1, 2, 3, 10, 11, 12])
- self.assertEqual(result.prompt_token_ids_len, 6)
-
- def test_process_request_dict_with_stop_sequences(self):
- """测试 stop_sequences 处理"""
- request = {
- "request_id": "test_0",
- "prompt": "test prompt",
- "stop": ["stop1", "stop2"],
- "metadata": {"generated_token_ids": []},
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, max_model_len=512)
-
- # 验证 update_stop_seq 被调用
- self.processor.update_stop_seq.assert_called_with(["stop1", "stop2"])
- # 验证结果被设置到 request 中
- self.assertEqual(result.sampling_params.stop_token_ids, [[99, 98]])
- self.assertEqual(result.sampling_params.stop_seqs_len, [2])
-
- def test_process_request_dictefault_max_tokens(self):
- """测试默认 max_tokens 计算"""
- request = {
- "request_id": "test_0",
- "prompt": "test prompt",
- "metadata": {"generated_token_ids": []},
- } # 长度为 3
- request = Request.from_dict(request)
- max_model_len = 10
- result = self.processor.process_request_dict(request, max_model_len)
-
- self.assertEqual(result.sampling_params.max_tokens, 7)
-
- def test_process_request_dict_top_p_clamping(self):
- """测试 top_p 值被修正 (clamping)"""
- request = {
- "request_id": "test_0",
- "prompt": "test prompt",
- "top_p": 0.0, # 低于 _SAMPLING_EPS
- "metadata": {"generated_token_ids": []},
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(result.sampling_params.top_p, self.processor._SAMPLING_EPS)
-
- def test_append_generated_tokens(self):
- """直接测试 append_generated_tokens 辅助函数"""
- # : position_ids 必须是 [2D array]
- multimodal_inputs = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)],
- "cur_position": 3,
- }
- generated_token_ids = [10, 11]
-
- # 调用 append_generated_tokens (它是 PaddleOCRVLProcessor 的方法)
- PaddleOCRVLProcessor.append_generated_tokens(self.processor, multimodal_inputs, generated_token_ids)
-
- self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 10, 11])
- self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0])
- # : 检查 position_ids 是否为 [np.array(...), np.array(...)]
- self.assertEqual(len(multimodal_inputs["position_ids"]), 2)
- self.assertTrue(np.array_equal(multimodal_inputs["position_ids"][0], np.array([[0, 1, 2]], dtype=np.int64)))
- self.assertTrue(np.array_equal(multimodal_inputs["position_ids"][1], np.array([[3, 4]], dtype=np.int64)))
- self.assertEqual(multimodal_inputs["cur_position"], 5)
-
- def test_pack_outputs_real_no_images(self):
- """测试真实的 pack_outputs 方法 (无图像)"""
- outputs = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- # : position_ids 必须是 [2D array]
- "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)],
- "images": [], # 空列表
- "grid_thw": [],
- "image_type_ids": [],
- "cur_position": 3,
- }
-
- # 调用真实的类方法,而不是 setUp 中 mock 的实例方法
- result = PaddleOCRVLProcessor.pack_outputs(self.processor, outputs)
-
- self.assertIsNone(result["images"])
- self.assertIsNone(result["grid_thw"])
- self.assertIsNone(result["image_type_ids"])
- self.assertTrue(np.array_equal(result["input_ids"], np.array([1, 2, 3], dtype=np.int64)))
- # 验证 position_ids 被 concatenate 和 transpose
- # input: [array([[0, 1, 2]])] -> concat: array([[0, 1, 2]]) (shape 1,3) -> transpose: array([[0], [1], [2]]) (shape 3,1)
- self.assertTrue(np.array_equal(result["position_ids"], np.array([[0], [1], [2]], dtype=np.int64)))
- self.assertEqual(result["image_patch_id"], 100)
- self.assertEqual(result["video_patch_id"], 101)
-
- def test_pack_outputs_real_with_images(self):
- """测试真实的 pack_outputs 方法 (有图像)"""
- image_feature = np.array([[0.1, 0.2]])
- grid_feature = np.array([[1, 2, 3]])
-
- outputs = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- # : position_ids 必须是 [2D array]
- "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)],
- "images": [image_feature],
- "grid_thw": [grid_feature],
- "image_type_ids": [0],
- "cur_position": 3,
- }
-
- result = PaddleOCRVLProcessor.pack_outputs(self.processor, outputs)
-
- self.assertTrue(np.array_equal(result["images"], image_feature))
- self.assertTrue(np.array_equal(result["grid_thw"], grid_feature))
- self.assertTrue(np.array_equal(result["image_type_ids"], np.array([0])))
- self.assertTrue(np.array_equal(result["position_ids"], np.array([[0], [1], [2]], dtype=np.int64)))
-
- def test_think_status(self):
- """测试 思考机制"""
- request = {
- "prompt": "hello",
- "request_id": "test_1",
- "prompt_token_ids": [1, 2, 3],
- }
- request = Request.from_dict(request)
- self.processor.reasoning_parser = MagicMock()
- self.processor.reasoning_parser.get_model_status.return_value = "think_start"
- self.processor.model_status_dict = {}
- self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(request.enable_thinking, True)
-
- request = {
- "prompt": "hello",
- "request_id": "test_2",
- "prompt_token_ids": [1, 2, 3],
- }
- request = Request.from_dict(request)
- self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(request.enable_thinking, True)
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/input/v1/test_process_video.py b/tests/input/v1/test_process_video.py
deleted file mode 100644
index b8777cb0a76..00000000000
--- a/tests/input/v1/test_process_video.py
+++ /dev/null
@@ -1,386 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import io
-import math
-import os
-import tempfile
-import unittest
-from unittest.mock import patch
-
-import numpy as np
-from PIL import Image as PILImage
-
-import fastdeploy.input.v1.ernie4_5_vl_processor.process_video as process_video_module
-from fastdeploy.input.v1.ernie4_5_vl_processor.process_video import (
- get_frame_indices,
- read_frames_decord,
- read_video_decord,
-)
-
-
-class _MockFrame:
- """Lightweight frame wrapper that mimics the real frame object."""
-
- def __init__(self, arr):
- self._arr = arr
-
- def asnumpy(self):
- """Return the underlying numpy array."""
- return self._arr
-
-
-class MockVideoReaderWrapper:
- """
- Simple mock implementation of a video reader:
-
- - __len__ returns the total number of frames
- - __getitem__ returns a _MockFrame(arr)
- - get_avg_fps() returns fps
- - Specific indices can be configured to raise errors in __getitem__
- """
-
- def __init__(
- self,
- src,
- num_threads=1,
- vlen=12,
- fps=6,
- fail_indices=None,
- h=4,
- w=5,
- c=3,
- ):
- self.src = src
- self._vlen = vlen
- self._fps = fps
- self._fail = set(fail_indices or [])
- self._h, self._w, self._c = h, w, c
-
- def __len__(self):
- return self._vlen
-
- def get_avg_fps(self):
- return self._fps
-
- def __getitem__(self, idx):
- if idx < 0 or idx >= self._vlen:
- raise IndexError("index out of range")
- if idx in self._fail:
- raise ValueError(f"forced fail at {idx}")
- # Create a frame whose pixel value encodes the index (for easy debugging)
- arr = np.zeros((self._h, self._w, self._c), dtype=np.uint8)
- arr[:] = idx % 255
- return _MockFrame(arr)
-
-
-class TestReadVideoDecord(unittest.TestCase):
- def test_read_video_decord_with_wrapper(self):
- """Test passing an existing VideoReaderWrapper instance directly."""
- # Patch VideoReaderWrapper in the target module so isinstance checks use our mock class
- with patch.object(process_video_module, "VideoReaderWrapper", MockVideoReaderWrapper):
- mock_reader = MockVideoReaderWrapper("dummy", vlen=10, fps=5)
- reader, meta, path = read_video_decord(mock_reader, save_to_disk=False)
-
- self.assertIs(reader, mock_reader)
- self.assertEqual(meta["fps"], 5)
- self.assertEqual(meta["num_of_frame"], 10)
- self.assertTrue(math.isclose(meta["duration"], 10 / 5, rel_tol=1e-6))
- # The original reader object should be returned unchanged
- self.assertIs(path, mock_reader)
-
- def test_read_video_decord_with_bytes(self):
- """Test that bytes input is wrapped into BytesIO and passed to VideoReaderWrapper."""
- with patch.object(process_video_module, "VideoReaderWrapper", MockVideoReaderWrapper):
- data = b"\x00\x01\x02\x03"
- reader, meta, path = read_video_decord(data, save_to_disk=False)
-
- self.assertIsInstance(reader, MockVideoReaderWrapper)
- self.assertEqual(meta["fps"], 6)
- self.assertEqual(meta["num_of_frame"], 12)
- self.assertTrue(math.isclose(meta["duration"], 12 / 6, rel_tol=1e-6))
- self.assertIsInstance(path, io.BytesIO)
-
-
-class TestGetFrameIndices(unittest.TestCase):
- def test_by_target_frames_middle(self):
- """Test target_frames mode with 'middle' sampling strategy."""
- vlen = 12
- out = get_frame_indices(
- vlen=vlen,
- target_frames=4,
- target_fps=-1,
- frames_sample="middle",
- input_fps=-1,
- )
- # 12 frames split into 4 segments -> midpoints [1, 4, 7, 10]
- self.assertEqual(out, [1, 4, 7, 10])
-
- def test_by_target_frames_leading(self):
- """Test target_frames mode with 'leading' sampling strategy."""
- vlen = 10
- out = get_frame_indices(
- vlen=vlen,
- target_frames=5,
- target_fps=-1,
- frames_sample="leading",
- input_fps=-1,
- )
- # 10 frames split into 5 segments -> segment starts [0, 2, 4, 6, 8]
- self.assertEqual(out, [0, 2, 4, 6, 8])
-
- def test_by_target_frames_rand(self):
- """Test target_frames mode with 'rand' sampling strategy."""
- vlen = 10
- out = get_frame_indices(
- vlen=vlen,
- target_frames=4,
- target_fps=-1,
- frames_sample="rand",
- input_fps=-1,
- )
- self.assertEqual(len(out), 4)
- self.assertTrue(all(0 <= i < vlen for i in out))
-
- def test_by_target_frames_fix_start(self):
- """Test target_frames mode with a fixed start offset."""
- vlen = 10
- out = get_frame_indices(
- vlen=vlen,
- target_frames=5,
- target_fps=-1,
- frames_sample="middle", # overridden by fix_start
- fix_start=1,
- input_fps=-1,
- )
- # Segment starts [0, 2, 4, 6, 8] -> +1 => [1, 3, 5, 7, 9]
- self.assertEqual(out, [1, 3, 5, 7, 9])
-
- def test_target_frames_greater_than_vlen(self):
- """Test that target_frames > vlen falls back to using vlen samples."""
- vlen = 5
- out = get_frame_indices(
- vlen=vlen,
- target_frames=10,
- target_fps=-1,
- frames_sample="middle",
- input_fps=-1,
- )
- self.assertEqual(len(out), vlen)
- self.assertTrue(all(0 <= i < vlen for i in out))
-
- def test_by_target_fps_middle(self):
- """Test target_fps mode with 'middle' sampling strategy."""
- vlen, in_fps = 12, 6
- out = get_frame_indices(
- vlen=vlen,
- target_frames=-1,
- target_fps=2,
- frames_sample="middle",
- input_fps=in_fps,
- )
- # Roughly 4 frames expected
- self.assertTrue(3 <= len(out) <= 5)
- self.assertTrue(all(0 <= i < vlen for i in out))
-
- def test_by_target_fps_leading(self):
- """Test target_fps mode with 'leading' sampling strategy."""
- vlen, in_fps = 12, 6
- out = get_frame_indices(
- vlen=vlen,
- target_frames=-1,
- target_fps=2,
- frames_sample="leading",
- input_fps=in_fps,
- )
- self.assertTrue(3 <= len(out) <= 5)
- self.assertTrue(all(0 <= i < vlen for i in out))
-
- def test_by_target_fps_rand(self):
- """Test target_fps mode with 'rand' sampling strategy."""
- vlen, in_fps = 12, 6
- out = get_frame_indices(
- vlen=vlen,
- target_frames=-1,
- target_fps=2,
- frames_sample="rand",
- input_fps=in_fps,
- )
- self.assertTrue(3 <= len(out) <= 5)
- self.assertTrue(all(0 <= i < vlen for i in out))
-
- def test_invalid_both_negative(self):
- """Test that both target_frames and target_fps being negative raises ValueError."""
- with self.assertRaises(ValueError):
- get_frame_indices(
- vlen=10,
- target_frames=-1,
- target_fps=-1,
- frames_sample="middle",
- )
-
- def test_invalid_both_specified(self):
- """Test that specifying both target_frames and target_fps raises AssertionError."""
- with self.assertRaises(AssertionError):
- get_frame_indices(
- vlen=10,
- target_frames=4,
- target_fps=2,
- frames_sample="middle",
- input_fps=6,
- )
-
- def test_invalid_target_fps_missing_input(self):
- """Test that target_fps > 0 with invalid input_fps raises AssertionError."""
- with self.assertRaises(AssertionError):
- get_frame_indices(
- vlen=10,
- target_frames=-1,
- target_fps=2,
- frames_sample="middle",
- input_fps=-1,
- )
-
-
-class TestReadFramesDecord(unittest.TestCase):
- def test_basic_read_no_save(self):
- """Test normal frame reading without saving to disk."""
- reader = MockVideoReaderWrapper("dummy", vlen=8, fps=4)
- meta = {"fps": 4, "duration": 8 / 4, "num_of_frame": 8}
-
- ret, idxs, ts = read_frames_decord(
- video_path="dummy",
- video_reader=reader,
- video_meta=meta,
- target_frames=4,
- frames_sample="middle",
- save_to_disk=False,
- )
-
- # Should return 4 PIL.Image instances
- self.assertEqual(len(ret), 4)
- for img in ret:
- self.assertIsInstance(img, PILImage.Image)
-
- self.assertEqual(idxs, [0, 2, 4, 6])
- dur = meta["duration"]
- n = meta["num_of_frame"]
- for i, t in zip(idxs, ts):
- self.assertTrue(math.isclose(t, i * dur / n, rel_tol=1e-6))
-
- def test_read_and_save_to_disk(self):
- """Test reading frames and saving them as PNG files on disk."""
- reader = MockVideoReaderWrapper("dummy", vlen=4, fps=2)
- meta = {"fps": 2, "duration": 4 / 2, "num_of_frame": 4}
-
- with (
- tempfile.TemporaryDirectory() as tmpdir,
- patch.object(
- process_video_module,
- "get_filename",
- return_value="det_id",
- ),
- ):
- ret, idxs, ts = read_frames_decord(
- video_path="dummy",
- video_reader=reader,
- video_meta=meta,
- target_frames=2,
- frames_sample="leading",
- save_to_disk=True,
- cache_dir=tmpdir,
- )
-
- self.assertEqual(len(ret), 2)
- for i, pth in enumerate(ret):
- self.assertIsInstance(pth, str)
- self.assertTrue(os.path.exists(pth))
- self.assertEqual(os.path.basename(pth), f"{i}.png")
-
- def test_fallback_previous_success(self):
- """Test that a failed frame read falls back to a previous valid frame when possible."""
- reader = MockVideoReaderWrapper("dummy", vlen=10, fps=5, fail_indices={3})
- meta = {"fps": 5, "duration": 10 / 5, "num_of_frame": 10}
- idxs = [1, 2, 3, 6]
-
- ret, new_idxs, ts = read_frames_decord(
- video_path="dummy",
- video_reader=reader,
- video_meta=meta,
- frame_indices=idxs.copy(),
- save_to_disk=False,
- tol=5,
- )
-
- # Index 3 fails and should be replaced by 2 or 4 (previous/next search)
- self.assertIn(new_idxs[2], (2, 4))
- self.assertEqual(len(ret), 4)
-
- def test_fallback_next_when_prev_fails(self):
- """Test that when current and previous frames fail, a later frame is used as fallback."""
- reader = MockVideoReaderWrapper("dummy", vlen=10, fps=5, fail_indices={2, 3})
- meta = {"fps": 5, "duration": 10 / 5, "num_of_frame": 10}
- idxs = [1, 2, 3, 6]
-
- ret, new_idxs, ts = read_frames_decord(
- video_path="dummy",
- video_reader=reader,
- video_meta=meta,
- frame_indices=idxs.copy(),
- save_to_disk=False,
- tol=5,
- )
-
- # Frame 3 should eventually be replaced by 4
- self.assertEqual(new_idxs[2], 4)
- self.assertEqual(len(ret), 4)
-
- def test_len_assert_when_no_fallback(self):
- """Test that assertion is triggered when no valid fallback frame can be found."""
-
- class FailAllAroundReader(MockVideoReaderWrapper):
- """Reader that fails on index 1 and has too small length to find fallback."""
-
- def __init__(self, *a, **kw):
- super().__init__(*a, **kw)
- self._vlen = 2
- self._fps = 2
- self._fail = {1}
-
- def __getitem__(self, idx):
- if idx in self._fail:
- raise ValueError("fail hard")
- return super().__getitem__(idx)
-
- reader = FailAllAroundReader("dummy")
- meta = {"fps": 2, "duration": 2 / 2, "num_of_frame": 2}
-
- # Request 2 frames: index 0 succeeds, index 1 always fails,
- # and tol=0 disallows searching neighbors -> stack and length assertion should fail
- with self.assertRaises(AssertionError):
- read_frames_decord(
- video_path="dummy",
- video_reader=reader,
- video_meta=meta,
- target_frames=2,
- frames_sample="leading",
- save_to_disk=False,
- tol=0,
- )
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/input/v1/test_qwen3_vl_processor.py b/tests/input/v1/test_qwen3_vl_processor.py
deleted file mode 100644
index c858f1e51db..00000000000
--- a/tests/input/v1/test_qwen3_vl_processor.py
+++ /dev/null
@@ -1,1172 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import copy
-import unittest
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.qwen3_vl_processor import Qwen3VLProcessor
-from fastdeploy.input.v1.qwen3_vl_processor.process import sample_frames
-
-
-def mock_pil_image(height, width):
- """
- Generate mock random RGB image
-
- Args:
- height: Image height in pixels
- width: Image width in pixels
-
- Returns:
- PIL.Image object with random RGB data
- """
- rgb_image = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
- return Image.fromarray(rgb_image)
-
-
-def mock_read_frames(height: int, width: int, nums_frame: int, fps: int):
- """
- Generate mock video frames with metadata for testing purposes
-
- Creates synthetic video data by generating random RGB frames and constructing
- corresponding metadata to simulate real video processing.
-
- Args:
- height (int): Height of video frames in pixels
- width (int): Width of video frames in pixels
- nums_frame (int): Number of frames to generate
- fps (int): Frames per second for the mock video
-
- Returns:
- tuple: A tuple containing:
- frames (numpy.ndarray): Array of shape (nums_frame, height, width, 3)
- containing randomly generated RGB frames
- meta (dict): Dictionary with video metadata:
- - fps (int): Frames per second (same as input)
- - duration (float): Calculated duration in seconds (nums_frame/fps)
- - num_of_frame (int): Number of frames (same as nums_frame input)
- """
- frames = []
- for _ in range(nums_frame):
- frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
- frames.append(frame)
- frames = np.stack(frames, axis=0)
-
- meta = {
- "fps": fps,
- "duration": nums_frame / fps,
- "num_of_frame": nums_frame,
- }
- return frames, meta
-
-
-class TestQwen3VLProcessor(unittest.TestCase):
- """
- Unit tests for Qwen Vision-Language Processor functionality
- """
-
- def setUp(self):
- """
- Initialize test case with:
- - Mock configuration
- - Patched message parsing and video processing methods
- - QwenVLProcessor instance with test parameters
- """
- config = MagicMock()
- config.vision_config.tokens_per_second = 2
-
- self.patcher_parse_image = patch(
- "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_image", return_value=mock_pil_image(480, 640)
- )
- self.patcher_parse_image.start()
-
- self.patcher_parse_video = patch(
- "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_video", return_value=b"123"
- )
- self.patcher_parse_video.start()
-
- self.patcher_read_frames = patch(
- "fastdeploy.input.v1.qwen3_vl_processor.process.DataProcessor._load_and_process_video",
- return_value=mock_read_frames(480, 640, 5, 2),
- )
- self.patcher_read_frames.start()
-
- mm_processor_kwargs = {"video_max_frames": 10, "video_min_frames": 1}
- limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
-
- self.model_name_or_path = "/ModelData/Qwen3-VL-4B-Instruct"
- self.processor = Qwen3VLProcessor(
- config=config,
- model_name_or_path=self.model_name_or_path,
- limit_mm_per_prompt=limit_mm_per_prompt,
- mm_processor_kwargs=mm_processor_kwargs,
- reasoning_parser_obj=None,
- tool_parser_obj=None,
- )
-
- def tearDown(self) -> None:
- """Clean up test case by stopping all mock patches"""
- self.patcher_read_frames.stop()
- self.patcher_parse_image.stop()
- self.patcher_parse_video.stop()
-
- def test_process_request_dict(self):
- """
- Test processing of dictionary-format request with multimodal input
-
- Validates:
- 1. Token ID lengths match position_ids and token_type_ids shapes
- 2. Image processing produces expected output dimensions
- 3. Video processing produces expected output dimensions
- 4. Correct counts for images (1) and videos (1)
- """
- num_completion_token_ids = 10
- request = {
- "request_id": "12345",
- "completion_token_ids": [1] * num_completion_token_ids,
- "stop": ["stop", "eof"],
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
- request = Request.from_dict(request)
-
- result = self.processor.process_request_dict(request, 1024 * 100)
-
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
- self.assertEqual(
- result.multimodal_inputs["images"].shape[0],
- sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
- )
- self.assertEqual(
- result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
- )
-
- def test_prompt(self):
- """
- Test processing of prompt with image and video placeholders
-
- Validates:
- 1. Token ID lengths match position_ids and token_type_ids shapes
- 2. Image processing produces expected output dimensions
- 3. Video processing produces expected output dimensions
- 4. Correct counts for images (1) and videos (1)
- """
- IMAGE_PLACEHOLDER = "<|image_pad|>"
- VIDEO_PLACEHOLDER = "<|video_pad|>"
- prompt = {
- "request_id": "12345",
- "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
- "multimodal_data": {
- "image": [mock_pil_image(10, 2100)],
- "video": [{"video": b"123", "fps": 5}],
- },
- }
-
- request = Request.from_dict(prompt)
- result = self.processor.process_request_dict(request, 1024 * 100)
-
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
- self.assertEqual(
- result.multimodal_inputs["images"].shape[0],
- sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
- )
- self.assertEqual(
- result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
- )
-
- def test_message_and_prompt(self):
- """
- Test consistency between message-based and prompt-based processing
-
- Validates that processing a request through:
- 1. The message format (with image/video URLs)
- 2. The prompt format (with direct image/video data)
- produces identical tokenization and multimodal input results.
-
- Checks:
- 1. Prompt token IDs match between both processing methods
- 2. Grid dimensions (THW) match between both methods
- 3. Position IDs match between both methods
- """
- # Create test request in message format
- request = {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024 * 100)
-
- # Create equivalent request in prompt format
- prompt = {
- "request_id": "12345",
- "prompt": request.prompt_tokens,
- "multimodal_data": {
- "image": [mock_pil_image(480, 640)],
- "video": [{"video": b"123"}],
- },
- }
- request2 = Request.from_dict(prompt)
- result2 = self.processor.process_request_dict(request2, 1024 * 100)
-
- # Verify both processing methods produce identical results
- self.assertEqual(result.prompt_token_ids, result2.prompt_token_ids)
- self.assertTrue(np.equal(result.multimodal_inputs["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
- self.assertTrue(
- np.equal(result.multimodal_inputs["position_ids"], result2.multimodal_inputs["position_ids"]).all()
- )
-
- def test_apply_chat_template(self):
- """
- Test the consistency between:
- 1. Directly applying chat template using HuggingFace tokenizer
- 2. Applying chat template through the processor's request processing
-
- This test verifies that:
- - The processor correctly handles multimodal messages (image, video, text)
- - The prompt_tokens field matches the output from direct tokenizer application
- - The chat template application preserves the message structure and content
-
- Test Steps:
- 1. Create sample multimodal messages with image, video and text content
- 2. Apply chat template directly using the tokenizer
- 3. Process the same messages through the processor
- 4. Compare the outputs to ensure consistency
- """
- from transformers import AutoTokenizer
-
- tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
-
- # Sample multimodal messages containing image, video and text
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ]
-
- # Apply chat template directly using the tokenizer
- prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
- # Create equivalent request dictionary
- request = {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
- request = Request.from_dict(request)
-
- # Process request through the processor
- self.processor.process_request_dict(request, 1024 * 100)
- prompt2 = request.prompt_tokens
-
- # Verify both methods produce identical prompt strings
- self.assertEqual(prompt, prompt2)
-
- def test_add_processed_image(self):
- """
- Test DataProcessor._add_processed_image via Qwen3VLProcessor
- """
- merge_size = self.processor.processor.image_processor.merge_size
-
- # shape[0] must be divisible by merge_size^2
- num_tokens = 4
- img = np.zeros(
- (num_tokens * merge_size * merge_size, 3, 3),
- dtype=np.float32,
- )
- meta = {
- "thw": (1, 8, 8),
- }
- uuid = "test-image-uuid"
-
- img_cache = (img, meta)
-
- outputs = {
- "mm_positions": [],
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "cur_position": 5,
- "images": [],
- "mm_hashes": [],
- "grid_thw": [],
- "image_type_ids": [],
- "fps": [],
- }
-
- # -----------------------
- # mock vision position computation
- # -----------------------
- dp = self.processor.processor
- dp._compute_vision_positions = MagicMock(return_value=np.array([[10, 11, 12]], dtype=np.int64))
-
- dp._add_processed_image(img_cache, outputs, uuid)
-
- # ---- input_ids / token_type_ids ----
- self.assertEqual(len(outputs["input_ids"]), num_tokens)
- self.assertEqual(
- outputs["input_ids"],
- [dp.image_token_id] * num_tokens,
- )
-
- # ---- mm_positions ----
- self.assertEqual(len(outputs["mm_positions"]), 1)
- mm_pos = outputs["mm_positions"][0]
- self.assertEqual(mm_pos.length, num_tokens)
-
- # ---- vision positions ----
- dp._compute_vision_positions.assert_called_once_with(5, 1, 8, 8, 0)
- np.testing.assert_array_equal(
- outputs["position_ids"][0],
- np.array([[10, 11, 12]], dtype=np.int64),
- )
- self.assertEqual(outputs["cur_position"], 13)
-
- # ---- image payload ----
- self.assertEqual(len(outputs["images"]), 1)
- np.testing.assert_array_equal(outputs["images"][0], img)
-
- self.assertEqual(outputs["mm_hashes"], [uuid])
- np.testing.assert_array_equal(
- outputs["grid_thw"][0],
- np.array([[1, 8, 8]]),
- )
- self.assertEqual(outputs["image_type_ids"], [0])
- self.assertEqual(outputs["fps"], [0])
-
- def test_multimodal_token_len_validation(self):
- """Test token_len validation for raw and processed multimodal paths"""
- dp = self.processor.processor
- merge_size = dp.image_processor.merge_size
-
- def build_outputs(image=False, video=False):
- outputs = {
- "mm_positions": [],
- "input_ids": [],
- "token_type_ids": [],
- "position_ids": [],
- "cur_position": 0,
- "images": [],
- "mm_hashes": [],
- "grid_thw": [],
- "image_type_ids": [],
- "fps": [],
- }
- if image:
- outputs["num_input_image_tokens"] = 0
- if video:
- outputs["num_input_video_tokens"] = 0
- return outputs
-
- processed_image = (
- np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
- {"thw": (1, 8, 8)},
- )
- processed_video = (
- np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
- {"thw": (2, 8, 8), "fps": 5},
- )
-
- with self.subTest("add_image"):
- with patch.object(
- dp.image_processor,
- "preprocess",
- return_value={
- "grid_thw": np.array([1, merge_size * 2, merge_size * 2]),
- "pixel_values": np.zeros((1, 3, 3), dtype=np.float32),
- },
- ):
- with self.assertRaisesRegex(ValueError, "image tokens num not match the size"):
- dp._add_image(mock_pil_image(32, 32), build_outputs(image=True), None, token_len=3)
-
- with self.subTest("add_processed_image"):
- with self.assertRaisesRegex(ValueError, "image tokens num not match the size"):
- dp._add_processed_image(processed_image, build_outputs(), "uuid", token_len=2)
-
- with self.subTest("add_video"):
- with patch.object(
- dp.image_processor,
- "preprocess",
- return_value={
- "grid_thw": np.array([1, merge_size * 2, merge_size * 2]),
- "pixel_values": np.zeros((1, 3, 3), dtype=np.float32),
- },
- ):
- with self.assertRaisesRegex(ValueError, "video tokens num not match the size"):
- dp._add_video(
- np.zeros((2, 4, 4, 3), dtype=np.uint8),
- {"fps": 4},
- build_outputs(video=True),
- None,
- token_len=3,
- )
-
- with self.subTest("add_processed_video"):
- with self.assertRaisesRegex(ValueError, "video tokens num not match the size"):
- dp._add_processed_video(processed_video, build_outputs(), "uuid", token_len=2)
-
- def test_prompt_token_ids2outputs_error_branches(self):
- """Test prompt_token_ids2outputs error branches with minimal fixtures"""
- dp = self.processor.processor
- request = Request.from_dict(
- {
- "request_id": "12345",
- "prompt_token_ids": [dp.image_token_id],
- "messages": [{"role": "user", "content": [{"type": "image_url", "uuid": "missing-image"}]}],
- }
- )
- parsed_messages = [{"role": "user", "content": {"type": "image", "data": None, "uuid": "missing-image"}}]
-
- with self.subTest("missing_without_cache"):
- with patch(
- "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
- ):
- with self.assertRaisesRegex(ValueError, "Missing items cannot be retrieved without processor cache."):
- dp.prompt_token_ids2outputs(request)
-
- with self.subTest("missing_cache_item_not_found"):
- old_enable_processor_cache = dp.enable_processor_cache
- dp.enable_processor_cache = True
- fake_context = MagicMock()
- fake_context.socket.return_value = MagicMock()
- try:
- with patch(
- "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
- ):
- with patch(
- "fastdeploy.input.v1.qwen3_vl_processor.process.zmq.Context", return_value=fake_context
- ):
- with patch.object(dp, "get_processor_cache", return_value=[None]):
- with self.assertRaisesRegex(ValueError, "Missing item 0 not found in processor cache"):
- dp.prompt_token_ids2outputs(request)
- finally:
- dp.enable_processor_cache = old_enable_processor_cache
-
- with self.subTest("unexpected_multimodal_type"):
-
- class FlakyTypeItem:
- def __init__(self):
- self.calls = 0
-
- def get(self, key, default=None):
- if key == "type":
- self.calls += 1
- return "image" if self.calls == 1 else "audio"
- if key == "data":
- return "bad-data"
- if key == "uuid":
- return "bad-uuid"
- return default
-
- parsed_messages = [{"role": "user", "content": FlakyTypeItem()}]
- with patch(
- "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
- ):
- with self.assertRaisesRegex(ValueError, "Unsupported multimodal type: audio"):
- dp.prompt_token_ids2outputs(request)
-
- def test_prompt_token_ids2outputs_cache_update_paths(self):
- """Test prompt_token_ids2outputs cache update for missing, 1D and 2D grid_thw paths"""
- dp = self.processor.processor
- merge_size = dp.image_processor.merge_size
- old_enable_processor_cache = dp.enable_processor_cache
- dp.enable_processor_cache = True
-
- missing_image = (
- np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
- {"thw": (1, 8, 8)},
- )
- processed_video = (
- np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
- {"thw": (2, 8, 8), "fps": 6},
- )
- parsed_messages = [
- {
- "role": "user",
- "content": [
- {"type": "image", "data": None, "uuid": "missing-image"},
- {"type": "video", "data": {"video": "raw-video", "fps": 4}, "uuid": "raw-video"},
- {"type": "video", "data": processed_video, "uuid": "processed-video"},
- ],
- }
- ]
- request = Request.from_dict(
- {
- "request_id": "12345",
- "prompt_token_ids": [dp.image_token_id, 99, dp.image_token_id, 98, dp.image_token_id],
- "messages": [{"role": "user", "content": [{"type": "text", "text": "unused"}]}],
- }
- )
- fake_socket = MagicMock()
- fake_context = MagicMock()
- fake_context.socket.return_value = fake_socket
-
- try:
- with patch(
- "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
- ):
- with patch("fastdeploy.input.v1.qwen3_vl_processor.process.zmq.Context", return_value=fake_context):
- with patch.object(dp, "_compute_vision_positions", return_value=np.array([[0]], dtype=np.int64)):
- with patch.object(
- dp.image_processor,
- "preprocess",
- return_value={
- "grid_thw": np.array([1, merge_size, merge_size]),
- "pixel_values": np.zeros((1, 3, 3), dtype=np.float32),
- },
- ):
- with patch.object(
- dp, "_load_and_process_video", return_value=mock_read_frames(4, 4, 2, 4)
- ):
- with patch.object(
- dp, "get_processor_cache", return_value=[missing_image]
- ) as cache_get:
- with patch.object(dp, "update_processor_cache") as cache_update:
- outputs = dp.prompt_token_ids2outputs(request)
-
- cache_get.assert_called_once_with(fake_socket, ["missing-image"])
- cache_update.assert_called_once()
- _, cached_hashes, cached_items = cache_update.call_args.args
- self.assertEqual(cached_hashes, ["raw-video", "processed-video"])
- self.assertEqual(cached_items[0][1]["thw"], (1, merge_size, merge_size))
- self.assertEqual(cached_items[1][1]["thw"], (2, 8, 8))
- self.assertEqual(outputs["mm_hashes"], ["missing-image", "raw-video", "processed-video"])
- self.assertEqual(outputs["input_ids"][-1], dp.image_token_id)
- finally:
- dp.enable_processor_cache = old_enable_processor_cache
-
- def test_request2ids_cache_update_paths(self):
- """Test request2ids cache update for missing, 1D and 2D grid_thw paths"""
- dp = self.processor.processor
- merge_size = dp.image_processor.merge_size
- old_enable_processor_cache = dp.enable_processor_cache
- dp.enable_processor_cache = True
-
- missing_image = (
- np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
- {"thw": (1, 8, 8)},
- )
- processed_image = (
- np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32),
- {"thw": (1, 8, 8)},
- )
- parsed_messages = [
- {
- "role": "user",
- "content": [
- {"type": "image", "data": None, "uuid": "missing-image"},
- {"type": "image", "data": processed_image, "uuid": "processed-image"},
- {"type": "video", "data": {"video": "raw-video", "fps": 4}, "uuid": "raw-video"},
- ],
- }
- ]
- request = Request.from_dict(
- {
- "request_id": "12345",
- "messages": [{"role": "user", "content": [{"type": "text", "text": "unused"}]}],
- "add_generation_prompt": True,
- }
- )
- fake_socket = MagicMock()
- fake_context = MagicMock()
- fake_context.socket.return_value = fake_socket
-
- try:
- with patch(
- "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages
- ):
- with patch("fastdeploy.input.v1.qwen3_vl_processor.process.zmq.Context", return_value=fake_context):
- with patch.object(dp, "_compute_vision_positions", return_value=np.array([[0]], dtype=np.int64)):
- with patch.object(
- dp.image_processor,
- "preprocess",
- return_value={
- "grid_thw": np.array([1, merge_size, merge_size]),
- "pixel_values": np.zeros((1, 3, 3), dtype=np.float32),
- },
- ):
- with patch.object(
- dp, "_load_and_process_video", return_value=mock_read_frames(4, 4, 2, 4)
- ):
- with patch.object(
- dp, "get_processor_cache", return_value=[missing_image]
- ) as cache_get:
- with patch.object(dp, "update_processor_cache") as cache_update:
- with patch.object(
- self.processor.tokenizer,
- "apply_chat_template",
- return_value="<|image_pad|>a<|image_pad|>b<|video_pad|>",
- ):
- outputs = dp.request2ids(request)
-
- cache_get.assert_called_once_with(fake_socket, ["missing-image"])
- cache_update.assert_called_once()
- _, cached_hashes, cached_items = cache_update.call_args.args
- self.assertEqual(cached_hashes, ["processed-image", "raw-video"])
- self.assertEqual(cached_items[0][1]["thw"], (1, 8, 8))
- self.assertEqual(cached_items[1][1]["thw"], (1, merge_size, merge_size))
- self.assertEqual(outputs["mm_hashes"], ["missing-image", "processed-image", "raw-video"])
- finally:
- dp.enable_processor_cache = old_enable_processor_cache
-
- def test_parse_processor_kwargs_valid(self):
- """Test _parse_processor_kwargs with valid input"""
- valid_kwargs = {"video_max_frames": 10, "video_min_frames": 1}
- result = self.processor._parse_processor_kwargs(valid_kwargs)
- self.assertEqual(result, valid_kwargs)
-
- def test_parse_processor_kwargs_empty(self):
- """Test _parse_processor_kwargs with empty input"""
- result = self.processor._parse_processor_kwargs(None)
- self.assertEqual(result, {})
-
- def test_parse_processor_kwargs_invalid_type(self):
- """Test _parse_processor_kwargs with invalid type"""
- result = self.processor._parse_processor_kwargs("invalid")
- self.assertEqual(result, {})
-
- def test_parse_processor_kwargs_invalid_value_type(self):
- """Test _parse_processor_kwargs with invalid value type"""
- invalid_kwargs = {"video_max_frames": "10"} # Should be int
- result = self.processor._parse_processor_kwargs(invalid_kwargs)
- self.assertEqual(result, {})
-
- def test_parse_processor_kwargs_mixed_valid_invalid(self):
- """Test _parse_processor_kwargs with mixed valid and invalid types"""
- mixed_kwargs = {"video_max_frames": 10, "video_min_frames": "invalid"}
- result = self.processor._parse_processor_kwargs(mixed_kwargs)
- self.assertEqual(result, {})
-
- def test_parse_limits_valid(self):
- """Test _parse_limits with valid limits"""
- limits = {"image": 2, "video": 3}
- result = self.processor._parse_limits(limits)
- expected = {"image": 2, "video": 3, "audio": 1}
- self.assertEqual(result, expected)
-
- def test_parse_limits_empty(self):
- """Test _parse_limits with empty input"""
- result = self.processor._parse_limits(None)
- expected = {"image": 1, "video": 1, "audio": 1}
- self.assertEqual(result, expected)
-
- def test_parse_limits_invalid_type(self):
- """Test _parse_limits with invalid type"""
- result = self.processor._parse_limits("invalid")
- expected = {"image": 1, "video": 1, "audio": 1}
- self.assertEqual(result, expected)
-
- def test_parse_limits_partial(self):
- """Test _parse_limits with partial limits"""
- limits = {"image": 5}
- result = self.processor._parse_limits(limits)
- expected = {"image": 5, "video": 1, "audio": 1}
- self.assertEqual(result, expected)
-
- def test_check_mm_limits_dict_valid(self):
- """Test _check_mm_limits with valid dict input"""
- mm_data = {"image": [mock_pil_image(10, 10)], "video": [{"video": b"123"}]}
- # Should not raise exception
- self.processor._check_mm_limits(mm_data)
-
- def test_check_mm_limits_dict_exceed_limit(self):
- """Test _check_mm_limits when dict input exceeds limit"""
- mm_data = {"image": [mock_pil_image(10, 10), mock_pil_image(10, 10)]}
- with self.assertRaises(ValueError) as context:
- self.processor._check_mm_limits(mm_data)
- self.assertIn("Too many image items", str(context.exception))
-
- def test_check_mm_limits_messages_valid(self):
- """Test _check_mm_limits with valid messages input"""
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "text", "text": "Describe this image."},
- ],
- }
- ]
- # Should not raise exception
- self.processor._check_mm_limits(messages)
-
- def test_check_mm_limits_messages_exceed_limit(self):
- """Test _check_mm_limits when messages input exceeds limit"""
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo1.jpeg"}},
- {"type": "image_url", "image_url": {"url": "file://demo2.jpeg"}},
- ],
- }
- ]
- with self.assertRaises(ValueError) as context:
- self.processor._check_mm_limits(messages)
- self.assertIn("Too many image items", str(context.exception))
-
- def test_check_mm_limits_video_exceed(self):
- """Test _check_mm_limits when video exceeds limit"""
- mm_data = {"video": [{"video": b"123"}, {"video": b"456"}]}
- with self.assertRaises(ValueError) as context:
- self.processor._check_mm_limits(mm_data)
- self.assertIn("Too many video items", str(context.exception))
-
- def test_process_request_dict_with_prompt(self):
- """Test process_request_dict with prompt format"""
- request = {
- "request_id": "12345",
- "prompt": "Test prompt",
- "multimodal_data": {"image": [mock_pil_image(10, 10)]},
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- self.assertGreater(len(result.prompt_token_ids), 0)
- self.assertGreater(len(result.multimodal_inputs), 0)
-
- def test_process_request_dict_with_messages(self):
- """Test process_request_dict with messages format"""
- request = {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [{"type": "text", "text": "Hello"}],
- }
- ],
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- self.assertGreater(len(result.prompt_token_ids), 0)
- self.assertGreater(len(result.multimodal_inputs), 0)
-
- def test_process_request_dict_with_prompt_token_ids_only(self):
- """Test process_request_dict with prompt_token_ids only"""
- request = Request.from_dict(
- {
- "request_id": "12345",
- "prompt_token_ids": [1, 2, 3],
- }
- )
- result = self.processor.process_request_dict(request, 1024)
-
- self.assertEqual(result.prompt_token_ids, [1, 2, 3])
- self.assertEqual(result.prompt_token_ids_len, 3)
- self.assertIsNone(result.multimodal_inputs["images"])
- self.assertEqual(result.multimodal_inputs["token_type_ids"].tolist(), [0, 0, 0])
-
- def test_process_request_dict_with_prompt_token_ids_and_messages(self):
- """Test process_request_dict with prompt_token_ids and multimodal messages"""
- source_request = Request.from_dict(
- {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
- )
- source_result = self.processor.process_request_dict(source_request, 1024 * 100)
-
- token_request = Request.from_dict(
- {
- "request_id": "12345",
- "prompt_token_ids": list(source_result.prompt_token_ids),
- "messages": copy.deepcopy(source_request.messages),
- }
- )
- token_result = self.processor.process_request_dict(token_request, 1024 * 100)
-
- self.assertEqual(token_result.prompt_token_ids, source_result.prompt_token_ids)
- self.assertTrue(
- np.equal(token_result.multimodal_inputs["grid_thw"], source_result.multimodal_inputs["grid_thw"]).all()
- )
- self.assertTrue(
- np.equal(
- token_result.multimodal_inputs["position_ids"],
- source_result.multimodal_inputs["position_ids"],
- ).all()
- )
- self.assertTrue(
- np.equal(
- token_result.multimodal_inputs["image_type_ids"],
- source_result.multimodal_inputs["image_type_ids"],
- ).all()
- )
-
- def test_process_request_dict_prompt_token_ids_more_multimodal_segments_than_messages(self):
- """Test prompt_token_ids path when token-side multimodal segments exceed messages"""
- source_request = Request.from_dict(
- {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
- )
- source_result = self.processor.process_request_dict(source_request, 1024 * 100)
-
- token_request = Request.from_dict(
- {
- "request_id": "12345",
- "prompt_token_ids": list(source_result.prompt_token_ids),
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
- )
-
- with self.assertRaisesRegex(ValueError, "more multimodal placeholder"):
- self.processor.process_request_dict(token_request, 1024 * 100)
-
- def test_process_request_dict_prompt_token_ids_unused_multimodal_messages(self):
- """Test prompt_token_ids path when messages have unused multimodal items"""
- source_request = Request.from_dict(
- {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "text", "text": "Describe image."},
- ],
- }
- ],
- }
- )
- source_result = self.processor.process_request_dict(source_request, 1024 * 100)
-
- token_request = Request.from_dict(
- {
- "request_id": "12345",
- "prompt_token_ids": list(source_result.prompt_token_ids),
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image."},
- ],
- }
- ],
- }
- )
-
- with self.assertRaisesRegex(ValueError, "number of multimodal items does not match"):
- self.processor.process_request_dict(token_request, 1024 * 100)
-
- def test_process_request_dict_invalid_format(self):
- """Test process_request_dict with invalid format"""
- request = {"request_id": "12345"}
- request = Request.from_dict(request)
- with self.assertRaises(ValueError) as context:
- self.processor.process_request_dict(request, 1024)
- self.assertIn("must contain 'prompt', or 'messages'", str(context.exception))
-
- def test_process_request_dict_with_bad_words(self):
- """Test process_request_dict with bad_words"""
- request = {
- "request_id": "12345",
- "prompt": "Test prompt",
- "bad_words": ["bad", "word"],
- "bad_words_token_ids": [100, 200],
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- # Verify bad_words_token_ids is set
- self.assertIsNotNone(result.sampling_params.bad_words_token_ids)
-
- def test_process_request_dict_invalid_chat_template_kwargs(self):
- """Test process_request_dict with invalid chat_template_kwargs"""
- request = {
- "request_id": "12345",
- "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}],
- }
- request = Request.from_dict(request)
- request.chat_template_kwargs = "invalid"
- with self.assertRaises(ValueError) as context:
- self.processor.process_request_dict(request, 1024)
- self.assertIn("must be a dict", str(context.exception))
-
- def test_process_request_dict_with_completion_token_ids(self):
- """Test process_request_dict with completion_token_ids"""
- request = {"request_id": "12345", "prompt": "Test"}
- request = Request.from_dict(request)
- request.completion_token_ids = [1, 2, 3]
- result = self.processor.process_request_dict(request, 1024)
- # Verify completion tokens are appended
- self.assertGreater(len(result.prompt_token_ids), 3)
-
- def test_process_request_dict_prompt_truncation(self):
- """Test process_request_dict with prompt truncation"""
- # Create a long prompt that exceeds max_model_len
- long_prompt = "Test " * 1000
- request = {
- "request_id": "12345",
- "prompt": long_prompt,
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 100)
- # Verify prompt is truncated
- self.assertLessEqual(len(result.prompt_token_ids), 99)
-
- def test_process_request_dict_default_max_tokens(self):
- """Test process_request_dict sets default max_tokens"""
- request = {
- "request_id": "12345",
- "prompt": "Test",
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- self.assertGreater(result.sampling_params.max_tokens, 0)
-
- def test_process_request_dict_enable_thinking_false(self):
- """Test process_request_dict sets enable_thinking to False"""
- request = {
- "request_id": "12345",
- "prompt": "Test",
- "enable_thinking": True,
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- self.assertFalse(result.enable_thinking)
-
- def test_append_completion_tokens(self):
- """Test append_completion_tokens method"""
- multimodal_inputs = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
- "cur_position": 3,
- }
- completion_token_ids = [4, 5]
- self.processor.append_completion_tokens(multimodal_inputs, completion_token_ids)
-
- self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 4, 5])
- self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0])
- self.assertEqual(multimodal_inputs["cur_position"], 5)
-
- def test_pack_outputs_with_images(self):
- """Test pack_outputs with image data"""
- outputs = {
- "images": [np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])],
- "grid_thw": [np.array([2, 2, 1]), np.array([2, 2, 1])],
- "image_type_ids": [0, 1],
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
- }
- result = self.processor.pack_outputs(outputs)
-
- self.assertIsNotNone(result["images"])
- self.assertIsNotNone(result["grid_thw"])
- self.assertIsNotNone(result["image_type_ids"])
- self.assertEqual(result["images"].shape[0], 4)
- self.assertEqual(result["grid_thw"].shape[0], 2)
-
- def test_pack_outputs_without_images(self):
- """Test pack_outputs without image data"""
- outputs = {
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
- }
- result = self.processor.pack_outputs(outputs)
-
- # Test that image-related fields are None when no images
- self.assertIsNone(result["images"])
- self.assertIsNone(result["grid_thw"])
- self.assertIsNone(result["image_type_ids"])
-
- # Test data types
- self.assertEqual(result["input_ids"].dtype, np.int64)
- self.assertEqual(result["token_type_ids"].dtype, np.int64)
- self.assertEqual(result["position_ids"].dtype, np.int64)
-
- # Test patch IDs are set
- self.assertIn("image_patch_id", result)
- self.assertIn("video_patch_id", result)
- self.assertIn("mm_num_token_func", result)
-
-
-class TestSampleFrames(unittest.TestCase):
- """
- Unit tests for sample_frames function
- """
-
- def setUp(self):
- self.metadata = {
- "num_of_frame": 100,
- "fps": 25,
- }
-
- def test_fps_and_num_frames_mutually_exclusive(self):
- with self.assertRaises(ValueError):
- sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=32,
- metadata=self.metadata,
- fps=2,
- num_frames=16,
- )
-
- def test_num_frames_round_to_factor(self):
- indices = sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=64,
- metadata=self.metadata,
- num_frames=18, # round(18 / 4) * 4 = 16
- )
-
- self.assertEqual(len(indices), 16)
- self.assertEqual(indices[0], 0)
- self.assertLess(indices[-1], self.metadata["num_of_frame"])
-
- def test_fps_sampling_basic(self):
- # total = 100 frames, fps=25, target fps=5 → 20 frames
- indices = sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=64,
- metadata=self.metadata,
- fps=5,
- )
-
- self.assertEqual(len(indices), 20)
- self.assertEqual(indices.dtype, np.int32)
- self.assertEqual(indices[0], 0)
-
- def test_fps_respects_min_frames(self):
- indices = sample_frames(
- frame_factor=4,
- min_frames=24,
- max_frames=64,
- metadata=self.metadata,
- fps=1, # very small fps
- )
-
- self.assertEqual(len(indices), 24)
-
- def test_num_frames_exceeds_total_raises(self):
- with self.assertRaises(ValueError):
- sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=200,
- metadata=self.metadata,
- num_frames=200,
- )
-
- def test_force_multiple_of_4_hack(self):
- indices = sample_frames(
- frame_factor=2,
- min_frames=2,
- max_frames=100,
- metadata=self.metadata,
- num_frames=10, # 10 % 4 != 0 → hack → 8
- )
-
- self.assertEqual(len(indices), 8)
- self.assertEqual(len(indices) % 4, 0)
-
- def test_keep_all_frames_when_num_frames_zero(self):
- indices = sample_frames(
- frame_factor=4,
- min_frames=0,
- max_frames=100,
- metadata=self.metadata,
- num_frames=0,
- )
-
- self.assertEqual(len(indices), self.metadata["num_of_frame"])
- np.testing.assert_array_equal(indices, np.arange(0, 100, dtype=np.int32))
-
- def test_indices_evenly_spaced(self):
- indices = sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=32,
- metadata=self.metadata,
- num_frames=16,
- )
-
- diffs = np.diff(indices)
- self.assertTrue(np.all(diffs > 0))
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/input/v1/test_qwen_vl_processor.py b/tests/input/v1/test_qwen_vl_processor.py
deleted file mode 100644
index 5de7df1da39..00000000000
--- a/tests/input/v1/test_qwen_vl_processor.py
+++ /dev/null
@@ -1,776 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import unittest
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-from PIL import Image
-
-from fastdeploy.engine.request import Request
-from fastdeploy.input.v1.qwen_vl_processor import QwenVLProcessor
-from fastdeploy.input.v1.qwen_vl_processor.process_video import sample_frames
-
-
-def mock_pil_image(height, width):
- """
- Generate mock random RGB image
-
- Args:
- height: Image height in pixels
- width: Image width in pixels
-
- Returns:
- PIL.Image object with random RGB data
- """
- rgb_image = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
- return Image.fromarray(rgb_image)
-
-
-def mock_read_frames(height: int, width: int, nums_frame: int, fps: int):
- """
- Generate mock video frames with metadata for testing purposes
-
- Creates synthetic video data by generating random RGB frames and constructing
- corresponding metadata to simulate real video processing.
-
- Args:
- height (int): Height of video frames in pixels
- width (int): Width of video frames in pixels
- nums_frame (int): Number of frames to generate
- fps (int): Frames per second for the mock video
-
- Returns:
- tuple: A tuple containing:
- frames (numpy.ndarray): Array of shape (nums_frame, height, width, 3)
- containing randomly generated RGB frames
- meta (dict): Dictionary with video metadata:
- - fps (int): Frames per second (same as input)
- - duration (float): Calculated duration in seconds (nums_frame/fps)
- - num_of_frame (int): Number of frames (same as nums_frame input)
- """
- frames = []
- for _ in range(nums_frame):
- frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
- frames.append(frame)
- frames = np.stack(frames, axis=0)
-
- meta = {
- "fps": fps,
- "duration": nums_frame / fps,
- "num_of_frame": nums_frame,
- }
- return frames, meta
-
-
-class TestQwenVLProcessor(unittest.TestCase):
- """
- Unit tests for Qwen Vision-Language Processor functionality
- """
-
- def setUp(self):
- """
- Initialize test case with:
- - Mock configuration
- - Patched message parsing and video processing methods
- - QwenVLProcessor instance with test parameters
- """
- config = MagicMock()
- config.vision_config.tokens_per_second = 2
-
- self.patcher_parse_image = patch(
- "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_image", return_value=mock_pil_image(480, 640)
- )
- self.patcher_parse_image.start()
-
- self.patcher_parse_video = patch(
- "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_video", return_value=b"123"
- )
- self.patcher_parse_video.start()
-
- self.patcher_read_frames = patch(
- "fastdeploy.input.v1.qwen_vl_processor.process.DataProcessor._load_and_process_video",
- return_value=mock_read_frames(480, 640, 5, 2),
- )
- self.patcher_read_frames.start()
-
- mm_processor_kwargs = {
- "video_max_frames": 10,
- "video_min_frames": 1,
- }
- limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
-
- self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
- self.processor = QwenVLProcessor(
- config=config,
- model_name_or_path=self.model_name_or_path,
- limit_mm_per_prompt=limit_mm_per_prompt,
- mm_processor_kwargs=mm_processor_kwargs,
- reasoning_parser_obj=None,
- tool_parser_obj=None,
- )
-
- def tearDown(self) -> None:
- """Clean up test case by stopping all mock patches"""
- self.patcher_read_frames.stop()
- self.patcher_parse_image.stop()
- self.patcher_parse_video.stop()
-
- def test_process_request(self):
- """
- Test processing of Request object with multimodal input
-
- Validates:
- 1. Token ID lengths match position_ids and token_type_ids shapes
- 2. Image processing produces expected output dimensions
- 3. Video processing produces expected output dimensions
- 4. Correct counts for images (1) and videos (1)
- """
- message = {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
-
- request = Request.from_dict(message)
- result = self.processor.process_request_dict(request, 1024 * 100)
-
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
- self.assertEqual(
- result.multimodal_inputs["images"].shape[0],
- sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
- )
- self.assertEqual(
- result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
- )
-
- def test_process_request_dict(self):
- """
- Test processing of dictionary-format request with multimodal input
-
- Validates:
- 1. Token ID lengths match position_ids and token_type_ids shapes
- 2. Image processing produces expected output dimensions
- 3. Video processing produces expected output dimensions
- 4. Correct counts for images (1) and videos (1)
- """
- num_completion_token_ids = 10
- request = {
- "request_id": "12345",
- "completion_token_ids": [1] * num_completion_token_ids,
- "stop": ["stop", "eof"],
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
- request = Request.from_dict(request)
-
- result = self.processor.process_request_dict(request, 1024 * 100)
-
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
- self.assertEqual(
- result.multimodal_inputs["images"].shape[0],
- sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
- )
- self.assertEqual(
- result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
- )
-
- def test_process_request_dict_enable_thinking(self):
- num_completion_token_ids = 10
- request = {
- "request_id": "12345",
- "completion_token_ids": [1] * num_completion_token_ids,
- "stop": ["stop", "eof"],
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "text", "text": "Hello"},
- ],
- }
- ],
- "chat_template_kwargs": {"enable_thinking": True},
- }
- request = Request.from_dict(request)
-
- result = self.processor.process_request_dict(request, 100)
- self.assertEqual(result.enable_thinking, False)
-
- def test_prompt(self):
- """
- Test processing of prompt with image and video placeholders
-
- Validates:
- 1. Token ID lengths match position_ids and token_type_ids shapes
- 2. Image processing produces expected output dimensions
- 3. Video processing produces expected output dimensions
- 4. Correct counts for images (1) and videos (1)
- """
- IMAGE_PLACEHOLDER = "<|image_pad|>"
- VIDEO_PLACEHOLDER = "<|video_pad|>"
- prompt = {
- "request_id": "12345",
- "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.",
- "multimodal_data": {
- "image": [mock_pil_image(10, 2100)],
- "video": [{"video": b"123", "fps": 5}],
- },
- }
-
- request = Request.from_dict(prompt)
- result = self.processor.process_request_dict(request, 1024 * 100)
-
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
- self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
- self.assertEqual(
- result.multimodal_inputs["images"].shape[0],
- sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
- )
- self.assertEqual(
- result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
- )
-
- def test_message_and_prompt(self):
- """
- Test consistency between message-based and prompt-based processing
-
- Validates that processing a request through:
- 1. The message format (with image/video URLs)
- 2. The prompt format (with direct image/video data)
- produces identical tokenization and multimodal input results.
-
- Checks:
- 1. Prompt token IDs match between both processing methods
- 2. Grid dimensions (THW) match between both methods
- 3. Position IDs match between both methods
- """
- # Create test request in message format
- request = {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024 * 100)
-
- # Create equivalent request in prompt format
- prompt = {
- "request_id": "12345",
- "prompt": request.prompt_tokens,
- "multimodal_data": {
- "image": [mock_pil_image(480, 640)],
- "video": [{"video": b"123"}],
- },
- }
- request2 = Request.from_dict(prompt)
- result2 = self.processor.process_request_dict(request2, 1024 * 100)
-
- # Verify both processing methods produce identical results
- self.assertEqual(result.prompt_token_ids, result2.prompt_token_ids)
- self.assertTrue(np.equal(result.multimodal_inputs["grid_thw"], result2.multimodal_inputs["grid_thw"]).all())
- self.assertTrue(
- np.equal(result.multimodal_inputs["position_ids"], result2.multimodal_inputs["position_ids"]).all()
- )
-
- def test_apply_chat_template(self):
- """
- Test the consistency between:
- 1. Directly applying chat template using HuggingFace tokenizer
- 2. Applying chat template through the processor's request processing
-
- This test verifies that:
- - The processor correctly handles multimodal messages (image, video, text)
- - The prompt_tokens field matches the output from direct tokenizer application
- - The chat template application preserves the message structure and content
-
- Test Steps:
- 1. Create sample multimodal messages with image, video and text content
- 2. Apply chat template directly using the tokenizer
- 3. Process the same messages through the processor
- 4. Compare the outputs to ensure consistency
- """
- from transformers import AutoTokenizer
-
- tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
-
- # Sample multimodal messages containing image, video and text
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video", "video": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ]
-
- # Apply chat template directly using the tokenizer
- prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
- # Create equivalent request dictionary
- request = {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
- {"type": "text", "text": "Describe image and video."},
- ],
- }
- ],
- }
- request = Request.from_dict(request)
-
- # Process request through the processor
- self.processor.process_request_dict(request, 1024 * 100)
- prompt2 = request.prompt_tokens
-
- # Verify both methods produce identical prompt strings
- self.assertEqual(prompt, prompt2)
-
- def test_think_status(self):
- """测试 思考机制"""
- request = {
- "prompt": "hello",
- "request_id": "test_1",
- "prompt_token_ids": [1, 2, 3],
- "temperature": 0.7,
- "top_p": 0.9,
- }
- request = Request.from_dict(request)
- self.processor.reasoning_parser = MagicMock()
- self.processor.reasoning_parser.get_model_status.return_value = "think_start"
- self.processor.model_status_dict = {}
- self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(request.enable_thinking, True)
-
- request = {
- "prompt": "hello",
- "request_id": "test",
- "prompt_token_ids": [1, 2, 3],
- "temperature": 0.7,
- "top_p": 0.9,
- }
- request = Request.from_dict(request)
- self.processor.process_request_dict(request, max_model_len=512)
- self.assertEqual(request.enable_thinking, True)
-
- def test_parse_processor_kwargs_valid(self):
- """Test _parse_processor_kwargs with valid input"""
- valid_kwargs = {"video_max_frames": 10, "video_min_frames": 1}
- result = self.processor._parse_processor_kwargs(valid_kwargs)
- self.assertEqual(result, valid_kwargs)
-
- def test_parse_processor_kwargs_empty(self):
- """Test _parse_processor_kwargs with empty input"""
- result = self.processor._parse_processor_kwargs(None)
- self.assertEqual(result, {})
-
- def test_parse_processor_kwargs_invalid_type(self):
- """Test _parse_processor_kwargs with invalid type"""
- result = self.processor._parse_processor_kwargs("invalid")
- self.assertEqual(result, {})
-
- def test_parse_processor_kwargs_invalid_value_type(self):
- """Test _parse_processor_kwargs with invalid value type"""
- invalid_kwargs = {"video_max_frames": "10"} # Should be int
- result = self.processor._parse_processor_kwargs(invalid_kwargs)
- self.assertEqual(result, {})
-
- def test_parse_processor_kwargs_mixed_valid_invalid(self):
- """Test _parse_processor_kwargs with mixed valid and invalid types"""
- mixed_kwargs = {"video_max_frames": 10, "video_min_frames": "invalid"}
- result = self.processor._parse_processor_kwargs(mixed_kwargs)
- self.assertEqual(result, {})
-
- def test_parse_limits_valid(self):
- """Test _parse_limits with valid limits"""
- limits = {"image": 2, "video": 3}
- result = self.processor._parse_limits(limits)
- expected = {"image": 2, "video": 3, "audio": 1}
- self.assertEqual(result, expected)
-
- def test_parse_limits_empty(self):
- """Test _parse_limits with empty input"""
- result = self.processor._parse_limits(None)
- expected = {"image": 1, "video": 1, "audio": 1}
- self.assertEqual(result, expected)
-
- def test_parse_limits_invalid_type(self):
- """Test _parse_limits with invalid type"""
- result = self.processor._parse_limits("invalid")
- expected = {"image": 1, "video": 1, "audio": 1}
- self.assertEqual(result, expected)
-
- def test_parse_limits_partial(self):
- """Test _parse_limits with partial limits"""
- limits = {"image": 5}
- result = self.processor._parse_limits(limits)
- expected = {"image": 5, "video": 1, "audio": 1}
- self.assertEqual(result, expected)
-
- def test_check_mm_limits_dict_valid(self):
- """Test _check_mm_limits with valid dict input"""
- mm_data = {"image": [mock_pil_image(10, 10)], "video": [{"video": b"123"}]}
- # Should not raise exception
- self.processor._check_mm_limits(mm_data)
-
- def test_check_mm_limits_dict_exceed_limit(self):
- """Test _check_mm_limits when dict input exceeds limit"""
- mm_data = {"image": [mock_pil_image(10, 10), mock_pil_image(10, 10)]}
- with self.assertRaises(ValueError) as context:
- self.processor._check_mm_limits(mm_data)
- self.assertIn("Too many image items", str(context.exception))
-
- def test_check_mm_limits_messages_valid(self):
- """Test _check_mm_limits with valid messages input"""
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
- {"type": "text", "text": "Describe this image."},
- ],
- }
- ]
- # Should not raise exception
- self.processor._check_mm_limits(messages)
-
- def test_check_mm_limits_messages_exceed_limit(self):
- """Test _check_mm_limits when messages input exceeds limit"""
- messages = [
- {
- "role": "user",
- "content": [
- {"type": "image_url", "image_url": {"url": "file://demo1.jpeg"}},
- {"type": "image_url", "image_url": {"url": "file://demo2.jpeg"}},
- ],
- }
- ]
- with self.assertRaises(ValueError) as context:
- self.processor._check_mm_limits(messages)
- self.assertIn("Too many image items", str(context.exception))
-
- def test_check_mm_limits_video_exceed(self):
- """Test _check_mm_limits when video exceeds limit"""
- mm_data = {"video": [{"video": b"123"}, {"video": b"456"}]}
- with self.assertRaises(ValueError) as context:
- self.processor._check_mm_limits(mm_data)
- self.assertIn("Too many video items", str(context.exception))
-
- def test_process_request_dict_with_prompt(self):
- """Test process_request_dict with prompt format"""
- request = {
- "request_id": "12345",
- "prompt": "Test prompt",
- "multimodal_data": {"image": [mock_pil_image(10, 10)]},
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- self.assertGreater(len(result.prompt_token_ids), 0)
- self.assertGreater(len(result.multimodal_inputs), 0)
-
- def test_process_request_dict_with_messages(self):
- """Test process_request_dict with messages format"""
- request = {
- "request_id": "12345",
- "messages": [
- {
- "role": "user",
- "content": [{"type": "text", "text": "Hello"}],
- }
- ],
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- self.assertGreater(len(result.prompt_token_ids), 0)
- self.assertGreater(len(result.multimodal_inputs), 0)
-
- def test_process_request_dict_invalid_format(self):
- """Test process_request_dict with invalid format"""
- request = {"request_id": "12345"}
- request = Request.from_dict(request)
- with self.assertRaises(ValueError) as context:
- self.processor.process_request_dict(request, 1024)
- self.assertIn("must contain 'prompt', or 'messages'", str(context.exception))
-
- def test_process_request_dict_with_bad_words(self):
- """Test process_request_dict with bad_words"""
- request = {
- "request_id": "12345",
- "prompt": "Test prompt",
- "bad_words": ["bad", "word"],
- "bad_words_token_ids": [100, 200],
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- # Verify bad_words_token_ids is set
- self.assertIsNotNone(result.sampling_params.bad_words_token_ids)
-
- def test_process_request_dict_invalid_chat_template_kwargs(self):
- """Test process_request_dict with invalid chat_template_kwargs"""
- request = {
- "request_id": "12345",
- "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}],
- }
- request = Request.from_dict(request)
- request.chat_template_kwargs = "invalid"
- with self.assertRaises(ValueError) as context:
- self.processor.process_request_dict(request, 1024)
- self.assertIn("must be a dict", str(context.exception))
-
- def test_process_request_dict_with_completion_token_ids(self):
- """Test process_request_dict with completion_token_ids"""
- request = {"request_id": "12345", "prompt": "Test"}
- request = Request.from_dict(request)
- request.completion_token_ids = [1, 2, 3]
- result = self.processor.process_request_dict(request, 1024)
- # Verify completion tokens are appended
- self.assertGreater(len(result.prompt_token_ids), 3)
-
- def test_process_request_dict_prompt_truncation(self):
- """Test process_request_dict with prompt truncation"""
- # Create a long prompt that exceeds max_model_len
- long_prompt = "Test " * 1000
- request = {
- "request_id": "12345",
- "prompt": long_prompt,
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 100)
- # Verify prompt is truncated
- self.assertLessEqual(len(result.prompt_token_ids), 99)
-
- def test_process_request_dict_default_max_tokens(self):
- """Test process_request_dict sets default max_tokens"""
- request = {
- "request_id": "12345",
- "prompt": "Test",
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- self.assertGreater(result.sampling_params.max_tokens, 0)
-
- def test_process_request_dict_enable_thinking_false(self):
- """Test process_request_dict sets enable_thinking to False"""
- request = {
- "request_id": "12345",
- "prompt": "Test",
- "enable_thinking": True,
- }
- request = Request.from_dict(request)
- result = self.processor.process_request_dict(request, 1024)
- self.assertFalse(result.enable_thinking)
-
- def test_append_completion_tokens(self):
- """Test append_completion_tokens method"""
- multimodal_inputs = {
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
- "cur_position": 3,
- }
- completion_token_ids = [4, 5]
- self.processor.append_completion_tokens(multimodal_inputs, completion_token_ids)
-
- self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 4, 5])
- self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0])
- self.assertEqual(multimodal_inputs["cur_position"], 5)
-
- def test_pack_outputs_with_images(self):
- """Test pack_outputs with image data"""
- outputs = {
- "images": [np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])],
- "grid_thw": [np.array([2, 2, 1]), np.array([2, 2, 1])],
- "image_type_ids": [0, 1],
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
- }
- result = self.processor.pack_outputs(outputs)
-
- self.assertIsNotNone(result["images"])
- self.assertIsNotNone(result["grid_thw"])
- self.assertIsNotNone(result["image_type_ids"])
- self.assertEqual(result["images"].shape[0], 4)
- self.assertEqual(result["grid_thw"].shape[0], 2)
-
- def test_pack_outputs_without_images(self):
- """Test pack_outputs without image data"""
- outputs = {
- "images": [],
- "grid_thw": [],
- "image_type_ids": [],
- "input_ids": [1, 2, 3],
- "token_type_ids": [0, 0, 0],
- "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
- }
- result = self.processor.pack_outputs(outputs)
-
- # Test that image-related fields are None when no images
- self.assertIsNone(result["images"])
- self.assertIsNone(result["grid_thw"])
- self.assertIsNone(result["image_type_ids"])
-
- # Test data types
- self.assertEqual(result["input_ids"].dtype, np.int64)
- self.assertEqual(result["token_type_ids"].dtype, np.int64)
- self.assertEqual(result["position_ids"].dtype, np.int64)
-
- # Test patch IDs are set
- self.assertIn("image_patch_id", result)
- self.assertIn("video_patch_id", result)
- self.assertIn("mm_num_token_func", result)
-
-
-class TestSampleFrames(unittest.TestCase):
- """
- Unit tests for sample_frames function
- """
-
- def setUp(self):
- self.metadata = {
- "num_of_frame": 100,
- "fps": 25,
- }
-
- def test_fps_and_num_frames_mutually_exclusive(self):
- with self.assertRaises(ValueError):
- sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=32,
- metadata=self.metadata,
- fps=2,
- num_frames=16,
- )
-
- def test_num_frames_round_to_factor(self):
- indices = sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=64,
- metadata=self.metadata,
- num_frames=18, # round(18 / 4) * 4 = 16
- )
-
- self.assertEqual(len(indices), 16)
- self.assertEqual(indices[0], 0)
- self.assertLess(indices[-1], self.metadata["num_of_frame"])
-
- def test_fps_sampling_basic(self):
- # total = 100 frames, fps=25, target fps=5 → 20 frames
- indices = sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=64,
- metadata=self.metadata,
- fps=5,
- )
-
- self.assertEqual(len(indices), 20)
- self.assertEqual(indices.dtype, np.int32)
- self.assertEqual(indices[0], 0)
-
- def test_fps_respects_min_frames(self):
- indices = sample_frames(
- frame_factor=4,
- min_frames=24,
- max_frames=64,
- metadata=self.metadata,
- fps=1, # very small fps
- )
-
- self.assertEqual(len(indices), 24)
-
- def test_num_frames_exceeds_total_raises(self):
- with self.assertRaises(ValueError):
- sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=200,
- metadata=self.metadata,
- num_frames=200,
- )
-
- def test_force_multiple_of_4_hack(self):
- indices = sample_frames(
- frame_factor=2,
- min_frames=2,
- max_frames=100,
- metadata=self.metadata,
- num_frames=10, # 10 % 4 != 0 → hack → 8
- )
-
- self.assertEqual(len(indices), 8)
- self.assertEqual(len(indices) % 4, 0)
-
- def test_keep_all_frames_when_num_frames_zero(self):
- indices = sample_frames(
- frame_factor=4,
- min_frames=0,
- max_frames=100,
- metadata=self.metadata,
- num_frames=0,
- )
-
- self.assertEqual(len(indices), self.metadata["num_of_frame"])
- np.testing.assert_array_equal(indices, np.arange(0, 100, dtype=np.int32))
-
- def test_indices_evenly_spaced(self):
- indices = sample_frames(
- frame_factor=4,
- min_frames=8,
- max_frames=32,
- metadata=self.metadata,
- num_frames=16,
- )
-
- diffs = np.diff(indices)
- self.assertTrue(np.all(diffs > 0))
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/input/v1/test_text_processor.py b/tests/input/v1/test_text_processor.py
deleted file mode 100644
index 147d843e85d..00000000000
--- a/tests/input/v1/test_text_processor.py
+++ /dev/null
@@ -1,586 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import importlib
-import importlib.util
-import sys
-import types
-import unittest
-from pathlib import Path
-from types import SimpleNamespace
-from unittest import mock
-
-import numpy as np
-
-from fastdeploy.engine.request import Request, RequestOutput
-from fastdeploy.engine.sampling_params import SamplingParams
-
-
-class DummyTokenizer:
- bos_token = ""
- cls_token = ""
- sep_token = ""
- eos_token = ""
- mask_token = ""
- chat_template = "dummy"
-
- def __init__(self):
- self.pad_token_id = 1
- self.eos_token_id = 2
- self.eos_token = 2
- self.vocab_size = 256
- self.bos_token_id = self._convert_token_to_id(self.bos_token)
- self.cls_token_id = self._convert_token_to_id(self.cls_token)
- self.sep_token_id = self._convert_token_to_id(self.sep_token)
- self.mask_token_id = self._convert_token_to_id(self.mask_token)
-
- def _convert_token_to_id(self, token):
- return len(str(token))
-
- def __call__(self, text, **kwargs):
- if isinstance(text, list):
- values = [self._value(item) for item in text]
- else:
- values = [self._value(text)]
- max_length = kwargs.get("max_length")
- if max_length is not None:
- values = values[:max_length]
- return {"input_ids": np.array([values], dtype=np.int64)}
-
- def _value(self, item):
- if isinstance(item, str):
- return len(item)
- return int(item)
-
- def tokenize(self, text):
- if isinstance(text, str):
- return [text]
- return [str(text)]
-
- def convert_tokens_to_ids(self, tokens):
- return [self._value(token) for token in tokens]
-
- def decode(self, token_ids, **kwargs):
- return " ".join(str(t) for t in token_ids)
-
- def decode_token(self, token_ids, prefix_offset, read_offset):
- start = read_offset
- delta_tokens = token_ids[start:]
- delta = "".join(str(t) for t in delta_tokens)
- prefix_offset += len(token_ids)
- read_offset += len(delta_tokens)
- return delta, prefix_offset, read_offset
-
- def batch_decode(self, batch, **kwargs):
- return [self.decode(seq) for seq in batch]
-
- def apply_chat_template(self, request, **kwargs):
- if isinstance(request, dict):
- system = request.get("system")
- messages = request.get("messages", [])
- else:
- system = getattr(request, "system", None)
- messages = getattr(request, "messages", [])
- parts = [system] if system else []
- parts.extend(msg.get("content", "") for msg in messages)
- return " ".join(part for part in parts if part)
-
-
-class DummyLlamaTokenizer(DummyTokenizer):
- pass
-
-
-class DummyAutoTokenizer:
- @classmethod
- def from_pretrained(cls, *args, **kwargs):
- return DummyTokenizer()
-
-
-class DummyHFTokenizer:
- @classmethod
- def from_pretrained(cls, *args, **kwargs):
- return DummyTokenizer()
-
-
-def _create_dummy_modules():
- """Create all dummy modules needed for testing fastdeploy.input.text_processor."""
- repo_root = Path(__file__).resolve().parents[2]
-
- dummy_logger = SimpleNamespace(
- info=lambda *args, **kwargs: None,
- warning=lambda *args, **kwargs: None,
- debug=lambda *args, **kwargs: None,
- )
-
- utils_module = types.ModuleType("fastdeploy.utils")
- utils_module.data_processor_logger = dummy_logger
-
- envs_module = types.ModuleType("fastdeploy.envs")
- envs_module.FD_USE_HF_TOKENIZER = False
-
- generation_module = types.ModuleType("paddleformers.generation")
-
- class DummyGenerationConfig:
- def __init__(self):
- self.top_p = 0.8
- self.temperature = 0.9
- self.repetition_penalty = 1.1
- self.frequency_penalty = 0.2
- self.presence_penalty = 0.1
-
- @classmethod
- def from_pretrained(cls, *args, **kwargs):
- return cls()
-
- generation_module.GenerationConfig = DummyGenerationConfig
-
- transformers_module = types.ModuleType("paddleformers.transformers")
- transformers_module.AutoTokenizer = DummyAutoTokenizer
- transformers_module.LlamaTokenizer = DummyLlamaTokenizer
- transformers_module.Llama3Tokenizer = DummyLlamaTokenizer
-
- hf_transformers_module = types.ModuleType("transformers")
- hf_transformers_module.AutoTokenizer = DummyHFTokenizer
-
- llm_utils_module = types.ModuleType("paddleformers.cli.utils.llm_utils")
- llm_utils_module.get_eos_token_id = lambda tokenizer, config: [tokenizer.eos_token_id]
-
- fastdeploy_module = types.ModuleType("fastdeploy")
- fastdeploy_module.__path__ = [str(repo_root / "fastdeploy")]
- fastdeploy_module.utils = utils_module
- fastdeploy_module.envs = envs_module
-
- return {
- "fastdeploy": fastdeploy_module,
- "fastdeploy.utils": utils_module,
- "fastdeploy.envs": envs_module,
- "paddleformers.generation": generation_module,
- "paddleformers.transformers": transformers_module,
- "transformers": hf_transformers_module,
- "paddleformers.cli.utils.llm_utils": llm_utils_module,
- }
-
-
-def _import_text_processor(use_hf_tokenizer=False):
- modules = _create_dummy_modules()
-
- modules["fastdeploy.envs"].FD_USE_HF_TOKENIZER = use_hf_tokenizer
-
- previous_modules = {}
- for name, module in modules.items():
- previous_modules[name] = sys.modules.get(name)
- sys.modules[name] = module
-
- try:
- text_processor_module = importlib.import_module("fastdeploy.input.v1.text_processor")
- importlib.reload(text_processor_module)
- except Exception:
- for name, original in previous_modules.items():
- if original is None:
- sys.modules.pop(name, None)
- else:
- sys.modules[name] = original
- raise
-
- def cleanup():
- sys.modules.pop("fastdeploy.input.text_processor", None)
- for name, original in previous_modules.items():
- if original is None:
- sys.modules.pop(name, None)
- else:
- sys.modules[name] = original
-
- return text_processor_module, cleanup
-
-
-class DummyRequest:
- def __init__(self, **kwargs):
- self.request_id = kwargs.get("request_id", "req")
- self.prompt = kwargs.get("prompt")
- self.prompt_token_ids = kwargs.get("prompt_token_ids")
- self.messages = kwargs.get("messages")
- self.eos_token_ids = kwargs.get("eos_token_ids")
- self.chat_template = kwargs.get("chat_template")
- self.enable_thinking = kwargs.get("enable_thinking")
- self.history = kwargs.get("history")
- self.tools = kwargs.get("tools")
- self.system = kwargs.get("system")
- self.sampling_params = SimpleNamespace(
- top_p=kwargs.get("top_p"),
- temperature=kwargs.get("temperature"),
- repetition_penalty=kwargs.get("repetition_penalty"),
- frequency_penalty=kwargs.get("frequency_penalty"),
- presence_penalty=kwargs.get("presence_penalty"),
- stop=kwargs.get("stop"),
- stop_token_ids=kwargs.get("stop_token_ids"),
- stop_seqs_len=kwargs.get("stop_seqs_len"),
- bad_words=kwargs.get("bad_words"),
- bad_words_token_ids=kwargs.get("bad_words_token_ids"),
- max_tokens=kwargs.get("max_tokens"),
- )
-
- def get(self, key, default=None):
- if hasattr(self, key) and getattr(self, key) is not None:
- return getattr(self, key)
- return getattr(self.sampling_params, key, default)
-
- def set(self, key, value):
- if hasattr(self.sampling_params, key):
- setattr(self.sampling_params, key, value)
- else:
- setattr(self, key, value)
-
- def to_dict(self):
- return {
- "request_id": self.request_id,
- "messages": self.messages,
- "prompt": self.prompt,
- "system": self.system,
- "history": self.history,
- "tools": self.tools,
- "chat_template": self.chat_template,
- "enable_thinking": self.enable_thinking,
- }
-
- def __getitem__(self, key):
- return self.get(key)
-
- def __setitem__(self, key, value):
- self.set(key, value)
-
-
-class DataProcessorTestCase(unittest.TestCase):
- @staticmethod
- def create_dummy_reasoning(tokenizer, reasoning_content="think"):
- class DummyReasoning:
- def __init__(self, tokenizer):
- self.tokenizer = tokenizer
-
- def extract_reasoning_content(self, full_text, response_dict, model_status):
- return reasoning_content, f"{full_text}!"
-
- return DummyReasoning(tokenizer)
-
- @staticmethod
- def create_dummy_tool_parser(tokenizer, content="tool-text"):
- class DummyToolParser:
- def __init__(self, tokenizer):
- self.tokenizer = tokenizer
-
- def extract_tool_calls(self, full_text, response_dict):
- return SimpleNamespace(tools_called=True, tool_calls=["tool"], content=content)
-
- return DummyToolParser
-
- def setUp(self):
- module, cleanup = _import_text_processor()
- self.text_processor_module = module
- self.addCleanup(cleanup)
- self.processor = self.text_processor_module.DataProcessor("stub-model")
-
- def test_base_data_processor_contract(self):
- text_processor_module = self.text_processor_module
-
- class MinimalProcessor(text_processor_module.BaseDataProcessor):
- def __init__(self):
- self.generation_config = SimpleNamespace(
- top_p=0.5,
- temperature=0.6,
- repetition_penalty=1.1,
- frequency_penalty=0.2,
- presence_penalty=0.3,
- )
- super().__init__()
-
- def _load_tokenizer(self):
- return DummyTokenizer()
-
- def process_request_dict(self, request, **kwargs):
- return super().process_request_dict(request, **kwargs)
-
- def process_response_dict(self, response_obj):
- return super().process_response_dict(response_obj)
-
- processor = MinimalProcessor()
- request = Request(request_id="test_0", sampling_params=SamplingParams())
- defaults = processor._apply_default_parameters(request)
- self.assertAlmostEqual(defaults.sampling_params.top_p, 0.5)
- with self.assertRaises(NotImplementedError):
- processor.process_request_dict({}, max_model_len=None)
- with self.assertRaises(NotImplementedError):
- processor.process_response_dict({})
- with self.assertRaises(NotImplementedError):
- processor.text2ids("text")
- with self.assertRaises(NotImplementedError):
- processor.messages2ids([])
- with self.assertRaises(NotImplementedError):
- processor.ids2tokens([1], "task")
-
- def test_process_request_dict_prompt_defaults(self):
- request = {"request_id": "test_0", "prompt": "hi", "temperature": 0, "top_p": 0, "stop": ["stop"]}
- request = Request.from_dict(request)
- processed = self.processor.process_request_dict(request, max_model_len=5)
-
- self.assertEqual(processed.prompt_token_ids, [2])
- self.assertEqual(processed.sampling_params.stop_token_ids, [[4]])
- self.assertEqual(processed.sampling_params.stop_seqs_len, [1])
- self.assertEqual(processed.sampling_params.temperature, 1)
- self.assertAlmostEqual(processed.sampling_params.top_p, 1e-5)
- self.assertEqual(processed.sampling_params.max_tokens, 4)
-
- def test_process_request_dict_messages_template(self):
- request = {
- "request_id": "chat",
- "messages": [{"role": "user", "content": "hello"}],
- "chat_template_kwargs": {"system": "system prompt"},
- }
- request = Request.from_dict(request)
- request.chat_template_kwargs = {"system": "system prompt"}
- processed = self.processor.process_request_dict(request, max_model_len=6)
-
- self.assertEqual(processed.prompt_token_ids, [len("system prompt hello")])
- self.assertEqual(processed.system, "system prompt")
- self.assertTrue(processed.enable_thinking)
- self.assertEqual(processed.prompt_tokens, "system prompt hello")
-
- def test_process_request_dictect_handles_sequences(self):
- request = DummyRequest(
- prompt=[1, 2, 3, 4, 5, 6],
- stop=["stop"],
- bad_words=["zz"],
- temperature=0,
- top_p=0,
- )
- processed = self.processor.process_request_dict(request, max_model_len=5)
-
- self.assertEqual(processed.prompt_token_ids, [1, 2, 3, 4])
- self.assertEqual(processed.sampling_params.max_tokens, 1)
- self.assertEqual(processed.sampling_params.stop_token_ids, [[4]])
- self.assertEqual(set(processed.sampling_params.bad_words_token_ids), {2, 3})
- self.assertEqual(processed.sampling_params.temperature, 1)
- self.assertAlmostEqual(processed.sampling_params.top_p, 1e-5)
-
- def test_process_request_requires_prompt_or_messages(self):
- request = DummyRequest(prompt=None, messages=None, prompt_token_ids=None)
- with self.assertRaisesRegex(ValueError, "Request must contain 'prompt_token_ids', 'prompt', or 'messages'"):
- self.processor.process_request_dict(request, max_model_len=5)
-
- def test_process_request_dict_rejects_bad_kwargs(self):
- request = {
- "request_id": "test_0",
- "messages": [{"role": "user", "content": "hi"}],
- "chat_template_kwargs": "invalid",
- }
- request = Request.from_dict(request)
- request.chat_template_kwargs = "invalid"
- request.sampling_params = SamplingParams()
- with self.assertRaisesRegex(ValueError, "chat_template_kwargs must be a dict"):
- self.processor.process_request_dict(request)
-
- def test_ids2tokens_and_clear_request_status(self):
- delta, _, _ = self.processor.ids2tokens([3], "task-1")
- self.assertEqual(delta, "3")
- delta, _, _ = self.processor.ids2tokens([4], "task-1")
- self.assertEqual(delta, "4")
-
- combined = self.processor.clear_request_status("task-1")
- self.assertEqual(combined, "34")
- self.assertNotIn("task-1", self.processor.decode_status)
-
- def test_clear_request_status_hf_branch(self):
- module, cleanup = _import_text_processor(use_hf_tokenizer=True)
- self.addCleanup(cleanup)
- processor = module.DataProcessor("stub-model")
- processor.decode_status = {"task": [[], [], "transcript"]}
-
- self.assertEqual(processor.clear_request_status("task"), "transcript")
- self.assertNotIn("task", processor.decode_status)
-
- def test_data_processor_init_handles_missing_generation_config(self):
- with mock.patch.object(
- self.text_processor_module.GenerationConfig,
- "from_pretrained",
- side_effect=OSError("missing"),
- ):
- processor = self.text_processor_module.DataProcessor("stub-model")
- self.assertIsNone(processor.generation_config)
-
- def test_process_response_with_reasoning_and_tools(self):
- processor = self.processor
- processor.model_status_dict = {"resp": "normal"}
-
- processor.reasoning_parser = self.create_dummy_reasoning(processor.tokenizer)
- processor.tool_parser_obj = self.create_dummy_tool_parser(processor.tokenizer, content="tool-only")
-
- response = SimpleNamespace(
- request_id="resp", outputs=SimpleNamespace(token_ids=[1, processor.tokenizer.eos_token_id]), finished=True
- )
-
- processed = processor.process_response_obj_normal(response)
- self.assertEqual(processed.outputs.text, "tool-only")
- self.assertEqual(processed.outputs.reasoning_content, "think")
- self.assertEqual(processed.outputs.tool_calls, ["tool"])
-
- def test_process_response_streaming_clears_state(self):
- processor = self.processor
- req_id = "stream"
- processor.decode_status[req_id] = [0, 0, [], ""]
- response = {"finished": True, "request_id": req_id, "outputs": {"token_ids": [7]}}
- response = RequestOutput.from_dict(response)
-
- result = processor.process_response_obj_streaming(response, enable_thinking=False)
- self.assertEqual(result.outputs.text, "7")
- self.assertNotIn(req_id, processor.decode_status)
-
- def test_process_response_obj_normal_with_reasoning(self):
- processor = self.processor
- processor.model_status_dict = {"normal": "normal"}
- processor.reasoning_parser = self.create_dummy_reasoning(processor.tokenizer, reasoning_content="because")
- processor.tool_parser_obj = self.create_dummy_tool_parser(processor.tokenizer, content="tool-text")
-
- response = {
- "finished": True,
- "request_id": "normal",
- "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]},
- }
- response = RequestOutput.from_dict(response)
-
- result = processor.process_response_obj_normal(response, enable_thinking=True)
- self.assertEqual(result.outputs.completion_tokens, "7")
- self.assertEqual(result.outputs.text, "tool-text")
- self.assertEqual(result.outputs.reasoning_content, "because")
- self.assertEqual(result.outputs.reasoning_token_num, 1)
-
- def test_process_response_dict_dispatch(self):
- processor = self.processor
- calls = {}
-
- def fake_stream(response_obj, **kwargs):
- calls["stream"] = kwargs
- return "stream"
-
- def fake_normal(response_obj, **kwargs):
- calls["normal"] = kwargs
- return "normal"
-
- original_stream = processor.process_response_obj_streaming
- original_normal = processor.process_response_obj_normal
- processor.process_response_obj_streaming = fake_stream
- processor.process_response_obj_normal = fake_normal
- self.addCleanup(lambda: setattr(processor, "process_response_obj_streaming", original_stream))
- self.addCleanup(lambda: setattr(processor, "process_response_obj_normal", original_normal))
-
- response = {"outputs": {}, "finished": False, "request_id": "req"}
- self.assertEqual(processor.process_response_dict(response, stream=True, enable_thinking=True), "stream")
- self.assertTrue(calls["stream"]["enable_thinking"])
- self.assertEqual(
- processor.process_response_dict(response, stream=False, enable_thinking=True),
- "normal",
- )
- self.assertTrue(calls["normal"]["enable_thinking"])
-
- def test_update_stop_seq_excludes_eos(self):
- stop_seqs, stop_len = self.processor.update_stop_seq(["stop", self.processor.tokenizer.eos_token_id])
- self.assertEqual(stop_seqs, [[4]])
- self.assertEqual(stop_len, [1])
-
- def test_pad_batch_data_left_padding(self):
- padded, lengths = self.processor.pad_batch_data(
- [[1], [2, 3]],
- pad_id=-1,
- return_seq_len=True,
- return_array=False,
- pad_style="left",
- )
- self.assertEqual(padded, [[-1, 1], [2, 3]])
- self.assertEqual(lengths, [1, 2])
-
- def test_pad_batch_data_empty_returns_array(self):
- padded, lengths = self.processor.pad_batch_data([], return_seq_len=True)
- self.assertEqual(padded.shape, (1, 0))
- self.assertEqual(lengths.shape, (0,))
-
- def test_get_pad_id_prefers_eos_when_missing(self):
- processor = self.text_processor_module.DataProcessor("stub-model")
- llama_tokenizer = DummyLlamaTokenizer()
- llama_tokenizer.pad_token_id = None
- llama_tokenizer.eos_token = 99
- processor.tokenizer = llama_tokenizer
-
- self.assertEqual(processor.get_pad_id(), 99)
-
- def test_load_tokenizer_hf_branch(self):
- module, cleanup = _import_text_processor(use_hf_tokenizer=True)
- self.addCleanup(cleanup)
- processor = module.DataProcessor("stub-model")
- self.assertIsInstance(processor.tokenizer, DummyTokenizer)
-
- def test_text2ids_hf_branch(self):
- module, cleanup = _import_text_processor(use_hf_tokenizer=True)
- self.addCleanup(cleanup)
- processor = module.DataProcessor("stub-model")
- ids = processor.text2ids("hi", max_model_len=5)
- self.assertEqual(ids.tolist(), [2, 0, 0, 0, 0][: len(ids)])
-
- def test_process_logprob_response(self):
- self.assertEqual(self.processor.process_logprob_response([1, 2]), "1 2")
-
- def test_process_request_dict_uses_existing_ids(self):
- request = {"request_id": "test_0", "prompt_token_ids": [1, 2, 3], "max_tokens": 5}
- request = Request.from_dict(request)
- processed = self.processor.process_request_dict(request, max_model_len=6)
- self.assertEqual(processed.prompt_token_ids, [1, 2, 3])
- self.assertEqual(processed.sampling_params.max_tokens, 3)
-
- def test_process_request_dict_requires_chat_template(self):
- original_template = self.processor.tokenizer.chat_template
- self.processor.tokenizer.chat_template = None
- self.addCleanup(lambda: setattr(self.processor.tokenizer, "chat_template", original_template))
- with self.assertRaisesRegex(ValueError, "chat_template"):
- request = {"request_id": "test_0", "messages": [{"role": "user", "content": "hi"}]}
- request = Request.from_dict(request)
- self.processor.process_request_dict(request)
-
- def test_update_bad_words_with_warnings(self):
- processor = self.processor
-
- def custom_tokenize(text):
- base = text.strip()
- if base == "combo":
- return ["co", "mbo"]
- if base == "oversize":
- return [base]
- return [base]
-
- def custom_convert(tokens):
- if tokens == ["co", "mbo"]:
- return [1, 2]
- if tokens == ["oversize"]:
- return [processor.tokenizer.vocab_size + 1]
- return [len(tokens[0])]
-
- original_tokenize = processor.tokenizer.tokenize
- original_convert = processor.tokenizer.convert_tokens_to_ids
- processor.tokenizer.tokenize = custom_tokenize
- processor.tokenizer.convert_tokens_to_ids = custom_convert
- self.addCleanup(lambda: setattr(processor.tokenizer, "tokenize", original_tokenize))
- self.addCleanup(lambda: setattr(processor.tokenizer, "convert_tokens_to_ids", original_convert))
-
- self.assertEqual(processor.update_bad_words(["combo", "oversize"], []), [])
-
-
-if __name__ == "__main__":
- unittest.main()
diff --git a/tests/input/v1/test_tokenizer_client.py b/tests/input/v1/test_tokenizer_client.py
deleted file mode 100644
index 06804ebade3..00000000000
--- a/tests/input/v1/test_tokenizer_client.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-import httpx
-import pytest
-import respx
-
-from fastdeploy.input.tokenzier_client import (
- AsyncTokenizerClient,
- ImageEncodeRequest,
- VideoEncodeRequest,
-)
-
-
-@pytest.mark.asyncio
-@respx.mock
-async def test_encode_image_success():
- base_url = "http://testserver"
- client = AsyncTokenizerClient(base_url=base_url)
-
- # Mock 创建任务接口
- respx.post(f"{base_url}/image/encode").mock(
- return_value=httpx.Response(200, json={"code": 0, "task_tag": "task123"})
- )
- # Mock 轮询接口,返回完成状态
- mock_get_ret = {
- "state": "Finished",
- "result": {"feature_url": "bos://host:port/key", "feature_shape": [80, 45, 1563]},
- }
- respx.get(f"{base_url}/encode/get").mock(return_value=httpx.Response(200, json=mock_get_ret))
-
- request = ImageEncodeRequest(
- version="v1", req_id="req_img_001", is_gen=False, resolution=512, image_url="http://example.com/image.jpg"
- )
-
- result = await client.encode_image(request)
- assert result["feature_url"] == "bos://host:port/key"
- assert result["feature_shape"] == [80, 45, 1563]
-
-
-@pytest.mark.asyncio
-@respx.mock
-async def test_encode_video_failure():
- base_url = "http://testserver"
- client = AsyncTokenizerClient(base_url=base_url, max_wait=1)
-
- respx.post(f"{base_url}/video/encode").mock(
- return_value=httpx.Response(200, json={"code": 0, "task_tag": "task_vid_001"})
- )
- # 模拟轮询接口失败状态
- respx.get(f"{base_url}/encode/get").mock(
- return_value=httpx.Response(200, json={"state": "Error", "message": "Encode failed"})
- )
-
- request = VideoEncodeRequest(
- version="v1",
- req_id="req_vid_001",
- is_gen=True,
- resolution=720,
- video_url="http://example.com/video.mp4",
- start_ts=0.0,
- end_ts=10.0,
- frames=30,
- vit_merge=True,
- )
-
- with pytest.raises(RuntimeError, match="Encode failed"):
- await client.encode_video(request)
-
-
-@pytest.mark.asyncio
-@respx.mock
-async def test_encode_timeout():
- base_url = "http://testserver"
- client = AsyncTokenizerClient(base_url=base_url, max_wait=1, poll_interval=0.1)
-
- respx.post(f"{base_url}/image/encode").mock(
- return_value=httpx.Response(200, json={"code": 0, "task_tag": "task_timeout"})
- )
- # 模拟轮询接口一直返回等待状态,导致超时
- respx.get(f"{base_url}/encode/get").mock(return_value=httpx.Response(200, json={"status": "processing"}))
-
- request = ImageEncodeRequest(
- version="v1", req_id="req_img_timeout", is_gen=False, resolution=256, image_url="http://example.com/image.jpg"
- )
-
- with pytest.raises(TimeoutError):
- await client.encode_image(request)
diff --git a/tests/inter_communicator/test_zmq_server.py b/tests/inter_communicator/test_zmq_server.py
index 629551b1707..57c9a0c479a 100644
--- a/tests/inter_communicator/test_zmq_server.py
+++ b/tests/inter_communicator/test_zmq_server.py
@@ -9,7 +9,6 @@
import types
import unittest
from collections import defaultdict
-from multiprocessing.reduction import ForkingPickler
from unittest import mock
import msgpack
@@ -264,19 +263,6 @@ def send(self, msg, flags=0, **kwargs):
with self.assertRaises(RuntimeError):
server.send_pyobj({"boom": True})
- def test_pack_aggregated_data_respects_env_flag(self):
- server = _DummyServer()
- responses = [_DummyResponse(1), _DummyResponse(2, finished=True)]
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- packed = server.pack_aggregated_data(responses)
- unpacked = ForkingPickler.loads(packed)
- self.assertEqual(unpacked[0]["tensor_sum"], 3)
-
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
- packed = server.pack_aggregated_data(responses)
- unpacked = ForkingPickler.loads(packed)
- self.assertIsInstance(unpacked[0], _DummyResponse)
-
def test_receive_json_once_paths(self):
fake_socket = _FakeSocket()
fake_socket.closed = True
@@ -360,8 +346,7 @@ def test_send_response_per_query_cache_and_flush(self):
self.assertIn(req_id, server.cached_results)
server.req_dict[req_id] = b"client"
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- server._send_response_per_query(req_id, [_DummyResponse(4, finished=True)])
+ server._send_response_per_query(req_id, [_DummyResponse(4, finished=True)])
self.assertNotIn(req_id, server.req_dict)
self.assertEqual(fake_socket.sent[-1][0], "send_multipart")
@@ -370,17 +355,7 @@ def test_send_response_per_query_aggregate(self):
server = _DummyServer(socket=fake_socket)
server.req_dict["req-agg"] = b"client"
server.aggregate_send = True
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- server._send_response_per_query("req-agg", [_DummyResponse(5, finished=True)])
- self.assertEqual(fake_socket.sent[-1][0], "send_multipart")
-
- def test_send_response_per_query_v1_processor(self):
- fake_socket = _FakeSocket()
- server = _DummyServer(socket=fake_socket)
- server.req_dict["req-v1"] = b"client"
- server.aggregate_send = False
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
- server._send_response_per_query("req-v1", [_DummyResponse(6, finished=True)])
+ server._send_response_per_query("req-agg", [_DummyResponse(5, finished=True)])
self.assertEqual(fake_socket.sent[-1][0], "send_multipart")
def test_send_response_per_query_send_failure(self):
@@ -391,8 +366,7 @@ def send_multipart(self, parts, copy=True):
server = _DummyServer(socket=_ErrorSocket())
server.req_dict["req-error"] = b"client"
server.aggregate_send = False
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- server._send_response_per_query("req-error", [_DummyResponse(7, finished=True)])
+ server._send_response_per_query("req-error", [_DummyResponse(7, finished=True)])
self.assertEqual(server.req_dict, {})
def test_send_response_per_query_raises_without_socket(self):
@@ -436,22 +410,11 @@ def test_send_batch_response_success(self):
fake_socket = _FakeSocket()
server = _DummyServer(socket=fake_socket)
server.address = "test-address"
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- batch_data = [[_DummyResponse(1, finished=True)]]
- server._send_batch_response(batch_data)
+ batch_data = [[_DummyResponse(1, finished=True)]]
+ server._send_batch_response(batch_data)
self.assertEqual(len(fake_socket.sent), 1)
self.assertEqual(fake_socket.sent[0][0], "send")
- def test_send_batch_response_v1_processor(self):
- """Test _send_batch_response with ENABLE_V1_DATA_PROCESSOR=True"""
- fake_socket = _FakeSocket()
- server = _DummyServer(socket=fake_socket)
- server.address = "test-address"
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True):
- batch_data = [[_DummyResponse(1, finished=True)]]
- server._send_batch_response(batch_data)
- self.assertEqual(len(fake_socket.sent), 1)
-
def test_send_batch_response_raises_without_socket(self):
"""Test _send_batch_response logs error and returns when socket is None"""
server = _DummyServer(socket=None)
@@ -470,9 +433,8 @@ def send(self, msg, flags=0, **kwargs):
server = _DummyServer(socket=_ErrorSocket())
server.address = "test-address"
batch_data = [[_DummyResponse(1)]]
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- # Should not raise, error is caught and logged
- server._send_batch_response(batch_data)
+ # Should not raise, error is caught and logged
+ server._send_batch_response(batch_data)
def test_recv_result_handle_paths(self):
fake_socket = _FakeSocket()
@@ -640,10 +602,9 @@ def test_send_batch_response_with_worker_pid_none_uses_default_socket(self):
server = _DummyServer(socket=fake_socket)
server.address = "test-address"
- with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False):
- batch_data = [[_DummyResponse(1, finished=True)]]
- # worker_pid=None -> goes to the else branch that calls _ensure_socket / uses self.socket
- server._send_batch_response(batch_data, worker_pid=None)
+ batch_data = [[_DummyResponse(1, finished=True)]]
+ # worker_pid=None -> goes to the else branch that calls _ensure_socket / uses self.socket
+ server._send_batch_response(batch_data, worker_pid=None)
# The default socket should have been used to send the data
self.assertEqual(len(fake_socket.sent), 1)
diff --git a/tests/model_executor/test_thinking_budget.py b/tests/model_executor/test_thinking_budget.py
index 8ba9319ff7d..d9dd8f4b9b9 100644
--- a/tests/model_executor/test_thinking_budget.py
+++ b/tests/model_executor/test_thinking_budget.py
@@ -27,13 +27,6 @@
Ernie4_5_VLProcessor as ErnieVLDataProcessor,
)
from fastdeploy.input.text_processor import DataProcessor as TextDataProcessor
-from fastdeploy.input.v1.ernie4_5_processor import (
- Ernie4_5Processor as V1ErnieTextDataProcessor,
-)
-from fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor import (
- Ernie4_5_VLProcessor as V1ErnieVLDataProcessor,
-)
-from fastdeploy.input.v1.text_processor import DataProcessor as V1TextDataProcessor
from fastdeploy.model_executor.logits_processor import ThinkingBudgetLogitsProcessor
from fastdeploy.scheduler import SchedulerConfig
@@ -711,31 +704,6 @@ def parallel_config(self):
ips = None
-class DummyRequestV1(SimpleNamespace):
- def get(self, key, default=None):
- if hasattr(self, key):
- value = getattr(self, key)
- if value is not None:
- return value
- if hasattr(self, "sampling_params") and hasattr(self.sampling_params, key):
- value = getattr(self.sampling_params, key)
- if value is not None:
- return value
- return default
-
- def __getitem__(self, key):
- return getattr(self, key)
-
- def __setitem__(self, key, value):
- setattr(self, key, value)
-
- def set(self, key, value):
- if hasattr(self, "sampling_params") and hasattr(self.sampling_params, key):
- setattr(self.sampling_params, key, value)
- else:
- setattr(self, key, value)
-
-
class TestThinkingBudgetSupplemental(unittest.TestCase):
def test_update_thinking_prompt_state_from_text_processor(self):
processor = TextDataProcessor.__new__(TextDataProcessor)
@@ -750,43 +718,6 @@ def test_update_thinking_prompt_state_from_text_processor(self):
self.assertEqual(updated["think_prompt_tokens_after_start"], 0)
self.assertEqual(updated["think_prompt_last_token_id"], 3)
- def test_v1_process_request_missing_logits_processors_args(self):
- processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
- processor.generation_config = SimpleNamespace(
- top_p=0.7,
- temperature=1.0,
- repetition_penalty=1.0,
- frequency_penalty=0.0,
- presence_penalty=0.0,
- )
- processor.eos_token_ids = [1]
- processor.update_stop_seq = lambda *args, **kwargs: None
- processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
- processor.encode_with_cache = lambda *args, **kwargs: [1]
- processor._update_thinking_prompt_state = lambda prompt_token_ids, args: args
- processor.reasoning_parser = None
- request = DummyRequestV1(
- request_id="req",
- eos_token_ids=None,
- prompt_token_ids=[1],
- prompt=None,
- messages=None,
- max_tokens=1,
- chat_template_kwargs=None,
- sampling_params=SimpleNamespace(
- bad_words=None,
- bad_words_token_ids=None,
- max_tokens=1,
- temperature=1.0,
- top_p=0.9,
- repetition_penalty=1.0,
- frequency_penalty=0.0,
- presence_penalty=0.0,
- ),
- )
- with patch("fastdeploy.input.v1.text_processor.process_stop_token_ids", lambda *args, **kwargs: None):
- processor.process_request(request, max_model_len=8)
-
def test_engine_line_break_id_from_dict(self):
tokenizer = DummyTokenizerForTextProcessor()
data_processor = SimpleNamespace(tokenizer=tokenizer, eos_token_id_len=1, pad_token_id=0)
@@ -835,27 +766,6 @@ def _text2ids(text, max_model_len=None, add_special_tokens=False):
self.assertEqual(processor.encode_with_cache("iter"), [21, 22])
self.assertNotIn(("np", False), processor._tokenize_cache)
- def test_v1_encode_with_cache_branches(self):
- processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
- processor._tokenize_cache = OrderedDict()
- processor._tokenize_cache_capacity = 1
- call_counter = {"np": 0, "iter": 0}
-
- def _text2ids(text, max_model_len=None, add_special_tokens=False):
- if text == "np":
- call_counter["np"] += 1
- return np.array([31, 32], dtype=np.int64)
- call_counter["iter"] += 1
- return (v for v in [41, 42])
-
- processor.text2ids = _text2ids
-
- self.assertEqual(processor.encode_with_cache("np"), [31, 32])
- self.assertEqual(processor.encode_with_cache("np"), [31, 32])
- self.assertEqual(call_counter["np"], 1)
- self.assertEqual(processor.encode_with_cache("iter"), [41, 42])
- self.assertNotIn(("np", False), processor._tokenize_cache)
-
def test_text_encode_with_cache_lazy_init(self):
processor = TextDataProcessor.__new__(TextDataProcessor)
call_counter = {"count": 0}
@@ -872,22 +782,6 @@ def _text2ids(text, max_model_len=None, add_special_tokens=False):
self.assertEqual(processor.encode_with_cache("lazy"), [51, 52])
self.assertEqual(call_counter["count"], 1)
- def test_v1_encode_with_cache_lazy_init(self):
- processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
- call_counter = {"count": 0}
-
- def _text2ids(text, max_model_len=None, add_special_tokens=False):
- call_counter["count"] += 1
- return np.array([61, 62], dtype=np.int64)
-
- processor.text2ids = _text2ids
-
- self.assertFalse(hasattr(processor, "_tokenize_cache"))
- self.assertEqual(processor.encode_with_cache("lazy"), [61, 62])
- self.assertTrue(hasattr(processor, "_tokenize_cache"))
- self.assertEqual(processor.encode_with_cache("lazy"), [61, 62])
- self.assertEqual(call_counter["count"], 1)
-
def test_ernie_encode_literal_text_with_cache(self):
processor = ErnieTextDataProcessor.__new__(ErnieTextDataProcessor)
processor.tokenizer = SimpleNamespace(
@@ -898,16 +792,6 @@ def test_ernie_encode_literal_text_with_cache(self):
self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [71, 72])
self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [71, 72])
- def test_v1_ernie_encode_literal_text_with_cache(self):
- processor = V1ErnieTextDataProcessor.__new__(V1ErnieTextDataProcessor)
- processor.tokenizer = SimpleNamespace(
- tokenize=lambda text: ["token_c", "token_d"],
- convert_tokens_to_ids=lambda tokens: [81, 82],
- )
-
- self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [81, 82])
- self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [81, 82])
-
def test_text_update_thinking_prompt_state_branches(self):
processor = TextDataProcessor.__new__(TextDataProcessor)
processor._think_token_ids = None
@@ -949,29 +833,6 @@ def test_text_update_thinking_prompt_state_branches(self):
# 命中 _get_think_token_ids 的缓存分支
self.assertEqual(processor._get_think_token_ids(), (THINKING_START_TOKEN_ID, THINKING_END_TOKEN_ID))
- def test_v1_update_thinking_prompt_state_branches(self):
- processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
- processor._think_token_ids = None
- processor.tokenizer = DummyTokenizerForTextProcessor()
-
- self.assertEqual(processor._update_thinking_prompt_state([1], "not-dict"), "not-dict")
- self.assertEqual(
- processor._update_thinking_prompt_state([1], {"thinking_budget": -1}), {"thinking_budget": -1}
- )
- self.assertEqual(processor._update_thinking_prompt_state(None, {"thinking_budget": 1}), {"thinking_budget": 1})
-
- with_start_no_end = processor._update_thinking_prompt_state(
- np.array([1, THINKING_START_TOKEN_ID, 2, 3], dtype=np.int64),
- {"thinking_budget": 4},
- )
- self.assertTrue(with_start_no_end["think_prompt_started"])
- self.assertFalse(with_start_no_end["think_prompt_ended"])
- self.assertEqual(with_start_no_end["think_prompt_tokens_after_start"], 0)
- self.assertEqual(with_start_no_end["think_prompt_last_token_id"], 3)
-
- # 命中 _get_think_token_ids 的缓存分支
- self.assertEqual(processor._get_think_token_ids(), (THINKING_START_TOKEN_ID, THINKING_END_TOKEN_ID))
-
def test_text_process_request_dict_think_stop_sentence(self):
processor = TextDataProcessor.__new__(TextDataProcessor)
processor._apply_default_parameters = lambda request: request
@@ -1003,74 +864,6 @@ def test_text_process_request_dict_think_stop_sentence(self):
)
self.assertNotIn("think_stop_sentence", processed["logits_processors_args"])
- def test_v1_process_request_think_stop_sentence(self):
- processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
- processor._apply_default_parameters = lambda request: request
- processor.eos_token_ids = [1]
- processor.update_stop_seq = lambda *args, **kwargs: None
- processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
- processor._encode_literal_text_with_cache = lambda text: [301, 302]
- processor._update_thinking_prompt_state = lambda prompt_token_ids, args: args
- processor.reasoning_parser = None
-
- request = DummyRequestV1(
- request_id="req_v1",
- eos_token_ids=[1],
- prompt_token_ids=[10],
- prompt=None,
- messages=None,
- logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"},
- bad_words=None,
- bad_words_token_ids=None,
- max_tokens=1,
- temperature=1.0,
- top_p=0.9,
- )
- with patch("fastdeploy.input.v1.text_processor.process_stop_token_ids", lambda *args, **kwargs: None):
- processed = processor.process_request(request, max_model_len=16)
- self.assertEqual(
- processed.logits_processors_args.get("think_stop_sentence_token_ids"),
- [301, 302],
- )
- self.assertNotIn("think_stop_sentence", processed.logits_processors_args)
-
- def test_v1_process_request_dict_think_stop_sentence(self):
- processor = V1TextDataProcessor.__new__(V1TextDataProcessor)
- processor._apply_default_parameters = lambda request: request
- processor.eos_token_ids = [1]
- processor.update_stop_seq = lambda *args, **kwargs: None
- processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
- processor._encode_literal_text_with_cache = lambda text: [401, 402]
- processor._update_thinking_prompt_state = lambda prompt_token_ids, args: args
- processor.reasoning_parser = None
-
- request = DummyRequestV1(
- request_id="req_v1_dict",
- eos_token_ids=[1],
- prompt_token_ids=[11],
- prompt=None,
- messages=None,
- chat_template_kwargs=None,
- sampling_params=SimpleNamespace(
- bad_words=None,
- bad_words_token_ids=None,
- max_tokens=1,
- temperature=1.0,
- top_p=0.9,
- repetition_penalty=1.0,
- frequency_penalty=0.0,
- presence_penalty=0.0,
- logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"},
- ),
- )
- with patch("fastdeploy.input.v1.text_processor.process_stop_token_ids", lambda *args, **kwargs: None):
- processed = processor.process_request_dict(request, max_model_len=16)
- self.assertEqual(
- processed.sampling_params.logits_processors_args.get("think_stop_sentence_token_ids"),
- [401, 402],
- )
- self.assertNotIn("think_stop_sentence", processed.sampling_params.logits_processors_args)
-
def test_ernie_process_request_dict_prepares_thinking_budget_args(self):
processor = ErnieTextDataProcessor.__new__(ErnieTextDataProcessor)
processor._apply_default_parameters = lambda request: request
@@ -1104,46 +897,6 @@ def test_ernie_process_request_dict_prepares_thinking_budget_args(self):
self.assertFalse(processed["logits_processors_args"]["think_prompt_ended"])
self.assertEqual(processed["logits_processors_args"]["think_prompt_tokens_after_start"], 0)
- def test_v1_ernie_process_request_dict_prepares_thinking_budget_args(self):
- processor = V1ErnieTextDataProcessor.__new__(V1ErnieTextDataProcessor)
- processor._apply_default_parameters = lambda request: request
- processor.eos_token_ids = [1]
- processor.update_stop_seq = lambda *args, **kwargs: None
- processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
- processor._encode_literal_text_with_cache = lambda text: [601, 602]
- processor.tokenizer = DummyTokenizerForTextProcessor()
- processor.reasoning_parser = None
-
- request = DummyRequestV1(
- request_id="req_v1_ernie_text",
- eos_token_ids=[1],
- prompt_token_ids=[1, THINKING_START_TOKEN_ID, 2],
- prompt=None,
- messages=None,
- chat_template_kwargs=None,
- enable_thinking=True,
- sampling_params=SimpleNamespace(
- bad_words=None,
- bad_words_token_ids=None,
- max_tokens=1,
- temperature=1.0,
- top_p=0.9,
- repetition_penalty=1.0,
- frequency_penalty=0.0,
- presence_penalty=0.0,
- response_max_tokens=None,
- n=1,
- logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"},
- ),
- )
- with patch("fastdeploy.input.v1.ernie4_5_processor.process_stop_token_ids", lambda *args, **kwargs: None):
- processed = processor.process_request_dict(request, max_model_len=16)
-
- self.assertEqual(processed.sampling_params.logits_processors_args["think_stop_sentence_token_ids"], [601, 602])
- self.assertTrue(processed.sampling_params.logits_processors_args["think_prompt_started"])
- self.assertFalse(processed.sampling_params.logits_processors_args["think_prompt_ended"])
- self.assertEqual(processed.sampling_params.logits_processors_args["think_prompt_tokens_after_start"], 0)
-
def test_ernie_vl_process_request_dict_prepares_thinking_budget_args(self):
processor = ErnieVLDataProcessor.__new__(ErnieVLDataProcessor)
processor._apply_default_parameters = lambda request: request
@@ -1182,58 +935,6 @@ def test_ernie_vl_process_request_dict_prepares_thinking_budget_args(self):
self.assertFalse(processed["logits_processors_args"]["think_prompt_ended"])
self.assertEqual(processed["logits_processors_args"]["think_prompt_tokens_after_start"], 0)
- def test_v1_ernie_vl_process_request_dict_prepares_thinking_budget_args(self):
- processor = V1ErnieVLDataProcessor.__new__(V1ErnieVLDataProcessor)
- processor._apply_default_parameters = lambda request: request
- processor.eos_token_ids = [1]
- processor.update_stop_seq = lambda *args, **kwargs: None
- processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids
- processor._encode_literal_text_with_cache = lambda text: [801, 802]
- processor.tokenizer = DummyTokenizerForTextProcessor()
- processor.reasoning_parser = None
- processor._check_mm_limits = lambda *args, **kwargs: None
- processor.append_completion_tokens = lambda *args, **kwargs: None
- processor.pack_outputs = lambda outs: outs
- processor.ernie4_5_processor = SimpleNamespace(
- request2ids=lambda request: {"input_ids": np.array([1, THINKING_START_TOKEN_ID, 2], dtype=np.int64)}
- )
-
- request = DummyRequestV1(
- request_id="req_v1_ernie_vl",
- eos_token_ids=[1],
- prompt_token_ids=None,
- prompt=None,
- messages=[{"role": "user", "content": "hi"}],
- chat_template_kwargs=None,
- enable_thinking=True,
- completion_token_ids=None,
- multimodal_data=None,
- sampling_params=SimpleNamespace(
- bad_words=None,
- bad_words_token_ids=None,
- max_tokens=1,
- temperature=1.0,
- top_p=0.9,
- repetition_penalty=1.0,
- frequency_penalty=0.0,
- presence_penalty=0.0,
- response_max_tokens=None,
- reasoning_max_tokens=None,
- n=1,
- logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"},
- ),
- )
- with patch(
- "fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.process_stop_token_ids",
- lambda *args, **kwargs: None,
- ):
- processed = processor.process_request_dict(request, max_model_len=16)
-
- self.assertEqual(processed.sampling_params.logits_processors_args["think_stop_sentence_token_ids"], [801, 802])
- self.assertTrue(processed.sampling_params.logits_processors_args["think_prompt_started"])
- self.assertFalse(processed.sampling_params.logits_processors_args["think_prompt_ended"])
- self.assertEqual(processed.sampling_params.logits_processors_args["think_prompt_tokens_after_start"], 0)
-
if __name__ == "__main__":
unittest.main()