diff --git a/fastdeploy/engine/async_llm.py b/fastdeploy/engine/async_llm.py index 3f99388d685..4afb3dc5c49 100644 --- a/fastdeploy/engine/async_llm.py +++ b/fastdeploy/engine/async_llm.py @@ -446,7 +446,7 @@ async def add_request( ) if envs.ZMQ_SEND_BATCH_DATA and self.connection_manager is not None: request["zmq_worker_pid"] = self.connection_manager.worker_pid - if not envs.ENABLE_V1_DATA_PROCESSOR and self.cfg.model_config.enable_mm: + if self.cfg.model_config.enable_mm: self.request_client.send_pyobj(request) else: self.request_client.send_json(request) @@ -543,8 +543,7 @@ async def generate( ) else: processed_output = response_item - if not envs.ENABLE_V1_DATA_PROCESSOR: - processed_output = RequestOutput.from_dict(processed_output) + processed_output = RequestOutput.from_dict(processed_output) # Enrich outputs with prompt metadata on the first packet if req_id: prompt_meta = self._prompt_metadata.get(req_id) diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 28776b53ede..a2e72a1c844 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -1165,7 +1165,7 @@ def _insert_zmq_task_to_scheduler(self): while self.running: try: block = True if len(added_requests) == 0 else False - if not self.cfg.model_config.enable_mm and not envs.ENABLE_V1_DATA_PROCESSOR: + if not self.cfg.model_config.enable_mm: err, data = self.recv_request_server.receive_json_once(block) else: err, data = self.recv_request_server.receive_pyobj_once(block) @@ -1222,8 +1222,7 @@ def _insert_zmq_task_to_scheduler(self): continue err_msg = None try: - if not envs.ENABLE_V1_DATA_PROCESSOR: - request = Request.from_dict(data) + request = Request.from_dict(data) request.metrics.scheduler_recv_req_time = time.time() main_process_metrics.requests_number.inc() trace_carrier = data.get("trace_carrier") diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index 3f311e743fd..f03a18594de 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -437,7 +437,7 @@ async def add_requests(self, task): def _send_task(self, task): if envs.ZMQ_SEND_BATCH_DATA: task["zmq_worker_pid"] = self.worker_pid - if not self.enable_mm and not envs.ENABLE_V1_DATA_PROCESSOR: + if not self.enable_mm: self.zmq_client.send_json(task) else: if envs.FD_ENABLE_E2W_TENSOR_CONVERT: @@ -599,7 +599,7 @@ async def run_control_method(self, request: ControlRequest): req_dict = request.to_dict() if envs.ZMQ_SEND_BATCH_DATA: req_dict["zmq_worker_pid"] = self.worker_pid - if not self.enable_mm and not envs.ENABLE_V1_DATA_PROCESSOR: + if not self.enable_mm: self.zmq_client.send_json(req_dict) else: self.zmq_client.send_pyobj(req_dict) diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 9d380b0db0c..09e06ffa0f6 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -26,7 +26,7 @@ import fastdeploy.envs as envs import fastdeploy.metrics.trace as tracing -from fastdeploy.engine.request import Request, RequestOutput +from fastdeploy.engine.request import RequestOutput from fastdeploy.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, @@ -145,10 +145,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest): prompt_tokens = None max_tokens = None try: - if not envs.ENABLE_V1_DATA_PROCESSOR: - current_req_dict = request.to_dict_for_infer(f"{request_id}_0") - else: - current_req_dict = Request.from_generic_request(request, request_id=f"{request_id}_0") + current_req_dict = request.to_dict_for_infer(f"{request_id}_0") if "chat_template" not in current_req_dict: current_req_dict["chat_template"] = self.chat_template current_req_dict["metrics"]["arrival_time"] = time.time() diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index 4caf9fe210a..9c2b386fce0 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -27,7 +27,7 @@ import fastdeploy.envs as envs import fastdeploy.metrics.trace as tracing -from fastdeploy.engine.request import Request, RequestOutput +from fastdeploy.engine.request import RequestOutput from fastdeploy.entrypoints.openai.protocol import ( CompletionLogprobs, CompletionRequest, @@ -178,10 +178,7 @@ async def create_completion(self, request: CompletionRequest): try: for idx, prompt in enumerate(request_prompts): request_id_idx = f"{request_id}_{idx}" - if not envs.ENABLE_V1_DATA_PROCESSOR: - current_req_dict = request.to_dict_for_infer(request_id_idx, prompt) - else: - current_req_dict = Request.from_generic_request(request, request_id=f"{request_id}_0") + current_req_dict = request.to_dict_for_infer(request_id_idx, prompt) current_req_dict["metrics"]["arrival_time"] = time.time() prompt_token_ids = await self.engine_client.format_and_add_data(current_req_dict) # tokenize if isinstance(prompt_token_ids, np.ndarray): diff --git a/fastdeploy/entrypoints/openai/serving_embedding.py b/fastdeploy/entrypoints/openai/serving_embedding.py index ec3223b3576..25f3f630510 100644 --- a/fastdeploy/entrypoints/openai/serving_embedding.py +++ b/fastdeploy/entrypoints/openai/serving_embedding.py @@ -15,20 +15,17 @@ """ import base64 -import time from collections.abc import AsyncGenerator from typing import Literal, Union import numpy as np from typing_extensions import assert_never, override -import fastdeploy.envs as envs from fastdeploy.engine.pooling_params import PoolingParams from fastdeploy.engine.request import ( EmbeddingOutput, EmbeddingRequestOutput, PoolingRequestOutput, - Request, ) from fastdeploy.entrypoints.openai.protocol import ( EmbeddingCompletionRequest, @@ -69,25 +66,13 @@ def __init__(self, engine_client, models, cfg, pid, ips, max_waiting_time, chat_ @override def _request_to_dict(self, ctx: ServeContext): request: EmbeddingRequest = ctx.request - if not envs.ENABLE_V1_DATA_PROCESSOR: - request_dict = super()._request_to_dict(ctx) - if hasattr(request, "to_pooling_params"): - pooling_params: PoolingParams = request.to_pooling_params() - pooling_params.verify("embed", self.cfg.model_config) - request_dict["pooling_params"] = pooling_params.to_dict() - request_dict["metrics"] = {} - return request_dict - else: - request_obj = None - if hasattr(request, "to_pooling_params"): - pooling_params: PoolingParams = request.to_pooling_params() - pooling_params.verify("embed", self.cfg.model_config) - request_obj = Request.from_generic_request( - req=request, request_id=ctx.request_id, pooling_params=pooling_params - ) - request_obj.metrics.arrival_time = time.time() - super()._process_chat_template_kwargs(request_obj) - return request_obj + request_dict = super()._request_to_dict(ctx) + if hasattr(request, "to_pooling_params"): + pooling_params: PoolingParams = request.to_pooling_params() + pooling_params.verify("embed", self.cfg.model_config) + request_dict["pooling_params"] = pooling_params.to_dict() + request_dict["metrics"] = {} + return request_dict @override def _request_to_batch_dicts(self, ctx: ServeContext): diff --git a/fastdeploy/entrypoints/openai/serving_reward.py b/fastdeploy/entrypoints/openai/serving_reward.py index cbde62deea5..cc3ed8a4729 100644 --- a/fastdeploy/entrypoints/openai/serving_reward.py +++ b/fastdeploy/entrypoints/openai/serving_reward.py @@ -14,14 +14,12 @@ # limitations under the License. """ -import time from collections.abc import AsyncGenerator from typing_extensions import override -import fastdeploy.envs as envs from fastdeploy.engine.pooling_params import PoolingParams -from fastdeploy.engine.request import PoolingRequestOutput, Request, RewardRequestOutput +from fastdeploy.engine.request import PoolingRequestOutput, RewardRequestOutput from fastdeploy.entrypoints.openai.protocol import ( ChatRewardData, ChatRewardRequest, @@ -46,25 +44,13 @@ def __init__(self, engine_client, models, cfg, pid, ips, max_waiting_time, chat_ @override def _request_to_dict(self, ctx: ServeContext): request: ChatRewardRequest = ctx.request - if not envs.ENABLE_V1_DATA_PROCESSOR: - request_dict = super()._request_to_dict(ctx) - if hasattr(request, "to_pooling_params"): - pooling_params: PoolingParams = request.to_pooling_params() - pooling_params.verify("reward", self.cfg.model_config) - request_dict["pooling_params"] = pooling_params.to_dict() - request_dict["metrics"] = {} - return request_dict - else: - request_obj: Request = None - if hasattr(request, "to_pooling_params"): - pooling_params: PoolingParams = request.to_pooling_params() - pooling_params.verify("reward", self.cfg.model_config) - request_obj = Request.from_generic_request( - req=request, request_id=ctx.request_id, pooling_params=pooling_params - ) - request_obj.metrics.arrival_time = time.time() - super()._process_chat_template_kwargs(request_obj) - return request_obj + request_dict = super()._request_to_dict(ctx) + if hasattr(request, "to_pooling_params"): + pooling_params: PoolingParams = request.to_pooling_params() + pooling_params.verify("reward", self.cfg.model_config) + request_dict["pooling_params"] = pooling_params.to_dict() + request_dict["metrics"] = {} + return request_dict @override def _request_to_batch_dicts(self, ctx: ServeContext): diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 72cd6dc7c48..fef58eaf6cc 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -94,8 +94,6 @@ def _validate_split_kv_size(value: int) -> int: "EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"), # enable kv cache block scheduler v1 (no need for kv_cache_ratio) "ENABLE_V1_KVCACHE_SCHEDULER": lambda: int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "1")), - # enable data processor v2 - "ENABLE_V1_DATA_PROCESSOR": lambda: int(os.getenv("ENABLE_V1_DATA_PROCESSOR", "0")), # set prealloc block num for decoder "FD_ENC_DEC_BLOCK_NUM": lambda: int(os.getenv("FD_ENC_DEC_BLOCK_NUM", "2")), # enbale max prefill of one execute step diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py index 04c028d9060..56bbe1296f7 100644 --- a/fastdeploy/input/preprocess.py +++ b/fastdeploy/input/preprocess.py @@ -19,7 +19,6 @@ from fastdeploy.config import ErnieArchitectures, ModelConfig from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager from fastdeploy.reasoning import ReasoningParserManager -from fastdeploy.utils import envs from fastdeploy.utils import llm_logger as logger @@ -83,10 +82,7 @@ def create_processor(self): logger.info(f"Plugin input processor not available ({e}), using built-in processor") if not self.model_config.enable_mm: if not ErnieArchitectures.contains_ernie_arch(architecture): - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.text_processor import DataProcessor - else: - from fastdeploy.input.v1.text_processor import DataProcessor + from fastdeploy.input.text_processor import DataProcessor self.processor = DataProcessor( model_name_or_path=self.model_name_or_path, @@ -94,14 +90,7 @@ def create_processor(self): tool_parser_obj=tool_parser_obj, ) else: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.ernie4_5_processor import ( - Ernie4_5Processor, - ) - else: - from fastdeploy.input.v1.ernie4_5_processor import ( - Ernie4_5Processor, - ) + from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor self.processor = Ernie4_5Processor( model_name_or_path=self.model_name_or_path, @@ -110,14 +99,9 @@ def create_processor(self): ) else: if ErnieArchitectures.contains_ernie_arch(architecture): - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.ernie4_5_vl_processor import ( - Ernie4_5_VLProcessor, - ) - else: - from fastdeploy.input.v1.ernie4_5_vl_processor import ( - Ernie4_5_VLProcessor, - ) + from fastdeploy.input.ernie4_5_vl_processor import ( + Ernie4_5_VLProcessor, + ) self.processor = Ernie4_5_VLProcessor( model_name_or_path=self.model_name_or_path, @@ -128,14 +112,9 @@ def create_processor(self): enable_processor_cache=self.enable_processor_cache, ) elif "PaddleOCRVL" in architecture: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.paddleocr_vl_processor import ( - PaddleOCRVLProcessor, - ) - else: - from fastdeploy.input.v1.paddleocr_vl_processor import ( - PaddleOCRVLProcessor, - ) + from fastdeploy.input.paddleocr_vl_processor import ( + PaddleOCRVLProcessor, + ) self.processor = PaddleOCRVLProcessor( config=self.model_config, @@ -145,12 +124,7 @@ def create_processor(self): reasoning_parser_obj=reasoning_parser_obj, ) elif "Qwen2_5_VL" in architecture: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.qwen_vl_processor import QwenVLProcessor - else: - from fastdeploy.input.v1.qwen_vl_processor import ( - QwenVLProcessor, - ) + from fastdeploy.input.qwen_vl_processor import QwenVLProcessor self.processor = QwenVLProcessor( config=self.model_config, @@ -161,12 +135,7 @@ def create_processor(self): enable_processor_cache=self.enable_processor_cache, ) elif "Qwen3VL" in architecture: - if not envs.ENABLE_V1_DATA_PROCESSOR: - from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor - else: - from fastdeploy.input.v1.qwen3_vl_processor import ( - Qwen3VLProcessor, - ) + from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor self.processor = Qwen3VLProcessor( config=self.model_config, diff --git a/fastdeploy/input/v1/__init__.py b/fastdeploy/input/v1/__init__.py deleted file mode 100644 index f4ede90624a..00000000000 --- a/fastdeploy/input/v1/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" diff --git a/fastdeploy/input/v1/ernie4_5_processor.py b/fastdeploy/input/v1/ernie4_5_processor.py deleted file mode 100644 index f6545dc068a..00000000000 --- a/fastdeploy/input/v1/ernie4_5_processor.py +++ /dev/null @@ -1,641 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import os - -import numpy as np -from paddleformers.generation import GenerationConfig - -from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer -from fastdeploy.input.v1.text_processor import BaseDataProcessor -from fastdeploy.utils import data_processor_logger - -_SAMPLING_EPS = 1e-5 -from fastdeploy.input.utils import process_stop_token_ids - - -class Ernie4_5Processor(BaseDataProcessor): - """ - 初始化模型实例。 - - Args: - model_name_or_path (str): 模型名称或路径。 - - Attributes: - model_name_or_path (str): 存储模型名称或路径。 - decode_status (dict): 存储解码状态信息。 - tokenizer (object): 存储分词器实例。 - eos_token_ids (list): 存储结束符号的token ID列表。 - eos_token_id_len (int): 存储结束符号的token ID列表的长度。 - pad_token_id (int): 存储填充符号的token ID。 - """ - - def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None): - - self.model_name_or_path = model_name_or_path - data_processor_logger.info(f"model_name_or_path: {model_name_or_path}") - - # Generation config - try: - self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path) - except Exception as e: - data_processor_logger.warning( - f"Can't find generation config, so it will not use " - f"generation_config field in the model config, details={e}" - ) - self.generation_config = None - - self.decode_status = dict() - self.tool_parser_dict = dict() - self.thinking_parser_dict = dict() - self.model_status_dict = dict() - self._load_tokenizer() - data_processor_logger.info( - f"tokenizer information: bos_token is {self.tokenizer.bos_token} \ - {self.tokenizer.bos_token_id}, \ - eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} " - ) - try: - from paddleformers.trl.llm_utils import get_eos_token_id - except Exception: - from paddleformers.cli.utils.llm_utils import get_eos_token_id - - self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config) - self.eos_token_id_len = len(self.eos_token_ids) - self.pad_token_id = self.get_pad_id() - self.reasoning_parser = None - self.tool_parser_obj = tool_parser_obj - if reasoning_parser_obj: - self.reasoning_parser = reasoning_parser_obj(self.tokenizer) - - def process_request(self, request, max_model_len=None, **kwargs): - """ - Preprocess the request - - Args: - request (Dict): may contain text and messages fields - - Returns: - bool: Whether preprocessing is successful - str: error message - """ - data_processor_logger.info(f"Start processing request: {request}") - request = self._apply_default_parameters(request) - if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0: - request.eos_token_ids = self.eos_token_ids - - # processing stop_sequences and stop_token_ids - process_stop_token_ids(request, self.update_stop_seq) - - # processing bad_words - bad_words = request.get("bad_words") - bad_words_token_ids = request.get("bad_words_token_ids") - if bad_words: - bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) - request["bad_words_token_ids"] = bad_words_token_ids - - logits_processors_args = self._prepare_think_stop_sentence( - request.get("logits_processors_args") or {}, max_model_len - ) - request["logits_processors_args"] = logits_processors_args - - # processing prompt_token_ids - if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0: - if request.prompt is not None: - # prompt = request.prompt if request.prompt is not None else request.messages[0] - prompt = request.prompt - assert isinstance(prompt, str) or ( - isinstance(prompt, list) and all([isinstance(t, int) for t in prompt]) - ), f"prompt must be a string or a list of integers, but got {type(prompt)}" - - if isinstance(prompt, list): # if prompt is a token id list - request.prompt_token_ids = prompt - else: - tokens = self.tokenizer.tokenize(prompt) - token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - request.prompt_token_ids = token_ids - data_processor_logger.debug( - f"request_ids: {request.request_id}, prompt: {prompt}, " - f"tokens: {tokens}, token_ids: {token_ids}" - ) - elif request.messages is not None: - task = request.to_dict() - chat_template_kwargs = kwargs.get("chat_template_kwargs", {}) - if chat_template_kwargs: - if isinstance(chat_template_kwargs, dict): - for k, v in chat_template_kwargs.items(): - if k not in task or task[k] is None: - task[k] = v - else: - raise ValueError("Invalid input: chat_template_kwargs must be a dict") - request.prompt_token_ids = self.messages2ids(task, **chat_template_kwargs) - else: - raise ValueError(f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.") - - if len(request.prompt_token_ids) == 0: - raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs") - - # truncate prompts that exceed the length limit - if max_model_len is not None and len(request.prompt_token_ids) > max_model_len: - request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1] - logits_processors_args = self._update_thinking_prompt_state( - request.prompt_token_ids, request.get("logits_processors_args") or {} - ) - request["logits_processors_args"] = logits_processors_args - max_tokens = max_model_len - len(request.prompt_token_ids) - if request.get("max_tokens") is None: - request.set("max_tokens", max(1, max_tokens)) - else: - request.set("max_tokens", min(max_tokens, request.get("max_tokens"))) - if request.get("temperature") < _SAMPLING_EPS: - # zero temperature is equivalent to greedy sampling - request.set("temperature", 1) - request.set("top_k", 1) - if request.get("top_p") < _SAMPLING_EPS: - request.set("top_p", _SAMPLING_EPS) - request.set("top_k", 1) - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - parts = request.request_id.split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.get("n", 1) - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request.request_id] = model_status - request.enable_thinking = model_status == "think_start" - if request.get("response_max_tokens") is not None and request.enable_thinking is False: - request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"]) - - data_processor_logger.info(f"Processed request: {request}") - return request - - def process_request_dict(self, request, max_model_len=None, **kwargs): - """ - Preprocess the request - - Args: - request Request: may contain text and messages fields - - Returns: - bool: Whether preprocessing is successful - str: error message - """ - data_processor_logger.info(f"Start processing request: {request}") - request = self._apply_default_parameters(request) - if not request.eos_token_ids: - request.eos_token_ids = self.eos_token_ids - - # processing stop_sequences and stop_token_ids - process_stop_token_ids(request, self.update_stop_seq) - - # processing bad_words - bad_words = request.sampling_params.bad_words - bad_words_token_ids = request.sampling_params.bad_words_token_ids - if bad_words: - bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) - request.sampling_params.bad_words_token_ids = bad_words_token_ids - - logits_processors_args = self._prepare_think_stop_sentence( - getattr(request.sampling_params, "logits_processors_args", None) or {}, max_model_len - ) - request.sampling_params.logits_processors_args = logits_processors_args - - # processing prompt_token_ids - if not request.prompt_token_ids: - if request.prompt: - prompt = request.prompt - assert isinstance(prompt, str) or ( - isinstance(prompt, list) and all([isinstance(t, int) for t in prompt]) - ), f"prompt must be a string or a list of integers, but got {type(prompt)}" - if isinstance(prompt, list): # if prompt is a token id list - request.prompt_token_ids = prompt - else: - request.prompt_tokens = prompt - tokens = self.tokenizer.tokenize(prompt) - token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - request.prompt_token_ids = token_ids - req_id = request.request_id - data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}") - elif request.messages: - chat_template_kwargs = kwargs.get("chat_template_kwargs", {}) - if not chat_template_kwargs: - chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {} - if chat_template_kwargs: - if isinstance(chat_template_kwargs, dict): - for k, v in chat_template_kwargs.items(): - if not getattr(request, k, None): - setattr(request, k, v) - else: - raise ValueError("Invalid input: chat_template_kwargs must be a dict") - if getattr(request, "enable_thinking") is None: - setattr(request, "enable_thinking", True) - request.prompt_token_ids = self.messages2ids(request, **chat_template_kwargs) - delattr(request, "chat_template_kwargs") - else: - raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}") - - if len(request.prompt_token_ids) == 0: - raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs") - - # truncate prompts that exceed the length limit - if max_model_len is not None and len(request.prompt_token_ids) > max_model_len: - request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1] - logits_processors_args = self._update_thinking_prompt_state( - request.prompt_token_ids, getattr(request.sampling_params, "logits_processors_args", None) or {} - ) - request.sampling_params.logits_processors_args = logits_processors_args - max_tokens = max_model_len - len(request.prompt_token_ids) - if getattr(request.sampling_params, "max_tokens", None) is None: - request.sampling_params.max_tokens = max(1, max_tokens) - else: - request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens) - if request.sampling_params.temperature < _SAMPLING_EPS: - # zero temperature is equivalent to greedy sampling - request.sampling_params.temperature = 1 - request.sampling_params.top_k = 1 - if request.sampling_params.top_p < _SAMPLING_EPS: - request.sampling_params.top_p = _SAMPLING_EPS - request.sampling_params.top_k = 1 - - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - parts = request.request_id.split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.sampling_params.n or 1 - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request.request_id] = model_status - request.enable_thinking = model_status == "think_start" - if request.sampling_params.response_max_tokens is not None and request.enable_thinking is False: - request.sampling_params.max_tokens = min( - request.sampling_params.response_max_tokens, request.sampling_params.max_tokens - ) - - data_processor_logger.info(f"Processed request: {request}") - return request - - def process_response(self, response_dict, **kwargs): - """ - Preprocess the response - - Args: - response_dict (Dict): response for engine, contain ids fields - - Returns: - Dict: response contain text fields - """ - req_id = response_dict.request_id - token_ids = response_dict.outputs.token_ids - - response_dict.usage = {"completion_tokens": response_dict.outputs.index + 1} - if token_ids[-1] == self.tokenizer.eos_token_id: - token_ids = token_ids[:-1] - full_text = self.tokenizer.decode(token_ids) - if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, - response_dict, - self.model_status_dict[req_id], - ) - response_dict.outputs.text = text - response_dict.outputs.reasoning_content = reasoning_content - else: - response_dict.outputs.text = full_text - if self.tool_parser_obj: - tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) - if tool_call_info.tools_called: - response_dict.outputs.tool_calls = tool_call_info.tool_calls - response_dict.outputs.text = tool_call_info.content - if req_id in self.model_status_dict: - del self.model_status_dict[req_id] - data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") - if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "": - return None - return response_dict - - def process_response_dict(self, response_dict, stream, **kwargs): - """ - Preprocess the response - - Args: - response_dict (Dict): response for engine, contain ids fields - - Returns: - Dict: response contain text fields - """ - if stream: - return self.process_response_obj_streaming(response_dict, **kwargs) - else: - return self.process_response_obj_normal(response_dict, **kwargs) - - def process_response_obj_normal(self, response_obj, **kwargs): - """ - Preprocess the response - - Args: - response_obj : response for engine, contain ids fields - - Returns: - Dict: response contain text fields - """ - token_ids = response_obj.outputs.token_ids - is_end = response_obj.finished - req_id = response_obj.request_id - request = kwargs.get("request", None) - if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"): - if token_ids[-1] == self.tokenizer.eos_token_id: - token_ids = token_ids[:-1] - delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) - if is_end: - full_text = previous_texts + delta_text - response_obj.outputs.text = full_text - if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, - request, - self.model_status_dict[req_id], - ) - response_obj.outputs.text = text - response_obj.outputs.reasoning_content = reasoning_content - reasoning_tokens = self.tokenizer.tokenize(reasoning_content) - response_obj.outputs.reasoning_token_num = len(reasoning_tokens) - if self.tool_parser_obj: - tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, request) - if tool_call_info.tools_called: - response_obj.outputs.tool_calls = tool_call_info.tool_calls - response_obj.outputs.text = tool_call_info.content - response_obj.outputs.completion_tokens = full_text - data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") - del self.decode_status[req_id] - if req_id in self.model_status_dict: - del self.model_status_dict[req_id] - return response_obj - - def process_response_obj_streaming(self, response_obj, **kwargs): - """ - Preprocess the response streaming - - Args: - response_obj : response for engine, contain ids fields - - Returns: - Dict: response contain text fields - """ - token_ids = response_obj.outputs.token_ids - is_end = response_obj.finished - req_id = response_obj.request_id - request = kwargs.get("request", None) - - if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"): - if token_ids[-1] == self.tokenizer.eos_token_id: - token_ids = token_ids[:-1] - delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) - response_obj.outputs.completion_tokens = delta_text - if self.reasoning_parser: - reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming( - previous_texts, - previous_texts + delta_text, - delta_text, - previous_token_ids, - previous_token_ids + token_ids, - token_ids, - self.model_status_dict[req_id], - ) - response_obj.outputs.delta_message = reasoning_delta_message - reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None - reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else [] - response_obj.outputs.reasoning_token_num = len(reasoning_tokens) - response_obj.outputs.reasoning_token_num = len(reasoning_tokens) - response_obj.outputs.reasoning_content = reasoning_content - response_obj.outputs.text = ( - reasoning_delta_message.content or "" - if reasoning_delta_message and hasattr(reasoning_delta_message, "content") - else "" - ) - else: - response_obj.outputs.text = delta_text - if self.tool_parser_obj: - if req_id not in self.tool_parser_dict: - self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer) - tool_parser = self.tool_parser_dict[req_id] - tool_call_delta_message = tool_parser.extract_tool_calls_streaming( - previous_texts, - previous_texts + delta_text, - delta_text, - previous_token_ids, - previous_token_ids + token_ids, - token_ids, - request, - ) - if tool_call_delta_message is None or tool_call_delta_message.tool_calls: - response_obj.outputs.delta_message = tool_call_delta_message - - if is_end: - data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") - del self.decode_status[req_id] - if req_id in self.tool_parser_dict: - del self.tool_parser_dict[req_id] - if req_id in self.model_status_dict: - del self.model_status_dict[req_id] - return response_obj - - def messages2ids(self, request_or_messages, **kwargs): - """ - Convert multi-turn messages into ID sequences. - - Args: - request_or_messages: Either a request dict containing 'messages' field, - or a list of message dicts directly - - Returns: - List of token IDs as strings (converted from token objects) - """ - if self.tokenizer.chat_template is None: - raise ValueError("This model does not support chat_template.") - message_dict = { - key: getattr(request_or_messages, key, None) - for key in ["messages", "tools", "documents", "enable_thinking", "system"] - if getattr(request_or_messages, key, None) is not None - } - spliced_message = self.tokenizer.apply_chat_template( - message_dict, - tokenize=False, - split_special_tokens=False, - add_special_tokens=False, - **kwargs, - ) - request_or_messages.prompt_tokens = spliced_message - req_id = getattr(request_or_messages, "request_id", None) - tokens = self.tokenizer.tokenize(spliced_message) - token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}") - return token_ids - - def ids2tokens(self, token_id, task_id): - """ - token ids to strings - - Args: - token_ids (List[int]): token ids - task_id (str): task id - - Returns: - List[str]: strings - """ - - if task_id not in self.decode_status: - # prefix offset & read offset & history token ids & history token strings - self.decode_status[task_id] = [0, 0, [], ""] - - prefix_offset = self.decode_status[task_id][0] - read_offset = self.decode_status[task_id][1] - previous_token_ids = self.decode_status[task_id][2] - previous_texts = self.decode_status[task_id][3] - decode_str, prefix_offset, read_offset = self.tokenizer.decode_token( - previous_token_ids + token_id, prefix_offset, read_offset - ) - self.decode_status[task_id][0] = prefix_offset - self.decode_status[task_id][1] = read_offset - self.decode_status[task_id][2] += token_id - self.decode_status[task_id][3] += decode_str - - return decode_str, previous_token_ids, previous_texts - - def _load_tokenizer(self): - """ - load tokenizer - - Returns: - tokenizer (AutoTokenizer) - """ - vocab_file_names = [ - "tokenizer.model", - "spm.model", - "ernie_token_100k.model", - ] - for i in range(len(vocab_file_names)): - if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])): - Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i] - break - self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path) - - def get_pad_id(self): - """ - get pad_token_id, if not pad_token_id, use eos_token - - Returns: - int: pad_token_id - """ - # if isinstance(self.tokenizer, (LlamaTokenizer, Llama3Tokenizer)) and not self.tokenizer.pad_token_id: - # return self.tokenizer.eos_token - return self.tokenizer.pad_token_id - - def pad_batch_data( - self, - insts, - pad_id=0, - return_seq_len=False, - return_array=True, - pad_style="right", - ): - """Pad the instances to the max sequence length in batch.""" - if len(insts) == 0: - padded_insts = np.array([[]], dtype=np.int64) if return_array else [[]] - if return_seq_len: - seq_len = np.array([], dtype=np.int64) if return_array else [] - return padded_insts, seq_len - return padded_insts - - max_len = max(map(len, insts)) - if pad_style == "left": - padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts] - else: - padded_insts = [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts] - if return_array: - padded_insts = np.array(padded_insts, dtype=np.int64).reshape([-1, max_len]) - - if return_seq_len: - seq_len = [len(inst) for inst in insts] - if return_array: - seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1) - return padded_insts, seq_len - return padded_insts - - def update_stop_seq(self, stop_sequences): - """ - Update stop sequences from request. - """ - stop_seqs = [] - if isinstance(stop_sequences, str): - stop_sequences = [stop_sequences] - for seq in stop_sequences: - if seq != self.tokenizer.eos_token_id: - stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq))) - stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False) - data_processor_logger.debug(f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}") - return stop_seqs, stop_seqs_len - - def process_logprob_response(self, token_ids, **kwargs): - full_text = self.tokenizer.decode(token_ids, **kwargs) - return full_text - - def update_bad_words(self, bad_words, bad_words_token_ids): - """Support bad words""" - - token_ids = bad_words_token_ids - - if token_ids is None: - token_ids = [] - for bad_word in bad_words: - # To prohibit words both at the beginning - # and in the middle of text - # (related to add_prefix_space tokenizer parameter) - for add_prefix_space in [False, True]: - prefix = " " if add_prefix_space else "" - prompt = prefix + bad_word.lstrip() - prompt_token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(prompt)) - data_processor_logger.debug(f"processed bad_words: {prompt}, {prompt_token_ids}") - - if len(prompt_token_ids) != 1: - if not add_prefix_space: - data_processor_logger.warning( - f"Skip bad_words: <{prompt}>." - f"Bad words should be a single token." - f"Got tokens: {prompt_token_ids}." - ) - continue - - if prompt_token_ids[0] > self.tokenizer.vocab_size: - if not add_prefix_space: - data_processor_logger.warning( - f"Skip bad_words: <{prompt}>." - f"All token id values should be satisfying:" - f" 0 <= token_id < {self.tokenizer.vocab_size}." - f"Got token: {prompt_token_ids}." - ) - continue - - if prompt_token_ids not in token_ids: - token_ids.extend(prompt_token_ids) - return token_ids diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/__init__.py b/fastdeploy/input/v1/ernie4_5_vl_processor/__init__.py deleted file mode 100644 index f7d30a78d58..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from .ernie4_5_vl_processor import Ernie4_5_VLProcessor -from .process import DataProcessor, fancy_print -from .process_video import read_video_decord -from .utils.video_utils import VideoReaderWrapper - -__all__ = [ - "DataProcessor", - "fancy_print", - "VideoReaderWrapper", - "read_video_decord", - "Ernie4_5_VLProcessor", -] diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/v1/ernie4_5_vl_processor/ernie4_5_vl_processor.py deleted file mode 100644 index bae80b60a96..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ /dev/null @@ -1,340 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import traceback - -import numpy as np -from paddleformers.generation import GenerationConfig - -from fastdeploy.engine.request import Request -from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids -from fastdeploy.input.v1.ernie4_5_processor import Ernie4_5Processor -from fastdeploy.utils import data_processor_logger - -from .process import DataProcessor - -_SAMPLING_EPS = 1e-5 - - -class Ernie4_5_VLProcessor(Ernie4_5Processor): - """The processor class for ERNIE MoE VL models.""" - - def __init__( - self, - model_name_or_path, - limit_mm_per_prompt=None, - mm_processor_kwargs=None, - reasoning_parser_obj=None, - tool_parser_obj=None, - enable_processor_cache=False, - ): - data_processor_logger.info(f"model_name_or_path: {model_name_or_path}") - tokenizer_path = model_name_or_path - preprocessor_path = model_name_or_path - processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs) - - self.ernie4_5_processor = DataProcessor( - tokenizer_name=tokenizer_path, - image_preprocessor_name=preprocessor_path, - enable_processor_cache=enable_processor_cache, - **processor_kwargs, - ) - self.ernie4_5_processor.eval() - self.image_patch_id = self.ernie4_5_processor.image_patch_id - self.spatial_conv_size = self.ernie4_5_processor.spatial_conv_size - - self.tool_parser_dict = dict() - self.decode_status = dict() - self.model_status_dict = dict() - self._load_tokenizer() - - # Generation config - try: - self.generation_config = GenerationConfig.from_pretrained(model_name_or_path) - except Exception as e: - data_processor_logger.warning( - f"Can't find generation config: {e}, so it will not use generation_config field in the model config" - ) - self.generation_config = None - - # self.eos_token_ids = [self.tokenizer.eos_token_id] - try: - from paddleformers.trl.llm_utils import get_eos_token_id - except Exception: - from paddleformers.cli.utils.llm_utils import get_eos_token_id - - self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config) - self.eos_token_id_len = len(self.eos_token_ids) - self.pad_token_id = self.get_pad_id() - self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt) - self.reasoning_parser = None - if reasoning_parser_obj: - self.reasoning_parser = reasoning_parser_obj(self.tokenizer) - self.tool_parser_obj = tool_parser_obj - - def get_pad_id(self): - """get pad id""" - return self.tokenizer.pad_token_id - - def _load_tokenizer(self): - """ - load tokenizer - - Returns: - tokenizer (AutoTokenizer) - """ - self.tokenizer = self.ernie4_5_processor.tokenizer - - def _apply_default_parameters(self, request): - """ - Apply default value for parameters in request - """ - - def set_value(req, key, value): - value = getattr(self.generation_config, key, value) - if getattr(req.sampling_params, key) is None: - setattr(req.sampling_params, key, value) - - set_value(request, "top_p", 0.7) - set_value(request, "temperature", 1.0) - set_value(request, "repetition_penalty", 1.0) - set_value(request, "frequency_penalty", 0.0) - set_value(request, "presence_penalty", 0.0) - return request - - def _parse_processor_kwargs(self, kwargs): - """解析多模态处理器参数配置""" - if not kwargs: - return {} - - try: - if not isinstance(kwargs, dict): - raise ValueError("mm-processor-kwargs must be a dictionary") - - # 验证参数类型 - data_processor_logger.info(f"kwargs:{kwargs}") - expected_types = { - "spatial_conv_size": int, - "temporal_conv_size": int, - "image_min_pixels": int, - "image_max_pixels": int, - "video_min_pixels": int, - "video_max_pixels": int, - "video_target_frames": int, - "video_frames_sample": str, - "video_max_frames": int, - "video_min_frames": int, - "video_fps": int, - } - - for key, value in kwargs.items(): - if key in expected_types and not isinstance(value, expected_types[key]): - raise ValueError( - f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}" - ) - - return kwargs - - except Exception as e: - data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}, {str(traceback.format_exc())}") - return {} - - def _parse_limits(self, limits): - """解析多模态限制配置""" - DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1} - - if not limits: - return DEFAULT_LIMITS - - try: - if not isinstance(limits, dict): - raise ValueError("limit-mm-per-prompt must be a dictionary") - data_processor_logger.info(f"_parse_limits:{limits}") - return {**DEFAULT_LIMITS, **limits} - except Exception as e: - data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits") - return DEFAULT_LIMITS - - def _check_mm_limits(self, item): - if isinstance(item, dict): - # 请求包含prompt和multi_modal_data - mm_data = item - else: - # 请求包含messages - mm_data = {"image": [], "video": []} - - for message in item: - if isinstance(message.get("content"), list): - for part in message["content"]: - if part.get("type") == "image": - mm_data["image"].append(part) - elif part.get("type") == "video": - mm_data["video"].append(part) - - for modality, data in mm_data.items(): - if modality in self.limit_mm_per_prompt: - limit = self.limit_mm_per_prompt[modality] - if len(data) > limit: - raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}") - - def process_request(self, request, max_model_len=None, **kwargs): - """process the input data""" - task = request.to_dict() - task["chat_template_kwargs"] = kwargs.get("chat_template_kwargs") - self.process_request_dict(task, max_model_len) - request = Request.from_dict(task) - request = self._apply_default_parameters(request) - - return request - - def process_request_dict(self, request, max_model_len=None, **kwargs): - """process the input data""" - - request = self._apply_default_parameters(request) - if not request.eos_token_ids: - request.eos_token_ids = self.eos_token_ids - - # processing stop_sequences and stop_token_ids - process_stop_token_ids(request, self.update_stop_seq) - - bad_words = request.sampling_params.bad_words - bad_words_token_ids = request.sampling_params.bad_words_token_ids - if bad_words: - bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) - request.sampling_params.bad_words_token_ids = bad_words_token_ids - - logits_processors_args = self._prepare_think_stop_sentence( - getattr(request.sampling_params, "logits_processors_args", None) or {}, max_model_len - ) - request.sampling_params.logits_processors_args = logits_processors_args - - if request.prompt_token_ids: - messages = request.messages - if messages: - self._check_mm_limits(messages) - if getattr(request, "enable_thinking") is None: - setattr(request, "enable_thinking", True) - outputs = self.ernie4_5_processor.prompt_token_ids2outputs(request) - elif request.prompt: - multimodal_data = request.multimodal_data - if multimodal_data is None: - multimodal_data = {} - self._check_mm_limits(multimodal_data) - images = multimodal_data.get("image", None) - videos = multimodal_data.get("video", None) - request.prompt_tokens = request.prompt - outputs = self.ernie4_5_processor.text2ids(request.prompt, images, videos) - elif request.messages: - messages = request.messages - self._check_mm_limits(messages) - chat_template_kwargs = kwargs.get("chat_template_kwargs", {}) - if not chat_template_kwargs: - chat_template_kwargs = request.chat_template_kwargs - if chat_template_kwargs: - if isinstance(chat_template_kwargs, dict): - for k, v in chat_template_kwargs.items(): - if getattr(request, k, None) is None: - setattr(request, k, v) - else: - raise ValueError("Invalid input: chat_template_kwargs must be a dict") - if getattr(request, "enable_thinking") is None: - setattr(request, "enable_thinking", True) - outputs = self.ernie4_5_processor.request2ids(request) - delattr(request, "chat_template_kwargs") - else: - raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") - - if request.completion_token_ids: - self.append_completion_tokens(outputs, request.completion_token_ids) - - outputs = self.pack_outputs(outputs) - request.prompt_token_ids = ( - outputs["input_ids"].tolist() - if not getattr(request, "prompt_token_ids", None) - else request.prompt_token_ids - ) - request.prompt_token_ids_len = len(request.prompt_token_ids) - request.multimodal_inputs = outputs - - # 截断超过长度限制的prompt - if max_model_len is not None and len(request.prompt_token_ids) > max_model_len: - request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1] - logits_processors_args = self._update_thinking_prompt_state( - request.prompt_token_ids, getattr(request.sampling_params, "logits_processors_args", None) or {} - ) - request.sampling_params.logits_processors_args = logits_processors_args - - max_tokens = max_model_len - len(request.prompt_token_ids) - if getattr(request.sampling_params, "max_tokens", None) is None: - request.sampling_params.max_tokens = max(1, max_tokens) - else: - request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens) - if request.sampling_params.reasoning_max_tokens is None: - request.sampling_params.reasoning_max_tokens = max(int(request.sampling_params.max_tokens * 0.8), 1) - request.reasoning_max_tokens = request.sampling_params.reasoning_max_tokens - data_processor_logger.info(f"Processed request {request}") - - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - parts = request.request_id.split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.sampling_params.n or 1 - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request.request_id] = model_status - request.enable_thinking = model_status == "think_start" - if request.sampling_params.top_p is not None and request.sampling_params.top_p < _SAMPLING_EPS: - request.sampling_params.top_p = _SAMPLING_EPS - request.sampling_params.top_k = 1 - if request.sampling_params.response_max_tokens is not None and request.enable_thinking is False: - request.sampling_params.max_tokens = min( - request.sampling_params.response_max_tokens, request.sampling_params.max_tokens - ) - return request - - def append_completion_tokens(self, multimodal_inputs, completion_token_ids): - "append already completion tokens" - - num_tokens = len(completion_token_ids) - multimodal_inputs["input_ids"].extend(completion_token_ids) - multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens) - - start = multimodal_inputs["cur_position"] - for i in range(num_tokens): - multimodal_inputs["position_ids"].append([start + i] * 3) - multimodal_inputs["cur_position"] += num_tokens - - def pack_outputs(self, outs): - # Stack or nullify image-related fields - if not outs["images"]: - outs["images"] = None - outs["grid_thw"] = None - outs["image_type_ids"] = None - else: - outs["images"] = np.vstack(outs["images"]) - outs["grid_thw"] = np.vstack(outs["grid_thw"]) - outs["image_type_ids"] = np.array(outs["image_type_ids"]) - - outs["image_patch_id"] = self.image_patch_id - # Convert lists to arrays - outs["input_ids"] = np.array(outs["input_ids"], dtype=np.int64) - outs["token_type_ids"] = np.array(outs["token_type_ids"], dtype=np.int64) - outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64) - outs["mm_num_token_func"] = self.ernie4_5_processor.mm_num_tokens - return outs diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/__init__.py b/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/__init__.py deleted file mode 100644 index c11444e6758..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from .get_image_preprocessor import get_image_preprocessor -from .image_preprocessor_adaptive import AdaptiveImageProcessor - -__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"] diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py b/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py deleted file mode 100644 index 0ff6f7d1ed5..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/get_image_preprocessor.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -"""get image preprocessor""" - -from fastdeploy.utils import data_processor_logger - -from .image_preprocessor_adaptive import AdaptiveImageProcessor - - -def get_image_preprocessor(args): - """ - get_image_preprocessor from args - """ - - if args.vision_model_name_or_path is None: - return None - - data_processor_logger.info("use AdaptiveImageProcessor") - image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path) - return image_preprocess diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py b/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py deleted file mode 100644 index 6dcdf3a4e96..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py +++ /dev/null @@ -1,587 +0,0 @@ -""" -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -"""image preprocessor adaptive""" - -import math -from typing import List, Optional, Union - -import numpy as np -import paddle -import PIL -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_transforms import ( - convert_to_rgb, - normalize, - rescale, - resize, - to_channel_dimension_format, -) -from paddleformers.transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - is_valid_image, - make_list_of_images, - to_numpy_array, - valid_images, -) -from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType -from PIL import Image - -from fastdeploy.utils import data_processor_logger - -OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] - -IMAGE_FACTOR = 28 -MIN_PIXELS = 4 * 28 * 28 -MAX_PIXELS = 16384 * 28 * 28 -MAX_RATIO = 200 - - -VideoInput = Union[ - List["PIL.Image.Image"], - "np.ndarray", - "paddle.Tensor", - List["np.ndarray"], - List["paddle.Tensor"], - List[List["PIL.Image.Image"]], - List[List["np.ndarrray"]], - List[List["paddle.Tensor"]], -] - - -__all__ = [ - "AdaptiveImageProcessor", -] - - -def is_scaled_image(image: np.ndarray) -> bool: - """ - Checks to see whether the pixel values have already been rescaled to [0, 1]. - """ - if image.dtype == np.uint8: - return False - - # It's possible the image has pixel values in [0, 255] but is of floating type - return np.min(image) >= 0 and np.max(image) <= 1 - - -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched images from {images}") - - -# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos -def make_batched_videos(videos) -> List[VideoInput]: - """dummy""" - if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): - return videos - - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): - if isinstance(videos[0], Image.Image): - return [videos] - elif len(videos[0].shape) == 4: - return [list(video) for video in videos] - - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] - - raise ValueError(f"Could not make batched video from {videos}") - - -class AdaptiveImageProcessor(BaseImageProcessor): - r""" - Constructs a adaptive image processor that dynamically resizes images based on the original images. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): - Resampling filter to use when resizing the image. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): - Mean to use if normalizing the image. This is a float or list of floats for each channel in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): - Standard deviation to use if normalizing the image. This is a float or list of floats for each channel - in the image. - do_convert_rgb (`bool`, *optional*, defaults to `True`): - Whether to convert the image to RGB. - min_pixels (`int`, *optional*, defaults to `56 * 56`): - The min pixels of the image to resize the image. - max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`): - The max pixels of the image to resize the image. - patch_size (`int`, *optional*, defaults to 14): - The spacial patch size of the vision encoder. - temporal_conv_size (`int`, *optional*, defaults to 2): - The temporal conv size in resampler. - merge_size (`int`, *optional*, defaults to 2): - The merge size of the vision encoder to llm encoder. - """ - - model_input_names = [ - "pixel_values", - "image_grid_thw", - "pixel_values_videos", - "video_grid_thw", - ] - - def __init__( - self, - do_resize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - do_rescale: bool = True, - rescale_factor: float = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - min_pixels: int = 56 * 56, - max_pixels: int = 28 * 28 * 1280, - patch_size: int = 14, - temporal_conv_size: int = 2, - merge_size: int = 2, - **kwargs, - ) -> None: - """init""" - super().__init__(**kwargs) - self.do_resize = do_resize - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN - self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD - self.min_pixels = min_pixels - self.max_pixels = max_pixels - self.patch_size = patch_size - self.temporal_conv_size = temporal_conv_size - self.merge_size = merge_size - self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} - self.do_convert_rgb = do_convert_rgb - - def set_pixels(self, min_pixels=None, max_pixels=None, msg=""): - """设定pixels""" - if min_pixels is not None: - assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int" - data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}") - self.min_pixels = min_pixels - self.size["min_pixels"] = int(min_pixels) - if max_pixels is not None: - assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int" - data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}") - self.max_pixels = max_pixels - self.size["max_pixels"] = int(max_pixels) - - def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None): - """dummy""" - actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels - actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=actual_min_pixels, - max_pixels=actual_max_pixels, - ) - return (resized_height, resized_width), ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - do_resize: bool = True, - resample: PILImageResampling = None, - do_rescale: bool = True, - rescale_factor: float = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = False, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - predetermined_grid_thw=None, - ): - """ - Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. - - Args: - images (`ImageInput`): - Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. - If pixel values range from 0 to 1, set `do_rescale=False`. - vision_info (`List[Dict]`, *optional*): - Optional list of dictionaries containing additional information about vision inputs. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - resample (`PILImageResampling`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Scale factor to use if rescaling the image. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Mean to use if normalizing the image. - Can be a float or a list of floats corresponding to the number of channels in the image. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Standard deviation to use if normalizing the image. - Can be a float or a list of floats corresponding to the number of channels in the image. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - """ - images = make_list_of_images(images) - - if do_convert_rgb: - images = [convert_to_rgb(image) for image in images] - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - data_processor_logger.warning( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = height, width - processed_images = [] - - if predetermined_grid_thw is not None: - assert len(predetermined_grid_thw) == len( - images - ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}" - - for img_idx, image in enumerate(images): - if do_resize: - if predetermined_grid_thw is not None: - (resized_height, resized_width) = predetermined_grid_thw[img_idx] - resized_height *= self.patch_size - resized_width *= self.patch_size - else: - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=self.min_pixels, - max_pixels=self.max_pixels, - ) - image = image.astype("uint8") # TODO : 需要手动加上,否则多除255 导致结果会出错 - # 直接fromarray,不要靠paddleformers里面的 - image = Image.fromarray(image) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - data_format=input_data_format, - ) - if do_rescale: - image = rescale(image, scale=rescale_factor, data_format=input_data_format) - - if do_normalize: - image = normalize( - image=image, - mean=image_mean, - std=image_std, - data_format=input_data_format, - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] - - processed_images.append(image) - patches = np.array(processed_images) - if data_format == ChannelDimension.LAST: - patches = patches.transpose([0, 3, 1, 2]) - - channel = patches.shape[1] # [time, C, H, W] - grid_t = patches.shape[0] - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - patches = patches.reshape( - [ - grid_t, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ] - ) - # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz] - patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7]) - - flatten_patches = patches.reshape( - [ - grid_t * grid_h * grid_w, - channel * self.patch_size * self.patch_size, - ] - ) # [grid_t * grid_h * grid_w, C * psz * psz] - - return flatten_patches, (grid_t, grid_h, grid_w) - - def preprocess( - self, - images: ImageInput, - videos: VideoInput = None, - do_resize: bool = True, - size: Optional[Union[int, List[int]]] = None, - resample: PILImageResampling = None, - do_rescale: bool = True, - rescale_factor: float = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = False, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - predetermined_grid_thw=None, - ): - """ - Args: - images (`ImageInput`): - Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - videos (`VideoInput`): - Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If - passing in videos with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with - the longest edge resized to keep the input aspect ratio. - resample (`int`, *optional*, defaults to `self.resample`): - Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only - has an effect if `do_resize` is set to `True`. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to - `True`. - do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): - Whether to convert the image to RGB. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Can be one of: - - Unset: Return a list of `np.ndarray`. - - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`. - - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - Unset: Use the channel dimension format of the input image. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - - """ - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - resample = resample if resample is not None else self.resample - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - if images is not None: - images = make_batched_images(images) - if videos is not None: - videos = make_batched_videos(videos) - - if images is not None and not valid_images(images): - raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") - - if images is not None: - pixel_values, vision_grid_thws = [], [] - for img_idx, image in enumerate(images): - if predetermined_grid_thw is not None: - predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]] - else: - predetermined_grid_thw_one = None - patches, image_grid_thw = self._preprocess( - image, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - predetermined_grid_thw=predetermined_grid_thw_one, - ) - pixel_values.extend(patches) - vision_grid_thws.append(image_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - data = { - "pixel_values": pixel_values, - "image_grid_thw": vision_grid_thws, - } - - if videos is not None: - pixel_values, vision_grid_thws = [], [] - for images in videos: - patches, video_grid_thw = self._preprocess( - images, - do_resize=do_resize, - resample=resample, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - data_format=data_format, - do_convert_rgb=do_convert_rgb, - input_data_format=input_data_format, - predetermined_grid_thw=predetermined_grid_thw, - ) - pixel_values.extend(patches) - vision_grid_thws.append(video_grid_thw) - pixel_values = np.array(pixel_values) - vision_grid_thws = np.array(vision_grid_thws) - - data = { - "pixel_values_videos": pixel_values, - "video_grid_thw": vision_grid_thws, - } - - return BatchFeature(data=data, tensor_type=return_tensors) - - -def round_by_factor(number: int, factor: int) -> int: - """Returns the closest integer to 'number' that is divisible by 'factor'.""" - return round(number / factor) * factor - - -def ceil_by_factor(number: int, factor: int) -> int: - """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" - return math.ceil(number / factor) * factor - - -def floor_by_factor(number: int, factor: int) -> int: - """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" - return math.floor(number / factor) * factor - - -def smart_resize( - height: int, - width: int, - factor: int = IMAGE_FACTOR, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, -): - """ - Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. The aspect ratio of the image is maintained as closely as possible. - """ - if max(height, width) / min(height, width) > MAX_RATIO: - if height > width: - new_width = max(factor, round_by_factor(width, factor)) - new_height = floor_by_factor(new_width * MAX_RATIO, factor) - else: - new_height = max(factor, round_by_factor(height, factor)) - new_width = floor_by_factor(new_height * MAX_RATIO, factor) - - data_processor_logger.info( - f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\ - resize to {max(new_height, new_width) / min(new_height, new_width)}" - ) - - height = new_height - width = new_width - - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - - if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: - raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") - - return h_bar, w_bar diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/process.py b/fastdeploy/input/v1/ernie4_5_vl_processor/process.py deleted file mode 100644 index d8b90e54d57..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/process.py +++ /dev/null @@ -1,751 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -""" process.py """ -import copy -import os -import pickle -from collections import defaultdict -from typing import Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import zmq -from paddleformers.transformers.image_utils import ChannelDimension -from PIL import Image - -from fastdeploy.engine.request import ImagePosition, Request -from fastdeploy.entrypoints.chat_utils import parse_chat_messages -from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer -from fastdeploy.input.mm_data_processor import MMBaseDataProcessor -from fastdeploy.input.utils import IDS_TYPE_FLAG -from fastdeploy.multimodal.hasher import MultimodalHasher -from fastdeploy.utils import data_processor_logger - -from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor -from .process_video import read_frames_decord, read_video_decord -from .utils.render_timestamp import render_frame_timestamp - - -def fancy_print(input_ids, tokenizer, image_patch_id=None): - """ - input_ids: input_ids - tokenizer: the tokenizer of models - """ - i = 0 - res = "" - text_ids = [] - real_image_token_len = 0 - while i < len(input_ids): - if input_ids[i] == image_patch_id: - if len(text_ids) > 0: - res += tokenizer.decode(text_ids) - text_ids = [] - - real_image_token_len += 1 - else: - if real_image_token_len != 0: - res += f"<|IMAGE@{real_image_token_len}|>" - real_image_token_len = 0 - - text_ids.append(input_ids[i]) - - i += 1 - if len(text_ids) > 0: - - res += tokenizer.decode(text_ids) - text_ids = [] - return res - - -class DataProcessor(MMBaseDataProcessor): - """ - Processes multimodal chat messages into model-ready inputs, - handling text, images, and videos with 3D positional embeddings. - """ - - CLS_TOKEN = "<|begin_of_sentence|>" - SEP_TOKEN = "<|end_of_sentence|>" - EOS_TOKEN = "" - IMG_START = "<|IMAGE_START|>" - IMG_END = "<|IMAGE_END|>" - VID_START = "<|VIDEO_START|>" - VID_END = "<|VIDEO_END|>" - - def __init__( - self, - tokenizer_name: str, - image_preprocessor_name: str, - enable_processor_cache: bool = False, - spatial_conv_size: int = 2, - temporal_conv_size: int = 2, - image_min_pixels: int = 4 * 28 * 28, - image_max_pixels: int = 6177 * 28 * 28, - video_min_pixels: int = 299 * 28 * 28, - video_max_pixels: int = 1196 * 28 * 28, - video_target_frames: int = -1, - video_frames_sample: str = "leading", - video_max_frames: int = 180, - video_min_frames: int = 16, - video_fps: int = 2, - **kwargs, - ) -> None: - super().__init__() - # Tokenizer and image preprocessor - self.model_name_or_path = tokenizer_name - self._load_tokenizer() - self.tokenizer.ignored_index = -100 - self.image_preprocessor = AdaptiveImageProcessor.from_pretrained(image_preprocessor_name) - self.enable_processor_cache = enable_processor_cache - - # Convolution sizes for patch aggregation - self.spatial_conv_size = spatial_conv_size - self.temporal_conv_size = temporal_conv_size - - # Pixel constraints - self.image_min_pixels = image_min_pixels - self.image_max_pixels = image_max_pixels - self.video_min_pixels = video_min_pixels - self.video_max_pixels = video_max_pixels - - # Video sampling parameters - self.target_frames = video_target_frames - self.frames_sample = video_frames_sample - self.max_frames = video_max_frames - self.min_frames = video_min_frames - self.fps = video_fps - - # Special tokens and IDs - self.cls_token = self.CLS_TOKEN - self.sep_token = self.SEP_TOKEN - self.eos_token = self.EOS_TOKEN - self.image_start = self.IMG_START - self.image_end = self.IMG_END - self.video_start = self.VID_START - self.video_end = self.VID_END - self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>") - self.image_start_id = self.tokenizer.convert_tokens_to_ids(self.image_start) - self.image_end_id = self.tokenizer.convert_tokens_to_ids(self.image_end) - self.video_start_id = self.tokenizer.convert_tokens_to_ids(self.video_start) - self.video_end_id = self.tokenizer.convert_tokens_to_ids(self.video_end) - self.sep_token_id = self.tokenizer.convert_tokens_to_ids(self.sep_token) - self.eos_token_id = self.tokenizer.convert_tokens_to_ids(self.eos_token) - - self.token_type_mapping = self._build_token_type_mapping() - self.is_training = True - self.role_prefixes = { - "system": "", - "user": "User: ", - "bot": "Assistant: ", - "assistant": "Assistant: ", - "tool": "Tool: ", - } - - @staticmethod - def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]: - """ - Calculate the number of tokens in the multimodal input. - """ - if isinstance(grid_thw, paddle.Tensor): - grid_thw = grid_thw.numpy() - - if len(grid_thw) == 0: - return 0 - - def calc_one(thw): - t, h, w = map(int, thw) - if t == 1: - return t * h * w // 4 - else: - return t * h * w // 4 // 2 - - if isinstance(grid_thw[0], (list, tuple, np.ndarray)): - return [calc_one(x) for x in grid_thw] - - return calc_one(grid_thw) - - def _build_token_type_mapping(self) -> Dict[Any, int]: - mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"]) - for token in ( - self.IMG_START, - self.IMG_END, - self.VID_START, - self.VID_END, - ): - mapping[token] = IDS_TYPE_FLAG["image"] - mapping[self.image_patch_id] = IDS_TYPE_FLAG["image"] - return mapping - - def train(self) -> None: - """Enable training mode (produces labels).""" - self.is_training = True - - def eval(self) -> None: - """Enable evaluation mode (doesn't produce labels).""" - self.is_training = False - - def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None): - """ - Convert chat text into model inputs. - - Args: - text (str): The chat text containing placeholders for images and videos. - images (list, optional): List of images to be processed and inserted at image placeholders. - videos (list, optional): List of videos to be processed and inserted at video placeholders. - image_uuid (list, optional): List of unique identifiers for each image, used for caching or hashing. - video_uuid (list, optional): List of unique identifiers for each video, used for caching or hashing. - Returns: - dict: A dictionary with keys input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels, etc. - """ - - outputs = { - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "images": [], - "grid_thw": [], - "image_type_ids": [], - "labels": [], - "cur_position": 0, - "video_cnt": 0, - "num_input_image_tokens": 0, - "num_input_video_tokens": 0, - "mm_positions": [], - "mm_hashes": [], - } - - IMAGE_PLACEHOLDER = "<|image@placeholder|>" - VIDEO_PLACEHOLDER = "<|video@placeholder|>" - IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER) - VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER) - st, image_idx, video_idx = 0, 0, 0 - while st < len(text): - image_pos = text.find(IMAGE_PLACEHOLDER, st) - image_pos = len(text) if image_pos == -1 else image_pos - video_pos = text.find(VIDEO_PLACEHOLDER, st) - video_pos = len(text) if video_pos == -1 else video_pos - ed = min(image_pos, video_pos) - - self._add_text(text[st:ed], outputs) - if ed == len(text): - break - - if ed == image_pos: - image = images[image_idx] - uuid = image_uuid[image_idx] if image_uuid else None - if not isinstance(image, tuple): - self._add_image(image, outputs, uuid) - else: - # cached images are already processed - self._add_processed_image(image, outputs, uuid) - image_idx += 1 - st = ed + IMAGE_PLACEHOLDER_LEN - else: - item = videos[video_idx] - uuid = video_uuid[video_idx] if video_uuid else None - if not isinstance(item, tuple): - if isinstance(item, dict): - frames = self._load_and_process_video(item["video"], item) - else: - frames = self._load_and_process_video(item, {}) - self._add_video(frames, outputs, uuid) - else: - # cached frames are already processed - self._add_processed_video(item, outputs, uuid) - video_idx += 1 - st = ed + VIDEO_PLACEHOLDER_LEN - - return outputs - - def extract_mm_items(self, request: Request): - messages = parse_chat_messages(request.messages) - mm_items = [] - for msg in messages: - role = msg.get("role") - assert role in self.role_prefixes, f"Unsupported role: {role}" - content = msg.get("content") - if not isinstance(content, list): - content = [content] - for item in content: - if item.get("type") in ["image", "video"]: - mm_items.append(item) - - missing_hashes, missing_idx = [], [] - for idx, item in enumerate(mm_items): - if not item.get("data"): - # raw data not provided, should be retrieved from processor cache - missing_hashes.append(item.get("uuid")) - missing_idx.append(idx) - - if len(missing_hashes) > 0 and not self.enable_processor_cache: - raise ValueError("Missing items cannot be retrieved without processor cache.") - - dealer = None - if self.enable_processor_cache: - context = zmq.Context() - dealer = context.socket(zmq.DEALER) - dealer.connect("ipc:///dev/shm/processor_cache.ipc") - - missing_items = self.get_processor_cache(dealer, missing_hashes) - for idx in range(len(missing_items)): - if not missing_items[idx]: - raise ValueError(f"Missing item {idx} not found in processor cache") - mm_items[missing_idx[idx]]["data"] = missing_items[idx] - - images, videos = [], [] - image_uuid, video_uuid = [], [] - for item in mm_items: - if item.get("type") == "image": - images.append(item["data"]) - image_uuid.append(item["uuid"]) - elif item.get("type") == "video": - videos.append(item["data"]) - video_uuid.append(item["uuid"]) - else: - raise ValueError(f"Unsupported multimodal type: {item.get('type')}") - return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items - - def request2ids( - self, request: Request, tgts: List[str] = None - ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]: - """ - Convert chat messages into model inputs. - Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels. - """ - images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request) - - if self.tokenizer.chat_template is None: - raise ValueError("This model does not support chat template.") - - chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {} - message_dict = { - key: getattr(request, key, None) - for key in ["messages", "tools", "documents", "enable_thinking", "system"] - if getattr(request, key, None) is not None - } - prompt = self.tokenizer.apply_chat_template( - message_dict, - tokenize=False, - add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True, - **chat_template_kwargs, - ) - request.prompt_tokens = prompt - - outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid) - - if self.enable_processor_cache: - missing_idx = set(missing_idx) - hashes_to_cache, items_to_cache = [], [] - for idx in range(len(mm_items)): - if idx in missing_idx: - continue - meta = {} - t, h, w = outputs["grid_thw"][idx][0] - meta["thw"] = (t, h, w) - hashes_to_cache.append(outputs["mm_hashes"][idx]) - items_to_cache.append((outputs["images"][idx], meta)) - self.update_processor_cache(dealer, hashes_to_cache, items_to_cache) - - if self.is_training: - assert tgts, "Training must give tgt" - self._extract_labels(outputs, tgts) - - return outputs - - def prompt_token_ids2outputs( - self, request: Request, tgts: List[str] = None - ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]: - outputs = { - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "images": [], - "grid_thw": [], - "image_type_ids": [], - "labels": [], - "cur_position": 0, - "video_cnt": 0, - "num_input_image_tokens": 0, - "num_input_video_tokens": 0, - "mm_positions": [], - "mm_hashes": [], - } - prompt_token_ids = request.prompt_token_ids if request.prompt_token_ids else [] - prompt_token_ids_len = len(prompt_token_ids) - if not request.messages: - outputs["input_ids"].extend(prompt_token_ids) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * prompt_token_ids_len) - for i in range(prompt_token_ids_len): - outputs["position_ids"].append([i] * 3) - outputs["cur_position"] += prompt_token_ids_len - return outputs - images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self.extract_mm_items(request) - st, image_idx, video_idx = 0, 0, 0 - while st < prompt_token_ids_len: - cur_token_id = prompt_token_ids[st] - if cur_token_id == self.image_start_id: - if image_idx >= len(images): - raise ValueError("prompt token ids has more image placeholder than in messages") - # append image_start_id - outputs["input_ids"].extend([cur_token_id]) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]]) - outputs["position_ids"].append([outputs["cur_position"]] * 3) - outputs["cur_position"] += 1 - st += 1 - # process placeholder token ids - cur_idx = st - while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_end_id: - cur_idx += 1 - if cur_idx >= prompt_token_ids_len: - raise ValueError("image token ids not complete") - image = images[image_idx] - uuid = image_uuid[image_idx] if image_uuid else None - token_len = cur_idx - st - if not isinstance(image, tuple): - self._add_image(image, outputs, uuid, token_len) - else: - self._add_processed_image(image, outputs, uuid, token_len) - image_idx += 1 - st = cur_idx - elif cur_token_id == self.video_start_id: - if video_idx >= len(videos): - raise ValueError("prompt token ids has more video placeholder than in messages") - # append video_start_id - outputs["input_ids"].extend([cur_token_id]) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]]) - outputs["position_ids"].append([outputs["cur_position"]] * 3) - outputs["cur_position"] += 1 - st += 1 - # process placeholder token ids - cur_idx = st - while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.video_end_id: - cur_idx += 1 - if cur_idx >= prompt_token_ids_len: - raise ValueError("video token ids not complete") - video = videos[video_idx] - uuid = video_uuid[video_idx] if video_uuid else None - token_len = cur_idx - st - if not isinstance(video, tuple): - if isinstance(video, dict): - frames = self._load_and_process_video(video["video"], video) - else: - frames = self._load_and_process_video(video, {}) - self._add_video(frames, outputs, uuid, token_len) - else: - self._add_processed_video(video, outputs, uuid, token_len) - video_idx += 1 - st = cur_idx - else: - outputs["input_ids"].extend([cur_token_id]) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]]) - outputs["position_ids"].append([outputs["cur_position"]] * 3) - outputs["cur_position"] += 1 - st += 1 - if image_idx != len(images): - raise ValueError("number of images does not match") - if video_idx != len(videos): - raise ValueError("number of videos does not match") - - if self.enable_processor_cache: - missing_idx = set(missing_idx) - hashes_to_cache, items_to_cache = [], [] - for idx in range(len(mm_items)): - if idx in missing_idx: - continue - meta = {} - t, h, w = outputs["grid_thw"][idx][0] - meta["thw"] = (t, h, w) - hashes_to_cache.append(outputs["mm_hashes"][idx]) - items_to_cache.append((outputs["images"][idx], meta)) - self.update_processor_cache(dealer, hashes_to_cache, items_to_cache) - - return outputs - - def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None: - token_id = token if isinstance(token, int) else self.tokenizer.convert_tokens_to_ids(token) - outputs["input_ids"].append(token_id) - outputs["token_type_ids"].append(self.token_type_mapping[token]) - pos = outputs["cur_position"] - outputs["position_ids"].append([pos] * 3) - outputs["cur_position"] += 1 - - def _add_text(self, tokens, outputs: Dict) -> None: - if isinstance(tokens, str): - tokens = self.tokenizer.encode(tokens, add_special_tokens=False)["input_ids"] - outputs["input_ids"].extend(tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens)) - - start = outputs["cur_position"] - for i in range(len(tokens)): - outputs["position_ids"].append([start + i] * 3) - outputs["cur_position"] += len(tokens) - - def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len=None) -> None: - patches_h, patches_w = self.image_preprocessor.get_smarted_resize( - img.height, - img.width, - min_pixels=self.image_min_pixels, - max_pixels=self.image_max_pixels, - )[1] - num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2) - if token_len and token_len != num_tokens: - raise ValueError("image tokens num not match the size") - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - outputs["num_input_image_tokens"] += num_tokens - - pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - # Preprocess pixels - ret = self.image_preprocessor.preprocess( - images=[img.convert("RGB")], - do_normalize=False, - do_rescale=False, - predetermined_grid_thw=np.array([[patches_h, patches_w]]), - do_convert_rgb=True, - input_data_format=ChannelDimension.LAST, - ) - outputs["images"].append(ret["pixel_values"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(ret["image_grid_thw"]) - outputs["image_type_ids"].append(0) - - def _add_processed_image( - self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None - ) -> None: - img, meta = img_cache - num_tokens = img.shape[0] // (self.spatial_conv_size**2) - if token_len and num_tokens != token_len: - raise ValueError("image tokens num not match the size") - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - - _, h, w = meta["thw"] - pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - outputs["images"].append(img) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[1, h, w]])) - outputs["image_type_ids"].append(0) - - def _add_video(self, frames, outputs: Dict, uuid: Optional[str], token_len=None) -> None: - patches_h, patches_w = self.image_preprocessor.get_smarted_resize( - frames[0].height, - frames[0].width, - min_pixels=self.video_min_pixels, - max_pixels=self.video_max_pixels, - )[1] - num_frames = len(frames) - num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size) - if token_len and num_tokens != token_len: - raise ValueError("video tokens num not match the size") - - pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) - ret = self.image_preprocessor.preprocess( - images=None, - videos=pixel_stack, - do_normalize=False, - do_rescale=False, - predetermined_grid_thw=np.array([[patches_h, patches_w]] * num_frames), - do_convert_rgb=True, - input_data_format=ChannelDimension.LAST, - ) - outputs["images"].append(ret["pixel_values_videos"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(ret["video_grid_thw"]) - outputs["image_type_ids"].extend([1] * num_frames) - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["num_input_video_tokens"] += num_tokens - - pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - def _add_processed_video( - self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len=None - ) -> None: - frames, meta = frames_cache - num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size) - if token_len and num_tokens != token_len: - raise ValueError("video tokens num not match the size") - - t, h, w = meta["thw"] - outputs["images"].append(frames) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[t, h, w]])) - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["image_type_ids"].extend([1] * t) - - pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"]) - outputs["position_ids"].extend(pos_ids) - outputs["cur_position"] = np.max(pos_ids) + 1 - - def _extract_labels(self, outputs: Dict, tgts: List[str]) -> None: - input_ids = copy.deepcopy(outputs["input_ids"]) - labels = [self.tokenizer.ignored_index] * len(input_ids) - - tgt_count = input_ids.count(self.sep_token_id) - assert tgt_count == len(tgts), f"len(tgts) != len(src) {len(tgts)} vs {tgt_count}" - - tgt_index = 0 - for i, token_id in enumerate(input_ids): - if token_id == self.sep_token_id: - labels_token = self.tokenizer.tokenize(tgts[tgt_index]) - labels_token_id = self.tokenizer.convert_tokens_to_ids(labels_token) - labels[i - len(labels_token_id) : i] = labels_token_id - labels[i] = self.eos_token_id # - tgt_index += 1 - - outputs["labels"] = labels - - def _load_and_process_video(self, url: str, item: Dict) -> List[Image.Image]: - reader, meta, path = read_video_decord(url, save_to_disk=False) - - video_frame_args = dict() - video_frame_args["fps"] = item.get("fps", self.fps) - video_frame_args["min_frames"] = item.get("min_frames", self.min_frames) - video_frame_args["max_frames"] = item.get("max_frames", self.max_frames) - video_frame_args["target_frames"] = item.get("target_frames", self.target_frames) - video_frame_args["frames_sample"] = item.get("frames_sample", self.frames_sample) - - video_frame_args = self._set_video_frame_args(video_frame_args, meta) - - frames_data, _, timestamps = read_frames_decord( - path, - reader, - meta, - target_frames=video_frame_args["target_frames"], - target_fps=video_frame_args["fps"], - frames_sample=video_frame_args["frames_sample"], - save_to_disk=False, - ) - - frames: List[Image.Image] = [] - for img_array, ts in zip(frames_data, timestamps): - frames.append(render_frame_timestamp(img_array, ts)) - # Ensure even number of frames for temporal conv - if len(frames) % 2 != 0: - frames.append(copy.deepcopy(frames[-1])) - return frames - - def _set_video_frame_args(self, video_frame_args, video_meta): - """ - 根据已知参数和优先级,设定最终的抽帧参数 - """ - # 优先级:video_target_frames > (video_min_frames, video_max_frames) > video_fps - if video_frame_args["target_frames"] > 0: - if video_frame_args["fps"] >= 0: - raise ValueError("fps must be negative if target_frames is given") - if ( - video_frame_args["min_frames"] > 0 - and video_frame_args["target_frames"] < video_frame_args["min_frames"] - ): - raise ValueError("target_frames must be larger than min_frames") - if ( - video_frame_args["max_frames"] > 0 - and video_frame_args["target_frames"] > video_frame_args["max_frames"] - ): - raise ValueError("target_frames must be smaller than max_frames") - else: - if video_frame_args["fps"] < 0: - raise ValueError("Must provide either positive target_fps or positive target_frames.") - # 先计算在video_fps下抽到的帧数 - frames_to_extract = int(video_meta["duration"] * video_frame_args["fps"]) - # 判断是否在目标区间内,如果不是,则取target_frames为上界或下界 - if ( - video_frame_args["min_frames"] > 0 - and video_frame_args["max_frames"] > 0 - and video_frame_args["min_frames"] > video_frame_args["max_frames"] - ): - raise ValueError("min_frames must be smaller than max_frames") - if video_frame_args["min_frames"] > 0 and frames_to_extract < video_frame_args["min_frames"]: - video_frame_args["target_frames"] = video_frame_args["min_frames"] - video_frame_args["fps"] = -1 - if video_frame_args["max_frames"] > 0 and frames_to_extract > video_frame_args["max_frames"]: - video_frame_args["target_frames"] = video_frame_args["max_frames"] - video_frame_args["fps"] = -1 - - return video_frame_args - - def _compute_3d_positions(self, t: int, h: int, w: int, start_idx: int) -> List[List[int]]: - # Downsample time if needed - t_eff = t // self.temporal_conv_size if t != 1 else 1 - gh, gw = h // self.spatial_conv_size, w // self.spatial_conv_size - time_idx = np.repeat(np.arange(t_eff), gh * gw) - h_idx = np.tile(np.repeat(np.arange(gh), gw), t_eff) - w_idx = np.tile(np.arange(gw), t_eff * gh) - - coords = list(zip(time_idx, h_idx, w_idx)) - return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords] - - def _load_tokenizer(self): - """ - load tokenizer - - Returns: - tokenizer (AutoTokenizer) - """ - vocab_file_names = [ - "tokenizer.model", - "spm.model", - "ernie_token_100k.model", - ] - for i in range(len(vocab_file_names)): - if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])): - Ernie4_5Tokenizer.resource_files_names["vocab_file"] = vocab_file_names[i] - break - self.tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path) - - def get_processor_cache(self, socket, mm_hashes: list[str]) -> list: - """ - get cache correspond to given hash values - """ - req = pickle.dumps(mm_hashes) - socket.send_multipart([b"", req]) - _, resp = socket.recv_multipart() - mm_items = pickle.loads(resp) - data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}") - - return mm_items - - def update_processor_cache(self, socket, mm_hashes: list[str], mm_items): - """ - update cache data - """ - req = pickle.dumps((mm_hashes, mm_items)) - socket.send_multipart([b"", req]) - data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}") diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/process_video.py b/fastdeploy/input/v1/ernie4_5_vl_processor/process_video.py deleted file mode 100644 index 91120096c70..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/process_video.py +++ /dev/null @@ -1,205 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import io -import os -import random - -import numpy as np -from PIL import Image - -from fastdeploy.utils import data_processor_logger - -from .utils.io_utils import EXTRACTED_FRAME_DIR, get_filename -from .utils.video_utils import VideoReaderWrapper - - -def read_video_decord(video_path, save_to_disk): - """get reader and meta by decord""" - # video_path = get_downloadable(video_path, save_to_disk=save_to_disk) - if isinstance(video_path, VideoReaderWrapper): - video_reader = video_path - else: - if isinstance(video_path, bytes): - video_path = io.BytesIO(video_path) - video_reader = VideoReaderWrapper(video_path, num_threads=1) - vlen = len(video_reader) - fps = video_reader.get_avg_fps() - duration = vlen / float(fps) - - video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen} - - return video_reader, video_meta, video_path - - -def get_frame_indices( - vlen, - target_frames=-1, - target_fps=-1, - frames_sample="middle", - fix_start=None, - input_fps=-1, -): - """ - 取出对应的frame index - """ - assert frames_sample in ["rand", "middle", "leading"] - if target_frames > 0: - assert target_fps <= 0, "target_fps must be negative if target_frames is given." - if target_frames > vlen: - acc_samples = vlen - data_processor_logger.info( - f"target_frames={target_frames} is larger than video length {vlen}, " - f"will sample {acc_samples} frames." - ) - else: - acc_samples = target_frames - data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}") - - # split the video into `acc_samples` intervals, and sample from each interval. - intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int) - ranges = [] - for idx, interv in enumerate(intervals[:-1]): - ranges.append((interv, intervals[idx + 1] - 1)) - if frames_sample == "rand": - try: - frame_indices = [random.choice(range(x[0], x[1])) for x in ranges] - except Exception: - frame_indices = np.random.permutation(vlen)[:acc_samples] - frame_indices.sort() - frame_indices = list(frame_indices) - elif fix_start is not None: - frame_indices = [x[0] + fix_start for x in ranges] - elif frames_sample == "leading": - frame_indices = [x[0] for x in ranges] - elif frames_sample == "middle": - frame_indices = [(x[0] + x[1]) // 2 for x in ranges] - else: - raise NotImplementedError - - elif target_fps > 0: - assert target_frames <= 0, "target_frames must be negative if target_fps is given." - assert input_fps > 0, "input_fps must be provided if target_fps is given." - data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}") - duration = float(vlen) / input_fps - delta = 1 / target_fps # gap between frames, this is also the clip length each frame represents - if frames_sample == "middle": - frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta) - elif frames_sample == "leading": - frame_seconds = np.arange(0, duration, delta) - if frames_sample == "rand": - frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta) - rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5 - frame_seconds += rand_offset * delta - frame_indices = np.around(frame_seconds * input_fps).astype(int) - frame_indices = [e for e in frame_indices if e < vlen] - - else: - raise ValueError("Must provide either positive target_fps or positive target_frames.") - - return frame_indices - - -def read_frames_decord( - video_path, - video_reader, - video_meta, - target_frames=-1, - target_fps=-1, - frames_sample="middle", - fix_start=None, - save_to_disk=False, - cache_dir=EXTRACTED_FRAME_DIR, - frame_indices=None, - tol=10, -): - """get frames by decord""" - - if frame_indices is None: - frame_indices = get_frame_indices( - video_meta["num_of_frame"], - target_frames=target_frames, - target_fps=target_fps, - frames_sample=frames_sample, - fix_start=fix_start, - input_fps=video_meta["fps"], - ) - - frames = [] - for frame_indice_index in range(0, len(frame_indices)): - frame_indice = frame_indices[frame_indice_index] - try: - frames.append(video_reader[frame_indice].asnumpy()) # (T, H, W, C) - except Exception as e: - data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}") - previous_counter = 1 - later_counter = 1 - previous_after_flag = True - if frame_indice == 0 or frame_indice == len(video_reader) - 1: - cur_tol = tol * 2 - else: - cur_tol = tol - while previous_counter < cur_tol or later_counter < cur_tol: - if previous_after_flag: - if frame_indice - previous_counter < 0: - previous_counter += 1 - previous_after_flag = not previous_after_flag - continue - try: - frames.append(video_reader[frame_indice - previous_counter].asnumpy()) - data_processor_logger.info( - f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame" - ) - frame_indices[frame_indice_index] = frame_indice - previous_counter - break - except Exception as e: - previous_counter += 1 - data_processor_logger.info(f"error: {e}") - else: - if frame_indice + later_counter >= len(video_reader): - later_counter += 1 - previous_after_flag = not previous_after_flag - continue - try: - frames.append(video_reader[frame_indice + later_counter].asnumpy()) - data_processor_logger.info( - f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame" - ) - frame_indices[frame_indice_index] = frame_indice + later_counter - break - except Exception: - later_counter += 1 - previous_after_flag = not previous_after_flag - - frames = np.stack(frames, axis=0) - assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}" - - ret = [] - - url_sha1 = get_filename() - for idx, frame in enumerate(frames): - tmp = Image.fromarray(frame, "RGB") - if save_to_disk: - save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png") - if not os.path.exists(os.path.dirname(save_path)): - os.makedirs(os.path.dirname(save_path)) - tmp.save(save_path) - tmp = save_path - ret.append(tmp) - - time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices] - - return ret, frame_indices, time_stamps diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/Roboto-Regular.ttf b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/Roboto-Regular.ttf deleted file mode 100644 index 7e3bb2f8ce7..00000000000 Binary files a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/Roboto-Regular.ttf and /dev/null differ diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/__init__.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/__init__.py deleted file mode 100644 index 31be300c18e..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/io_utils.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/io_utils.py deleted file mode 100644 index 1535b64d4f0..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/io_utils.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import base64 -import datetime -import hashlib -import os -import threading -import uuid -from pathlib import Path - -import requests -from PIL import Image - -RAW_VIDEO_DIR = "./download_tmp/raw_video/" -RAW_IMAGE_DIR = "./download_tmp/raw_images/" -EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/" -TMP_DIR = "./download_tmp/upload_tmp/" - - -def file_download(url, download_dir, save_to_disk=False, retry=0, retry_interval=3): - """ - Description: 下载url,如果url是PIL直接返回 - Args: - url(str, PIL): http/本地路径/io.Bytes,注意io.Bytes是图片字节流 - download_path: 在save_to_disk=True的情况下生效,返回保存地址 - save_to_disk: 是否保存在本地路径 - - """ - from .video_utils import VideoReaderWrapper - - if isinstance(url, Image.Image): - return url - elif isinstance(url, VideoReaderWrapper): - return url - elif url.startswith("http"): - response = requests.get(url) - bytes_data = response.content - elif os.path.isfile(url): - if save_to_disk: - return url - bytes_data = open(url, "rb").read() - else: - bytes_data = base64.b64decode(url) - if not save_to_disk: - return bytes_data - - download_path = os.path.join(download_dir, get_filename(url)) - Path(download_path).parent.mkdir(parents=True, exist_ok=True) - with open(download_path, "wb") as f: - f.write(bytes_data) - return download_path - - -def get_filename(url=None): - """ - Get Filename - """ - if url is None: - return str(uuid.uuid4()).replace("-", "") - t = datetime.datetime.now() - if not isinstance(url, bytes): - url = url.encode("utf-8") - - md5_hash = hashlib.md5(url).hexdigest() - pid = os.getpid() - tid = threading.get_ident() - - # 去掉后缀,防止save-jpg报错 - image_filname = f"{t.year}-{t.month:02d}-{t.day:02d}-{pid}-{tid}-{md5_hash}" - return image_filname - - -def get_downloadable( - url, - download_dir=RAW_VIDEO_DIR, - save_to_disk=False, - retry=0, - retry_interval=3, -): - """download video and store it in the disk - - return downloaded **path** if save_to_disk is set to true - return downloaded **bytes** if save_to_disk is set to false - """ - - if not os.path.exists(download_dir): - os.makedirs(download_dir) - downloaded_path = file_download( - url, - download_dir, - save_to_disk=save_to_disk, - retry=retry, - retry_interval=retry_interval, - ) - return downloaded_path diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/render_timestamp.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/render_timestamp.py deleted file mode 100644 index 9b24226ed88..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/render_timestamp.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import os -from pathlib import Path - -from PIL import Image, ImageDraw, ImageFont - -cur_directory = Path(__file__).parent.absolute() -FONT_PATH = os.path.join(cur_directory, "Roboto-Regular.ttf") - - -def render_single_image_with_timestamp(image: Image, number: str, rate: float, font_path: str = FONT_PATH): - """ - 函数功能: 给pil.image的图片渲染时间戳 - 时间戳的大小为 min(width, height)的rate - 字体的颜色为黑色, 轮廓是白色, 轮廓的大小是字体的10% - 返回一个 Image 对象 - """ - draw = ImageDraw.Draw(image) # 创建一个可绘制对象 - width, height = image.size # 获取图片大小 - font_size = int(min(width, height) * rate) # 设置字体大小 - outline_size = int(font_size * 0.1) # 设置轮廓大小 - font = ImageFont.truetype(font_path, font_size) # 加载字体文件, 设置字体大小 - x = 0 - y = 0 # 文本的x坐标, y坐标 - - # 绘制黑色的时间戳,白色的边框 - draw.text( - (x, y), - number, - font=font, - fill=(0, 0, 0), - stroke_width=outline_size, - stroke_fill=(255, 255, 255), - ) - - return image - - -def timestamp_converting(time_stamp_in_seconds): - """ - convert timestamp format from seconds to hr:min:sec - """ - # get hours - hours = 0 - while time_stamp_in_seconds >= 3600: - hours += 1 - time_stamp_in_seconds -= 3600 - # get minutes - mins = 0 - while time_stamp_in_seconds >= 60: - mins += 1 - time_stamp_in_seconds -= 60 - time_hours = f"{int(hours):02d}" - time_mins = f"{int(mins):02d}" - time_secs = f"{time_stamp_in_seconds:05.02f}" - fi_time_stamp = time_hours + ":" + time_mins + ":" + time_secs - - return fi_time_stamp - - -def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration): - """ - function: get the timestamp of a frame, 在均匀抽帧时用。 - - num_frames: 总帧数 - frameid_list: 被抽帧的帧的索引 - duration: 视频的总时长 - return: timestamp; xx:xx:xx (str) - """ - time_stamp = duration * 1.0 * frame_id / num_frames - - return time_stamp - - -def render_frame_timestamp(frame, timestamp, font_rate=0.1): - """ - 函数功能, 给frame, 按照顺序将 index 渲染上去 - 逻辑思路: 把index渲染到图片的左上方 - - frame: 帧,PIL.Image object - timestamp: 时间戳,单位是秒 - font_rate: 字体大小占 min(wi, hei)的比率 - """ - - time_stamp = "time: " + timestamp_converting(timestamp) - new_frame = render_single_image_with_timestamp(frame, time_stamp, font_rate) - - return new_frame diff --git a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/video_utils.py b/fastdeploy/input/v1/ernie4_5_vl_processor/utils/video_utils.py deleted file mode 100644 index a4769ca8ecc..00000000000 --- a/fastdeploy/input/v1/ernie4_5_vl_processor/utils/video_utils.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import io -import os -from tempfile import NamedTemporaryFile as ntf - -import decord - -try: - # moviepy 1.0 - import moviepy.editor as mp -except: - # moviepy 2.0 - import moviepy as mp - - -def is_gif(data: bytes) -> bool: - """ - check if a bytes is a gif based on the magic head - """ - return data[:6] in (b"GIF87a", b"GIF89a") - - -class VideoReaderWrapper(decord.VideoReader): - """ - Solving memory leak bug - - https://github.com/dmlc/decord/issues/208 - """ - - def __init__(self, video_path, *args, **kwargs): - with ntf(delete=True, suffix=".gif") as gif_file: - gif_input = None - self.original_file = None - if isinstance(video_path, str): - self.original_file = video_path - if video_path.lower().endswith(".gif"): - gif_input = video_path - elif isinstance(video_path, bytes): - if is_gif(video_path): - gif_file.write(video_path) - gif_input = gif_file.name - elif isinstance(video_path, io.BytesIO): - video_path.seek(0) - tmp_bytes = video_path.read() - video_path.seek(0) - if is_gif(tmp_bytes): - gif_file.write(tmp_bytes) - gif_input = gif_file.name - - if gif_input is not None: - clip = mp.VideoFileClip(gif_input) - mp4_file = ntf(delete=False, suffix=".mp4") - clip.write_videofile(mp4_file.name, verbose=False, logger=None) - clip.close() - video_path = mp4_file.name - self.original_file = video_path - - super().__init__(video_path, *args, **kwargs) - self.seek(0) - - def __getitem__(self, key): - frames = super().__getitem__(key) - self.seek(0) - return frames - - def __del__(self): - if self.original_file and os.path.exists(self.original_file): - os.remove(self.original_file) diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/__init__.py b/fastdeploy/input/v1/paddleocr_vl_processor/__init__.py deleted file mode 100644 index 8f79e65d634..00000000000 --- a/fastdeploy/input/v1/paddleocr_vl_processor/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from .paddleocr_vl_processor import PaddleOCRVLProcessor -from .process import DataProcessor - -__all__ = ["DataProcessor", "PaddleOCRVLProcessor"] diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/image_processor.py b/fastdeploy/input/v1/paddleocr_vl_processor/image_processor.py deleted file mode 100644 index 8e333d5bf96..00000000000 --- a/fastdeploy/input/v1/paddleocr_vl_processor/image_processor.py +++ /dev/null @@ -1,275 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -"""Image processor class for Keye.""" - -# TODO: Support videos - -import json -import logging -import math -from pathlib import Path -from typing import Dict, List, Optional, Union - -import numpy as np -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_utils import ( - ImageInput, - is_valid_image, - make_list_of_images, - to_numpy_array, -) - -_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] - - -def make_batched_images(images) -> List[List[ImageInput]]: - """ - Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: - images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): - The input image. - - Returns: - list: A list of images. - """ - if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): - return [img for img_list in images for img in img_list] - - elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): - return images - - elif is_valid_image(images): - return [images] - - raise ValueError(f"Could not make batched images from {images}") - - -def adjust_size(size, patch_size): - num_patches = size // patch_size - if num_patches % 2 != 0: - num_patches -= 1 - return num_patches * patch_size - - -def smart_resize( - height: int, - width: int, - factor: int = 28, - min_pixels: int = 28 * 28 * 130, - max_pixels: int = 28 * 28 * 1280, -): - """Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. The aspect ratio of the image is maintained as closely as possible. - - """ - # if height < factor or width < factor: - # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}") - # if int(height < factor//4) + int(width < factor//4): - # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}") - - if height < factor: - logging.debug(f"smart_resize: height={height} < factor={factor}, reset height=factor") - width = round((width * factor) / height) - height = factor - - if width < factor: - logging.debug(f"smart_resize: width={width} < factor={factor}, reset width=factor") - height = round((height * factor) / width) - width = factor - - if max(height, width) / min(height, width) > 200: - raise ValueError( - f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}" - ) - h_bar = round(height / factor) * factor - w_bar = round(width / factor) * factor - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = math.floor(height / beta / factor) * factor - w_bar = math.floor(width / beta / factor) * factor - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = math.ceil(height * beta / factor) * factor - w_bar = math.ceil(width * beta / factor) * factor - return h_bar, w_bar - - -class ImageProcessor(BaseImageProcessor): - model_input_names = [ - "pixel_values", - "image_grid_thw", - "pixel_values_videos", - "video_grid_thw", - ] - - def __init__( - self, - do_resize: bool = True, - resample: int = 3, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: bool = True, - min_pixels: int = 28 * 28 * 130, - max_pixels: int = 28 * 28 * 1280, - patch_size: int = 14, - temporal_patch_size: int = 1, - merge_size: int = 2, - **kwargs, - ) -> None: - super().__init__() - self.do_resize = do_resize - self.resample = resample - self.do_rescale = do_rescale - self.rescale_factor = rescale_factor - self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN - self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD - self.min_pixels = min_pixels - self.max_pixels = max_pixels - self.patch_size = patch_size - self.temporal_patch_size = temporal_patch_size - self.merge_size = merge_size - self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used - self.do_convert_rgb = do_convert_rgb - - @classmethod - def from_pretrained(cls, pretrained_model_dir): - pretrained_model_dir = Path(pretrained_model_dir) - image_processor_config_path = pretrained_model_dir / "preprocessor_config.json" - with open(image_processor_config_path, "r", encoding="utf-8") as f: - image_processor_config = json.load(f) - return cls(**image_processor_config) - - def _preprocess( - self, - images, - do_resize: Optional[bool] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: Optional[bool] = None, - ): - images = make_list_of_images(images) - - if do_convert_rgb: - images = [image.convert("RGB") for image in images] - - width, height = images[0].size - resized_height, resized_width = height, width - processed_images = [] - - for image in images: - if do_resize: - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, - min_pixels=self.min_pixels, - max_pixels=self.max_pixels, - ) - - image = image.resize((resized_width, resized_height), resample=self.resample) - - image = to_numpy_array(image) - - if do_rescale: - image = (image * rescale_factor).astype(np.float32) - - if do_normalize: - image = image.astype(np.float32) - image -= np.array(image_mean, dtype=np.float32) - image /= np.array(image_std, dtype=np.float32) - - processed_images.append(image) - - patches = np.array(processed_images) - patches = patches.transpose(0, 3, 1, 2) - if patches.shape[0] == 1: - patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1)) - channel = patches.shape[1] - grid_t = patches.shape[0] // self.temporal_patch_size - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - - patches = patches.reshape( - grid_t, - self.temporal_patch_size, - channel, - grid_h, - self.patch_size, - grid_w, - self.patch_size, - ) - patches = patches.transpose(0, 3, 5, 2, 1, 4, 6) - assert self.temporal_patch_size == 1 - flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size) - return flatten_patches, np.array([grid_t, grid_h, grid_w]) - - def preprocess( - self, - images, - videos=None, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - do_convert_rgb: Optional[bool] = None, - return_tensors=None, - ): - do_resize = do_resize if do_resize is not None else self.do_resize - size = size if size is not None else self.size - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - - if videos is not None: - raise NotImplementedError("Videos are not yet supported") - - patches, image_grid_thw = self._preprocess( - images, - do_resize=do_resize, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - do_convert_rgb=do_convert_rgb, - ) - pixel_values = np.array(patches) - data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw} - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/paddleocr_vl_processor.py b/fastdeploy/input/v1/paddleocr_vl_processor/paddleocr_vl_processor.py deleted file mode 100644 index f7d7cfbd2b1..00000000000 --- a/fastdeploy/input/v1/paddleocr_vl_processor/paddleocr_vl_processor.py +++ /dev/null @@ -1,322 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import numpy as np - -from fastdeploy.engine.request import Request -from fastdeploy.input.v1.text_processor import DataProcessor as TextProcessor -from fastdeploy.utils import data_processor_logger - -from .process import DataProcessor - -_SAMPLING_EPS = 1e-5 -from fastdeploy.input.utils import process_stop_token_ids - - -class PaddleOCRVLProcessor(TextProcessor): - """ - PaddleOCR Vision-Language processor for handling multimodal inputs. - - This processor extends TextProcessor to support: - - Image processing - - Multimodal feature extraction - - Tokenization and position encoding - - Request processing and model input generation - - Attributes: - processor (DataProcessor): Underlying data processor instance - tokenizer: Text tokenizer instance - limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt - """ - - def __init__( - self, - config, - model_name_or_path, - limit_mm_per_prompt=None, - mm_processor_kwargs=None, - reasoning_parser_obj=None, - tool_parser_obj=None, - enable_processor_cache=False, - ): - """ - Initialize PaddleOCRVLProcessor instance. - - Args: - config: Model configuration object - model_name_or_path (str): Pretrained model name or path - limit_mm_per_prompt (dict, optional): Limits for multimodal inputs - mm_processor_kwargs (dict, optional): Multimodal processor arguments - reasoning_parser_obj: Reasoning parser instance - tool_parser_obj: Tool parser instance - """ - super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj) - data_processor_logger.info(f"model_name_or_path: {model_name_or_path}") - processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs) - self.processor = DataProcessor( - model_path=model_name_or_path, - enable_processor_cache=enable_processor_cache, - tokens_per_second=config.vision_config.tokens_per_second, - tokenizer=self.tokenizer, - **processor_kwargs, - ) - self.image_patch_id = self.processor.image_patch_id - self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt) - - def process_request(self, request, max_model_len=None, **kwargs): - """ - Process incoming request and generate model inputs. - - Args: - request: Input request object - max_model_len (int, optional): Maximum context length - **kwargs: Additional processing parameters - - Returns: - Request: Processed request with model inputs - """ - task = request.to_dict() - task["enable_thinking"] = kwargs.get("enable_thinking", False) - self.process_request_dict(task, max_model_len) - request = Request.from_dict(task) - request = self._apply_default_parameters(request) - return request - - def _parse_processor_kwargs(self, kwargs): - """ - Parse and validate multimodal processor arguments. - - Args: - kwargs (dict): Processor configuration arguments - - Returns: - dict: Validated processor arguments - - Raises: - ValueError: If arguments format is invalid - """ - if not kwargs: - return {} - - try: - if not isinstance(kwargs, dict): - raise ValueError("mm-processor-kwargs must be a dictionary") - - # Validate kwargs types against expected schema - data_processor_logger.info(f"Processing kwargs: {kwargs}") - expected_types = { - "video_max_frames": int, # Maximum video frames parameter - "video_min_frames": int, # Minimum video frames parameter - } - - for key, value in kwargs.items(): - if key in expected_types and not isinstance(value, expected_types[key]): - raise ValueError( - f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}" - ) - - return kwargs - - except Exception as e: - data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}") - return {} - - def _parse_limits(self, limits): - """ - Parse and validate multimodal input limits. - - Args: - limits (dict): Input limits configuration - - Returns: - dict: Validated limits with defaults - - Raises: - ValueError: If limits format is invalid - """ - DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1} - - if not limits: - return DEFAULT_LIMITS - - try: - if not isinstance(limits, dict): - raise ValueError("limit-mm-per-prompt must be a dictionary") - data_processor_logger.info(f"_parse_limits:{limits}") - return {**DEFAULT_LIMITS, **limits} - except Exception as e: - data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits") - return DEFAULT_LIMITS - - def _check_mm_limits(self, item): - """ - Validate multimodal inputs against configured limits. - - Args: - item: Input request item to validate - - Raises: - ValueError: If input exceeds configured limits - """ - if isinstance(item, dict): - # 请求包含prompt和multi_modal_data - mm_data = item - else: - # 请求包含messages - mm_data = {"image": [], "video": []} - - for message in item: - if isinstance(message.get("content"), list): - for part in message["content"]: - if part.get("type") in ["image_url", "image"]: - mm_data["image"].append(part) - elif part.get("type") in ["video_url", "video"]: - mm_data["video"].append(part) - - for modality, data in mm_data.items(): - if modality in self.limit_mm_per_prompt: - limit = self.limit_mm_per_prompt[modality] - if len(data) > limit: - raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}") - - def process_request_dict(self, request, max_model_len=None, **kwargs): - """ - Process request dictionary into model inputs. - - Args: - request (dict): Input request dictionary - max_model_len (int, optional): Maximum context length - - Returns: - dict: Processed request with model inputs - - Raises: - ValueError: If request format is invalid - """ - - request = self._apply_default_parameters(request) - if not request.eos_token_ids: - request.eos_token_ids = self.eos_token_ids - - # processing stop_sequences and stop_token_ids - process_stop_token_ids(request, self.update_stop_seq) - - if request.prompt: - multimodal_data = request.multimodal_data - if multimodal_data is None: - multimodal_data = {} - self._check_mm_limits(multimodal_data) - images = multimodal_data.get("image", None) - videos = multimodal_data.get("video", None) - outputs = self.processor.text2ids(request.prompt, images, videos) - - elif request.messages: - messages = request.messages - self._check_mm_limits(messages) - outputs = self.processor.request2ids(request) - - else: - raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") - - metadata = request.metadata - # Handle continuation of previous generation by appending existing tokens - if metadata and metadata.get("generated_token_ids"): - self.append_generated_tokens(outputs, metadata["generated_token_ids"]) - outputs = self.pack_outputs(outputs) - - request.prompt_token_ids = outputs["input_ids"].tolist() - request.prompt_token_ids_len = len(request.prompt_token_ids) - request.multimodal_inputs = outputs - - # Handle prompt truncation if exceeds model context length - if max_model_len is not None and len(request.prompt_token_ids) > max_model_len: - request.prompt_token_ids = request.prompt_token_ids[ - : max_model_len - 1 - ] # Leave space for at least 1 new token - - # Set default max_tokens if not specified - max_tokens = max_model_len - len(request.prompt_token_ids) - if getattr(request.sampling_params, "max_tokens", None) is None: - request.sampling_params.max_tokens = max(1, max_tokens) - else: - request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens) - - if request.sampling_params.top_p is not None and request.sampling_params.top_p < _SAMPLING_EPS: - request.sampling_params.top_p = _SAMPLING_EPS - request.sampling_params.top_k = 1 - - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - parts = request.request_id.split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.get("n", 1) - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request.request_id] = model_status - request.enable_thinking = model_status == "think_start" - - return request - - def append_generated_tokens(self, multimodal_inputs, generated_token_ids): - """ - Append generated tokens to existing outputs. - - Args: - outputs: Current model outputs - generated_token_ids: Generated tokens to append - """ - num_tokens = len(generated_token_ids) - multimodal_inputs["input_ids"].extend(generated_token_ids) - multimodal_inputs["token_type_ids"].extend([0] * num_tokens) - - pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens) - multimodal_inputs["position_ids"].append(pos_ids) - multimodal_inputs["cur_position"] += num_tokens - - def pack_outputs(self, outputs): - """ - Prepare final output dictionary for model. - - Args: - outputs: Intermediate processing outputs - - Returns: - dict: Packed output dictionary with all required fields - """ - if not outputs["images"]: - outputs["images"] = None # No images case - outputs["grid_thw"] = None # No spatial dimensions - outputs["image_type_ids"] = None # No type IDs - else: - outputs["images"] = np.vstack(outputs["images"]) # Stack image features vertically - outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) # Stack spatial dimensions - outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) # Convert to numpy array - - # Convert all outputs to numpy arrays with appropriate types - outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) # Token IDs as int64 - outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) # Type IDs as int64 - outputs["position_ids"] = np.concatenate( - outputs["position_ids"], axis=1, dtype=np.int64 - ) # Concatenate position ID - - outputs["image_patch_id"] = self.processor.image_token_id - outputs["video_patch_id"] = self.processor.video_token_id - outputs["position_ids"] = outputs["position_ids"].transpose(1, 0) - outputs["mm_num_token_func"] = self.processor.mm_num_tokens - return outputs diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/process.py b/fastdeploy/input/v1/paddleocr_vl_processor/process.py deleted file mode 100644 index ea6b63ee9d8..00000000000 --- a/fastdeploy/input/v1/paddleocr_vl_processor/process.py +++ /dev/null @@ -1,622 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import pickle -from typing import Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import zmq -from paddleformers.transformers import AutoTokenizer -from PIL import Image - -from fastdeploy.engine.request import ImagePosition, Request -from fastdeploy.entrypoints.chat_utils import parse_chat_messages -from fastdeploy.input.ernie4_5_vl_processor import read_video_decord -from fastdeploy.input.mm_data_processor import MMBaseDataProcessor -from fastdeploy.input.utils import IDS_TYPE_FLAG -from fastdeploy.multimodal.hasher import MultimodalHasher -from fastdeploy.utils import data_processor_logger - -from .image_processor import ImageProcessor -from .process_video import sample_frames - - -class DataProcessor(MMBaseDataProcessor): - """ - Processes multimodal inputs (text, images, videos) into model-ready formats. - - Handles: - - Tokenization of text with special tokens for visual content - - Image and video preprocessing - - Generation of 3D positional embeddings - - Conversion of chat messages to model inputs - - Attributes: - tokenizer: Text tokenizer instance - image_processor: Image/video preprocessor - image_token: Special token for image placeholders - video_token: Special token for video placeholders - vision_start: Token marking start of visual content - """ - - def __init__( - self, - model_path: str, - enable_processor_cache: bool = False, - video_min_frames: int = 4, - video_max_frames: int = 768, - video_target_frames: int = -1, - video_fps: int = -1, - tokens_per_second: int = 2, - tokenizer=None, - **kwargs, - ) -> None: - """ - Initialize the data processor. - - Args: - model_path: Path to pretrained model - video_min_frames: Minimum frames to sample from videos - video_max_frames: Maximum frames to sample from videos - tokens_per_second: Temporal resolution for positional embeddings - **kwargs: Additional configuration - """ - super().__init__() - self.min_frames = video_min_frames - self.max_frames = video_max_frames - self.target_frames = video_target_frames - self.fps = video_fps - - # Initialize tokenizer with left padding and fast tokenizer - if tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True) - self.tokenizer.ignored_index = -100 # Set ignored index for loss calculation - else: - self.tokenizer = tokenizer - self.image_processor = ImageProcessor.from_pretrained(model_path) # Initialize image processor - self.enable_processor_cache = enable_processor_cache - - # Convolution sizes for patch aggregation - self.spatial_conv_size = self.image_processor.merge_size - self.temporal_conv_size = self.image_processor.temporal_patch_size - - # Special tokens and IDs - self.image_token = "<|IMAGE_PLACEHOLDER|>" - self.video_token = "<|video_pad|>" - - self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token) - self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token) - self.image_patch_id = self.image_token_id - - self.vision_start = "<|IMAGE_START|>" - self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start) - - self.tokens_per_second = tokens_per_second - - self.role_prefixes = { - "system": "", - "user": "User: ", - "bot": "Assistant: ", - "assistant": "Assistant: ", - } - - @staticmethod - def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]: - """ - Calculate the number of tokens in the multimodal input. - """ - if isinstance(grid_thw, paddle.Tensor): - grid_thw = grid_thw.numpy() - - if len(grid_thw) == 0: - return 0 - - def calc_one(thw): - t, h, w = map(int, thw) - return t * h * w // 4 - - if isinstance(grid_thw[0], (list, tuple, np.ndarray)): - return [calc_one(x) for x in grid_thw] - - return calc_one(grid_thw) - - def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None): - """ - Convert text with image/video placeholders into model inputs. - - Args: - text: Input text with <|image@placeholder|> and <|video@placeholder|> markers - images: List of PIL Images corresponding to image placeholders - videos: List of video data corresponding to video placeholders - image_uuid: List of unique identifiers for each image, used for caching or hashing. - video_uuid: List of unique identifiers for each video, used for caching or hashing. - - Returns: - Dict containing: - - input_ids: Token IDs - - token_type_ids: Type identifiers (text/image/video) - - position_ids: 3D positional embeddings - - images: Preprocessed visual features - - grid_thw: Spatial/temporal dimensions - - image_type_ids: Visual content type (0=image, 1=video) - """ - - outputs = { - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "images": [], - "grid_thw": [], - "image_type_ids": [], - "labels": [], - "cur_position": 0, - "video_cnt": 0, - "num_input_image_tokens": 0, - "num_input_video_tokens": 0, - "fps": [], - "mm_positions": [], - "mm_hashes": [], - "vit_seqlen": [], - "vit_position_ids": [], - } - - # Define placeholders and their lengths - IMAGE_PLACEHOLDER = self.image_token - VIDEO_PLACEHOLDER = self.video_token - IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER) - VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER) - - # Initialize tracking variables for text parsing - st, image_idx, video_idx = 0, 0, 0 # Start position, image counter, video counter - while st < len(text): - # Find next image or video placeholder in text - image_pos = text.find(IMAGE_PLACEHOLDER, st) - image_pos = len(text) if image_pos == -1 else image_pos # Set to end if not found - video_pos = text.find(VIDEO_PLACEHOLDER, st) - video_pos = len(text) if video_pos == -1 else video_pos # Set to end if not found - ed = min(image_pos, video_pos) # End position is first placeholder found - - self._add_text(text[st:ed], outputs) - if ed == len(text): - break - - if ed == image_pos: - image = images[image_idx] - uuid = image_uuid[image_idx] if image_uuid else None - if not isinstance(image, tuple): - self._add_image(image, outputs, uuid) - else: - self._add_processed_image(image, outputs, uuid) - image_idx += 1 - st = ed + IMAGE_PLACEHOLDER_LEN - else: - item = videos[video_idx] - uuid = video_uuid[video_idx] if video_uuid else None - if not isinstance(item, tuple): - if isinstance(item, dict): - frames, meta = self._load_and_process_video(item["video"], item) - else: - frames, meta = self._load_and_process_video(item, {}) - self._add_video(frames, meta, outputs, uuid) - else: - # cached frames are already processed - self._add_processed_video(item, outputs, uuid) - video_idx += 1 - st = ed + VIDEO_PLACEHOLDER_LEN - - return outputs - - def request2ids( - self, request: Request, tgts: List[str] = None - ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]: - """ - Convert chat request with multimodal messages into model inputs. - - Args: - request: Dictionary containing: - - messages: List of chat messages with text/image/video content - - request_id: Unique identifier for logging - tgts: Optional target sequences - - Returns: - Dict with same structure as text2ids() output - """ - - # Parse and validate chat messages - messages = parse_chat_messages(request.messages) - mm_items = [] - for msg in messages: - role = msg.get("role") - assert role in self.role_prefixes, f"Unsupported role: {role}" - - # Normalize content to list format - content = msg.get("content") - if not isinstance(content, list): - content = [content] - # Collect all visual content items - for item in content: - if item.get("type") in ["image", "video"]: - mm_items.append(item) - - missing_hashes, missing_idx = [], [] - for idx, item in enumerate(mm_items): - if not item.get("data"): - # raw data not provided, should be retrieved from processor cache - missing_hashes.append(item.get("uuid")) - missing_idx.append(idx) - - if len(missing_hashes) > 0 and not self.enable_processor_cache: - raise ValueError("Missing items cannot be retrieved without processor cache.") - - if self.enable_processor_cache: - context = zmq.Context() - dealer = context.socket(zmq.DEALER) - dealer.connect("ipc:///dev/shm/processor_cache.ipc") - - missing_items = self.get_processor_cache(dealer, missing_hashes) - for idx in range(len(missing_items)): - if not missing_items[idx]: - raise ValueError(f"Missing item {idx} not found in processor cache") - mm_items[missing_idx[idx]]["data"] = missing_items[idx] - - images, videos = [], [] - image_uuid, video_uuid = [], [] - for item in mm_items: - if item.get("type") == "image": - images.append(item["data"]) - image_uuid.append(item["uuid"]) - elif item.get("type") == "video": - videos.append(item["data"]) - video_uuid.append(item["uuid"]) - else: - raise ValueError(f"Unsupported multimodal type: {item.get('type')}") - - if self.tokenizer.chat_template is None: - raise ValueError("This model does not support chat template.") - - chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {} - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True, - **chat_template_kwargs, - ) - request.prompt_tokens = prompt - - outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid) - - if self.enable_processor_cache: - missing_idx = set(missing_idx) - hashes_to_cache, items_to_cache = [], [] - for idx in range(len(mm_items)): - if idx in missing_idx: - continue - meta = {} - t, h, w = outputs["grid_thw"][idx] - meta["thw"] = (t, h, w) - meta["fps"] = outputs["fps"][idx] - hashes_to_cache.append(outputs["mm_hashes"][idx]) - items_to_cache.append((outputs["images"][idx], meta)) - self.update_processor_cache(dealer, hashes_to_cache, items_to_cache) - - return outputs - - def _add_text(self, tokens, outputs: Dict) -> None: - """ - Add text tokens to model inputs dictionary. - - Args: - tokens: Text string or already tokenized IDs - outputs: Dictionary accumulating model inputs - - Note: - - Handles both raw text and pre-tokenized inputs - - Updates position IDs for 3D embeddings - """ - if not tokens: - return None - - if isinstance(tokens, str): - tokens_str = self.tokenizer.tokenize(tokens) - tokens = self.tokenizer.convert_tokens_to_ids(tokens_str) - - num_tokens = len(tokens) - outputs["input_ids"].extend(tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens) - - pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray: - """ - Generate 3D positional embeddings for text tokens. - - Args: - start_pos: Starting position index - num_tokens: Number of tokens to generate positions for - - Returns: - numpy.ndarray: 3D position IDs shaped (3, num_tokens) - """ - text_array = np.arange(num_tokens).reshape(1, -1) - text_index = np.broadcast_to(text_array, (3, num_tokens)) - position = text_index + start_pos - return position - - def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: - """ - Add image data to model inputs dictionary. - - Args: - img: PIL Image to process - outputs: Dictionary accumulating model inputs - - Note: - - Preprocesses image and calculates spatial dimensions - - Adds image token IDs and type markers - - Generates appropriate position embeddings - """ - ret = self.image_processor.preprocess(images=[img.convert("RGB")]) - num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2 - grid_thw = ret["grid_thw"].tolist() - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_token_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - outputs["num_input_image_tokens"] += int(num_tokens) - - outputs["images"].append(ret["pixel_values"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(grid_thw) - outputs["image_type_ids"].append(0) - - # position_ids - t, h, w = grid_thw - pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - outputs["fps"].append(0) - numel = h * w - outputs["vit_seqlen"].append(numel) - outputs["vit_position_ids"].append(np.arange(numel) % numel) - - def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None: - img, meta = img_cache - num_tokens = img.shape[0] // self.image_processor.merge_size**2 - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - - _, h, w = meta["thw"] - pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["images"].append(img) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[1, h, w]])) - outputs["image_type_ids"].append(0) - - outputs["fps"].append(0) - - def _add_video(self, frames, meta: Dict, outputs: Dict, uuid: Optional[str]) -> None: - """ - Add video data to model inputs dictionary. - - Args: - frames: Video frames as numpy array - meta: Video metadata containing fps/duration - outputs: Dictionary accumulating model inputs - - Note: - - Handles temporal dimension in position embeddings - - Uses video-specific token IDs and type markers - """ - ret = self.image_processor.preprocess(images=frames) - - num_tokens = ret["image_grid_thw"].prod() // self.image_processor.merge_size**2 - grid_thw = ret["image_grid_thw"].tolist() - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.video_token_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["num_input_video_tokens"] += int(num_tokens) - - outputs["images"].append(ret["pixel_values"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(grid_thw) - outputs["image_type_ids"].extend([1] * grid_thw[0]) - - fps = meta["fps"] - second_per_grid_t = self.temporal_conv_size / fps - t, h, w = grid_thw - pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t) - - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - outputs["fps"].append(fps) - numel = h * w - outputs["vit_seqlen"].append(numel) - outputs["vit_position_ids"].append(np.arange(numel) % numel) - - def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None: - frames, meta = frames_cache - num_tokens = frames.shape[0] // self.image_processor.merge_size**2 - - t, h, w = meta["thw"] - outputs["images"].append(frames) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[t, h, w]])) - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["image_type_ids"].extend([1] * t) - - fps = meta["fps"] - second_per_grid_t = self.temporal_conv_size / fps - pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["fps"].append(fps) - - def _compute_vision_positions( - self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float - ) -> np.ndarray: - """ - Generate 3D position IDs for visual inputs. - - Args: - start_pos: Base position in sequence - t: Temporal patches (1 for images) - h: Height in patches - w: Width in patches - second_per_grid_t: Time per temporal patch - - Returns: - np.ndarray: Position IDs for [t,h,w] dimensions - """ - h //= self.spatial_conv_size - w //= self.spatial_conv_size - - tn = np.arange(t).reshape(-1, 1) - tn = np.broadcast_to(tn, (t, h * w)) - tn = tn * int(second_per_grid_t) * self.tokens_per_second - t_index = tn.flatten() - - hn = np.arange(h).reshape(1, -1, 1) - h_index = np.broadcast_to(hn, (t, h, w)).flatten() - - wn = np.arange(w).reshape(1, 1, -1) - w_index = np.broadcast_to(wn, (t, h, w)).flatten() - - position = np.stack([t_index, h_index, w_index]) + start_pos - return position - - def _load_and_process_video(self, url: str, item: Dict) -> Tuple[np.ndarray, Dict]: - """ - Load and preprocess video into frames. - - Args: - url: Video file path or bytes - item: Dictionary containing processing parameters - - Returns: - tuple: (frames, metadata) where: - - frames: Processed video frames as numpy array - - metadata: Updated video metadata dictionary - """ - reader, meta, _ = read_video_decord(url, save_to_disk=False) - - # Apply frame sampling if fps or target_frames specified - fps = item.get("fps", self.fps) - num_frames = item.get("target_frames", self.target_frames) - - frame_indices = list(range(meta["num_of_frame"])) - if fps > 0 or num_frames > 0: - # Get frame sampling constraints - min_frames = item.get("min_frames", self.min_frames) - max_frames = item.get("max_frames", self.max_frames) - - # Sample frames according to specifications - frame_indices = sample_frames( - frame_factor=self.temporal_conv_size, # Ensure divisible by temporal patch size - min_frames=min_frames, - max_frames=max_frames, - metadata=meta, - fps=fps, - num_frames=num_frames, - ) - - # Update metadata with new frame count and fps - meta["num_of_frame"] = len(frame_indices) - if fps is not None: - meta["fps"] = fps # Use specified fps - meta["duration"] = len(frame_indices) / fps - else: - meta["fps"] = len(frame_indices) / meta["duration"] # Calculate fps from sampled frames - - frames = [] - for idx in frame_indices: - frame = reader[idx].asnumpy() - image = Image.fromarray(frame, "RGB") - frames.append(image) - frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) - - return frames, meta - - def get_processor_cache(self, socket, mm_hashes: list[str]) -> list: - """ - get cache correspond to given hash values - """ - req = pickle.dumps(mm_hashes) - socket.send_multipart([b"", req]) - _, resp = socket.recv_multipart() - mm_items = pickle.loads(resp) - data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}") - - return mm_items - - def update_processor_cache(self, socket, mm_hashes: list[str], mm_items): - """ - update cache data - """ - req = pickle.dumps((mm_hashes, mm_items)) - socket.send_multipart([b"", req]) - data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}") - - def apply_chat_template(self, request): - """ - Apply chat template to convert messages into token sequence. - - Args: - request: Dictionary containing chat messages - - Returns: - List of token IDs - - Raises: - ValueError: If model doesn't support chat templates - """ - if self.tokenizer.chat_template is None: - raise ValueError("This model does not support chat_template.") - - raw_prompt = self.tokenizer.apply_chat_template( - request["messages"], - tokenize=False, - add_generation_prompt=request.get("add_generation_prompt", True), - chat_template=request.get("chat_template", None), - ) - prompt_token_str = raw_prompt.replace(self.image_token, "").replace(self.video_token, "") - request["text_after_process"] = raw_prompt - - tokens = self.tokenizer.tokenize(prompt_token_str) - token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - data_processor_logger.info( - f"req_id:{request.get('request_id', ''), } prompt: {raw_prompt} tokens: {tokens}, token_ids: {token_ids}" - ) - return token_ids diff --git a/fastdeploy/input/v1/paddleocr_vl_processor/process_video.py b/fastdeploy/input/v1/paddleocr_vl_processor/process_video.py deleted file mode 100644 index c7089d26dc2..00000000000 --- a/fastdeploy/input/v1/paddleocr_vl_processor/process_video.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import math -from typing import Optional, Union - -import numpy as np - - -def sample_frames( - frame_factor: int, - min_frames: int, - max_frames: int, - metadata: Optional[dict] = None, - fps: Optional[Union[int, float]] = None, - num_frames: Optional[int] = None, -): - """ - Sample frames from video according to specified criteria. - - Args: - frame_factor: Ensure sampled frames are multiples of this factor - min_frames: Minimum number of frames to sample - max_frames: Maximum number of frames to sample - metadata: Video metadata containing fps information - fps: Target frames per second for sampling - num_frames: Exact number of frames to sample - - Returns: - np.ndarray: Sampled video frames - - Raises: - ValueError: If both fps and num_frames are specified, - or if required metadata is missing, - or if requested frames exceed available frames - """ - if fps > 0 and num_frames > 0: - raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!") - - total_num_frames = metadata["num_of_frame"] - - # If num_frames is not given but fps is, calculate num_frames from fps - if num_frames > 0: - num_frames = round(num_frames / frame_factor) * frame_factor - elif fps > 0: - if metadata is None: - raise ValueError( - "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. " - "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video" - ) - max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor - num_frames = total_num_frames / metadata["fps"] * fps - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - num_frames = math.floor(num_frames / frame_factor) * frame_factor - if num_frames > total_num_frames: - raise ValueError( - f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. " - "Decrease `num_frames` or `fps` for sampling." - ) - - # Calculate frame indices based on sampling strategy - if num_frames > 0: - # Evenly spaced sampling for target frame count - indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32) - else: - # Keep all frames if no sampling requested - indices = np.arange(0, total_num_frames).astype(np.int32) - - return indices diff --git a/fastdeploy/input/v1/qwen3_vl_processor/__init__.py b/fastdeploy/input/v1/qwen3_vl_processor/__init__.py deleted file mode 100644 index 9f959610c8d..00000000000 --- a/fastdeploy/input/v1/qwen3_vl_processor/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from .process import DataProcessor -from .qwen3_vl_processor import Qwen3VLProcessor - -__all__ = [ - "DataProcessor", - "Qwen3VLProcessor", -] diff --git a/fastdeploy/input/v1/qwen3_vl_processor/image_processor.py b/fastdeploy/input/v1/qwen3_vl_processor/image_processor.py deleted file mode 100644 index 167f3e340db..00000000000 --- a/fastdeploy/input/v1/qwen3_vl_processor/image_processor.py +++ /dev/null @@ -1,413 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import math -from typing import List, Optional, Union - -import numpy as np -import paddle -import PIL -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_transforms import ( - normalize, - rescale, - resize, - to_channel_dimension_format, -) -from paddleformers.transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_images, -) -from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType -from PIL import Image - -from fastdeploy.utils import data_processor_logger - -IMAGE_MEAN = [0.5, 0.5, 0.5] -IMAGE_STD = [0.5, 0.5, 0.5] - -MIN_PIXELS = 65536 -MAX_PIXELS = 16777216 - - -VideoInput = Union[ - List["PIL.Image.Image"], - "np.ndarray", - "paddle.Tensor", - List["np.ndarray"], - List["paddle.Tensor"], - List[List["PIL.Image.Image"]], - List[List["np.ndarray"]], - List[List["paddle.Tensor"]], -] - - -def round_by_factor(number: int, factor: int) -> int: - return round(number / factor) * factor - - -def ceil_by_factor(number: int, factor: int) -> int: - return math.ceil(number / factor) * factor - - -def floor_by_factor(number: int, factor: int) -> int: - return math.floor(number / factor) * factor - - -def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200): - """ - Smart image resizing that maintains aspect ratio and respects constraints. - - Args: - height: Original image height - width: Original image width - factor: Patch size factor - min_pixels: Minimum allowed pixels - max_pixels: Maximum allowed pixels - max_ratio: Maximum allowed aspect ratio - - Returns: - tuple: (new_height, new_width) - - Raises: - ValueError: If calculated dimensions are invalid - """ - if max(height, width) / min(height, width) > max_ratio: - if height > width: - new_width = max(factor, round_by_factor(width, factor)) - new_height = floor_by_factor(new_width * max_ratio, factor) - else: - new_height = max(factor, round_by_factor(height, factor)) - new_width = floor_by_factor(new_height * max_ratio, factor) - - data_processor_logger.info( - f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\ - resize to {max(new_height, new_width) / min(new_height, new_width)}" - ) - - height = new_height - width = new_width - - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - - if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: - raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") - - return h_bar, w_bar - - -def is_scaled_image(image: np.ndarray) -> bool: - """ - Check if image pixel values are already normalized to [0, 1] range. - - Args: - image: Input image array - - Returns: - bool: True if image is already scaled - """ - if image.dtype == np.uint8: - return False - - # It's possible the image has pixel values in [0, 255] but is of floating type - return np.min(image) >= 0 and np.max(image) <= 1 - - -class ImageProcessor(BaseImageProcessor): - """ - Adaptive image processor for dynamic image resizing and preprocessing. - - This processor handles image resizing, rescaling, normalization and format conversion. - It dynamically adjusts image dimensions based on original size and specified constraints. - """ - - def __init__( - self, - patch_size: int = 16, - merge_size: int = 2, - temporal_patch_size: int = 2, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - image_mean: Union[float, List[float]] = IMAGE_MEAN, - image_std: Union[float, List[float]] = IMAGE_STD, - rescale_factor: float = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - **kwargs, - ) -> None: - """ - Initialize image processor with configuration parameters. - - Args: - patch_size (int): Spatial patch size for vision encoder - merge_size (int): Merge size between vision and LLM encoders - temporal_patch_size (int): Temporal patch size for video processing - min_pixels (int): Minimum allowed pixels in resized image - max_pixels (int): Maximum allowed pixels in resized image - image_mean (float/list): Mean values for normalization per channel - image_std (float/list): Std values for normalization per channel - rescale_factor (float): Scaling factor for pixel values (default 1/255) - do_rescale (bool): Whether to rescale images - do_normalize (bool): Whether to normalize images - resample: Resampling method for image resizing - **kwargs: Additional base class arguments - """ - super().__init__(**kwargs) - self.patch_size = patch_size - self.merge_size = merge_size - self.temporal_patch_size = temporal_patch_size - - self.min_pixels = min_pixels - self.max_pixels = max_pixels - - self.image_mean = image_mean - self.image_std = image_std - self.rescale_factor = rescale_factor - self.do_rescale = do_rescale - self.do_normalize = do_normalize - - self.resample = resample - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: int, - max_pixels: int, - image_mean: Optional[Union[float, List[float]]], - image_std: Optional[Union[float, List[float]]], - rescale_factor: float, - do_rescale: bool, - do_normalize: bool, - resample: PILImageResampling, - data_format: Optional[ChannelDimension], - input_data_format: Optional[Union[str, ChannelDimension]], - ): - """ - Internal method for image preprocessing pipeline. - - Args: - images: Input image or batch of images - min_pixels: Minimum allowed pixels in output - max_pixels: Maximum allowed pixels in output - image_mean: Normalization mean values - image_std: Normalization std values - rescale_factor: Pixel value scaling factor - do_rescale: Whether to rescale pixel values - do_normalize: Whether to normalize pixel values - resample: Resampling method - data_format: Output channel format - input_data_format: Input channel format - - Returns: - tuple: (flatten_patches, grid_dimensions) - - flatten_patches: Flattened image patches - - grid_dimensions: Grid dimensions [t, h, w] - """ - images = make_list_of_images(images) - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - data_processor_logger.warning( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - # Get original dimensions and calculate optimal resize dimensions - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, # Combine patch and merge factors - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - - processed_images = [] - for image in images: - if height != resized_height or width != resized_width: - # Convert to uint8 before resizing to avoid double scaling - image = image.astype("uint8") - # Convert to PIL Image and resize - image = Image.fromarray(image) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - data_format=input_data_format, - ) - - if do_rescale and do_normalize: - # Adjust mean and std for combined rescale+normalize - image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor) - image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor) - do_rescale = False # Skip separate rescale step - - # mutual exclusion and upper branch - if do_rescale: - image = image.astype(np.float32) - image = rescale(image, scale=rescale_factor, data_format=input_data_format) - - if do_normalize: - image = image.astype(np.float32) - image = normalize( - image=image, - mean=image_mean, - std=image_std, - data_format=input_data_format, - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] - processed_images.append(image) - - # Convert processed images to numpy array - patches = np.array(processed_images) - - # Pad temporal dimension if needed - if patches.shape[0] % self.temporal_patch_size != 0: - repeats = np.repeat( - patches[-1][np.newaxis], - self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size), - axis=0, - ) - patches = np.concatenate([patches, repeats], axis=0) - - # Convert to channels-first format if needed - if data_format == ChannelDimension.LAST: - patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W] - - grid_t, channel = patches.shape[:2] - grid_t = grid_t // self.temporal_patch_size - - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - # Reshape into hierarchical patch structure - patches = patches.reshape( - [ - grid_t, - self.temporal_patch_size, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ] - ) - # Reorder dimensions for better memory access pattern - # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz] - patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8]) - - flatten_patches = patches.reshape( - [ - grid_t * grid_h * grid_w, - channel * self.temporal_patch_size * self.patch_size * self.patch_size, - ] - ) - - return flatten_patches, np.array([grid_t, grid_h, grid_w]) - - def preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - rescale_factor: Optional[float] = None, - do_rescale: Optional[bool] = None, - do_normalize: Optional[bool] = None, - resample: Optional[PILImageResampling] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST, - ): - """ - Main preprocessing method for images/videos. - - Args: - images: Input image/video data - min_pixels: Override for minimum pixels - max_pixels: Override for maximum pixels - image_mean: Override for normalization mean - image_std: Override for normalization std - rescale_factor: Override for rescaling factor - do_rescale: Override for rescaling flag - do_normalize: Override for normalization flag - resample: Override for resampling method - return_tensors: Desired output tensor format - data_format: Output channel dimension format - input_data_format: Input channel dimension format - - Returns: - BatchFeature: Processed features containing: - - pixel_values: Preprocessed pixel data - - grid_thw: Grid dimensions [temporal, height, width] - - Raises: - ValueError: For invalid image types or dimensions - """ - min_pixels = min_pixels if min_pixels is not None else self.min_pixels - max_pixels = max_pixels if max_pixels is not None else self.max_pixels - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - resample = resample if resample is not None else self.resample - - if images is not None and not valid_images(images): - raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") - - pixel_values, grid_thw = self._preprocess( - images, - min_pixels=min_pixels, - max_pixels=max_pixels, - image_mean=image_mean, - image_std=image_std, - rescale_factor=rescale_factor, - do_rescale=do_rescale, - do_normalize=do_normalize, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - ) - data = {"pixel_values": pixel_values, "grid_thw": grid_thw} - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/v1/qwen3_vl_processor/process.py b/fastdeploy/input/v1/qwen3_vl_processor/process.py deleted file mode 100644 index 0731d04972d..00000000000 --- a/fastdeploy/input/v1/qwen3_vl_processor/process.py +++ /dev/null @@ -1,814 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import pickle -from typing import Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import zmq -from paddleformers.transformers import AutoTokenizer -from PIL import Image - -from fastdeploy.engine.request import ImagePosition, Request -from fastdeploy.entrypoints.chat_utils import parse_chat_messages -from fastdeploy.input.ernie4_5_vl_processor import read_video_decord -from fastdeploy.input.mm_data_processor import MMBaseDataProcessor -from fastdeploy.input.utils import IDS_TYPE_FLAG -from fastdeploy.multimodal.hasher import MultimodalHasher -from fastdeploy.utils import data_processor_logger - -from .image_processor import ImageProcessor, ceil_by_factor, floor_by_factor - -VIDEO_MIN_PIXELS = 128 * 28 * 28 -VIDEO_MAX_PIXELS = 768 * 28 * 28 -FRAME_FACTOR = 2 -FPS = 2.0 -FPS_MIN_FRAMES = 4 -FPS_MAX_FRAMES = 768 - - -def sample_frames( - frame_factor: int, - min_frames: int, - max_frames: int, - metadata: Optional[dict] = None, - fps: Optional[Union[int, float]] = -1, - num_frames: Optional[int] = -1, -): - """ - Sample frames from video according to specified criteria. - - Args: - frame_factor: Ensure sampled frames are multiples of this factor - min_frames: Minimum number of frames to sample - max_frames: Maximum number of frames to sample - metadata: Video metadata containing fps information - fps: Target frames per second for sampling - num_frames: Exact number of frames to sample - - Returns: - np.ndarray: Sampled video frames - - Raises: - ValueError: If both fps and num_frames are specified, - or if required metadata is missing, - or if requested frames exceed available frames - """ - if fps > 0 and num_frames > 0: - raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!") - - total_num_frames = metadata["num_of_frame"] - - # If num_frames is not given but fps is, calculate num_frames from fps - if num_frames > 0: - num_frames = round(num_frames / frame_factor) * frame_factor - elif fps > 0: - if metadata is None: - raise ValueError( - "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. " - "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video" - ) - # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor - min_frames = ceil_by_factor(min_frames, frame_factor) - max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor) - - num_frames = total_num_frames / metadata["fps"] * fps - - if num_frames > total_num_frames: - data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]") - - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - num_frames = floor_by_factor(num_frames, frame_factor) - - if num_frames > total_num_frames: - raise ValueError( - f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. " - "Decrease `num_frames` or `fps` for sampling." - ) - - # Hack code ensures that num_frames can always be divided by 4 - # due to sched/resource_manager_v1.py 中 grid_thw.extend([[2, h, w]] * (t // 2)) - if num_frames > 2 and num_frames % 4 != 0: - num_frames = (num_frames // 4) * 4 # 向下取整到 4 的倍数 - total_num_frames = (total_num_frames // 4) * 4 - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - - # Calculate frame indices based on sampling strategy - if num_frames > 0: - # Evenly spaced sampling for target frame count - indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32) - else: - # Keep all frames if no sampling requested - indices = np.arange(0, total_num_frames).astype(np.int32) - - return indices - - -class DataProcessor(MMBaseDataProcessor): - """ - Processes multimodal inputs (text, images, videos) into model-ready formats. - - Handles: - - Tokenization of text with special tokens for visual content - - Image and video preprocessing - - Generation of 3D positional embeddings - - Conversion of chat messages to model inputs - - Attributes: - tokenizer: Text tokenizer instance - image_processor: Image/video preprocessor - image_token: Special token for image placeholders - video_token: Special token for video placeholders - vision_start: Token marking start of visual content - """ - - def __init__( - self, - model_path: str, - enable_processor_cache: bool = False, - video_min_frames: int = FPS_MIN_FRAMES, - video_max_frames: int = FPS_MAX_FRAMES, - video_target_frames: int = -1, - video_fps: int = FPS, - tokens_per_second: int = 2, - tokenizer=None, - **kwargs, - ) -> None: - """ - Initialize the data processor. - - Args: - model_path: Path to pretrained model - video_min_frames: Minimum frames to sample from videos - video_max_frames: Maximum frames to sample from videos - tokens_per_second: Temporal resolution for positional embeddings - **kwargs: Additional configuration - """ - super().__init__() - self.min_frames = video_min_frames - self.max_frames = video_max_frames - self.target_frames = video_target_frames - self.fps = video_fps - self.frame_factor = FRAME_FACTOR - - # Initialize tokenizer with left padding and fast tokenizer - if tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True) - self.tokenizer.ignored_index = -100 # Set ignored index for loss calculation - else: - self.tokenizer = tokenizer - - self.image_processor = ImageProcessor.from_pretrained(model_path) # Initialize image processor - self.enable_processor_cache = enable_processor_cache - - # Convolution sizes for patch aggregation - self.spatial_conv_size = self.image_processor.merge_size - self.temporal_conv_size = self.image_processor.temporal_patch_size - - # Special tokens and IDs - self.image_token = "<|image_pad|>" - self.video_token = "<|video_pad|>" - - self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token) - self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token) - - self.vision_start = "<|vision_start|>" - self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start) - - self.tokens_per_second = tokens_per_second - - self.role_prefixes = { - "system": "", - "user": "User: ", - "bot": "Assistant: ", - "assistant": "Assistant: ", - } - - @staticmethod - def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]: - """ - Calculate the number of tokens in the multimodal input. - """ - if isinstance(grid_thw, paddle.Tensor): - grid_thw = grid_thw.numpy() - - if len(grid_thw) == 0: - return 0 - - def calc_one(thw): - t, h, w = map(int, thw) - return t * h * w // 4 - - if isinstance(grid_thw[0], (list, tuple, np.ndarray)): - return [calc_one(x) for x in grid_thw] - - return calc_one(grid_thw) - - def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None): - """ - Convert text with image/video placeholders into model inputs. - - Args: - text: Input text with <|image@placeholder|> and <|video@placeholder|> markers - images: List of PIL Images corresponding to image placeholders - videos: List of video data corresponding to video placeholders - image_uuid: List of unique identifiers for each image, used for caching or hashing. - video_uuid: List of unique identifiers for each video, used for caching or hashing. - - Returns: - Dict containing: - - input_ids: Token IDs - - token_type_ids: Type identifiers (text/image/video) - - position_ids: 3D positional embeddings - - images: Preprocessed visual features - - grid_thw: Spatial/temporal dimensions - - image_type_ids: Visual content type (0=image, 1=video) - """ - - outputs = { - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "images": [], - "grid_thw": [], - "image_type_ids": [], - "labels": [], - "cur_position": 0, - "video_cnt": 0, - "num_input_image_tokens": 0, - "num_input_video_tokens": 0, - "fps": [], - "mm_positions": [], - "mm_hashes": [], - } - - # Define placeholders and their lengths - IMAGE_PLACEHOLDER = "<|image_pad|>" - VIDEO_PLACEHOLDER = "<|video_pad|>" - IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER) - VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER) - - # Initialize tracking variables for text parsing - st, image_idx, video_idx = 0, 0, 0 # Start position, image counter, video counter - while st < len(text): - # Find next image or video placeholder in text - image_pos = text.find(IMAGE_PLACEHOLDER, st) - image_pos = len(text) if image_pos == -1 else image_pos # Set to end if not found - video_pos = text.find(VIDEO_PLACEHOLDER, st) - video_pos = len(text) if video_pos == -1 else video_pos # Set to end if not found - ed = min(image_pos, video_pos) # End position is first placeholder found - - self._add_text(text[st:ed], outputs) - if ed == len(text): - break - - if ed == image_pos: - image = images[image_idx] - uuid = image_uuid[image_idx] if image_uuid else None - if not isinstance(image, tuple): - self._add_image(image, outputs, uuid) - else: - self._add_processed_image(image, outputs, uuid) - image_idx += 1 - st = ed + IMAGE_PLACEHOLDER_LEN - else: - item = videos[video_idx] - uuid = video_uuid[video_idx] if video_uuid else None - if not isinstance(item, tuple): - if isinstance(item, dict): - frames, meta = self._load_and_process_video(item["video"], item) - else: - frames, meta = self._load_and_process_video(item, {}) - self._add_video(frames, meta, outputs, uuid) - else: - # cached frames are already processed - self._add_processed_video(item, outputs, uuid) - video_idx += 1 - st = ed + VIDEO_PLACEHOLDER_LEN - - return outputs - - def prompt_token_ids2outputs( - self, request: Request, tgts: List[str] = None - ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]: - outputs = { - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "images": [], - "grid_thw": [], - "image_type_ids": [], - "labels": [], - "cur_position": 0, - "video_cnt": 0, - "num_input_image_tokens": 0, - "num_input_video_tokens": 0, - "fps": [], - "mm_positions": [], - "mm_hashes": [], - } - prompt_token_ids = request.prompt_token_ids if request.prompt_token_ids else [] - prompt_token_ids_len = len(prompt_token_ids) - - if not request.messages: - self._add_text(prompt_token_ids, outputs) - return outputs - - messages = parse_chat_messages(request.messages) - mm_items = [] - for msg in messages: - role = msg.get("role") - assert role in self.role_prefixes, f"Unsupported role: {role}" - - content = msg.get("content") - if not isinstance(content, list): - content = [content] - for item in content: - if item.get("type") in ["image", "video"]: - mm_items.append(item) - - missing_hashes, missing_idx = [], [] - for idx, item in enumerate(mm_items): - if not item.get("data"): - missing_hashes.append(item.get("uuid")) - missing_idx.append(idx) - - if len(missing_hashes) > 0 and not self.enable_processor_cache: - raise ValueError("Missing items cannot be retrieved without processor cache.") - - dealer = None - if self.enable_processor_cache: - context = zmq.Context() - dealer = context.socket(zmq.DEALER) - dealer.connect("ipc:///dev/shm/processor_cache.ipc") - - missing_items = self.get_processor_cache(dealer, missing_hashes) - for idx in range(len(missing_items)): - if not missing_items[idx]: - raise ValueError(f"Missing item {idx} not found in processor cache") - mm_items[missing_idx[idx]]["data"] = missing_items[idx] - - st, mm_idx = 0, 0 - while st < prompt_token_ids_len: - if prompt_token_ids[st] != self.image_token_id: - cur_idx = st - while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_token_id: - cur_idx += 1 - self._add_text(prompt_token_ids[st:cur_idx], outputs) - st = cur_idx - continue - - if mm_idx >= len(mm_items): - raise ValueError("prompt token ids has more multimodal placeholder than in messages") - - cur_idx = st - while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] == self.image_token_id: - cur_idx += 1 - - item = mm_items[mm_idx] - uuid = item.get("uuid") - token_len = cur_idx - st - if item.get("type") == "image": - image = item.get("data") - if not isinstance(image, tuple): - self._add_image(image, outputs, uuid, token_len) - else: - self._add_processed_image(image, outputs, uuid, token_len) - elif item.get("type") == "video": - video = item.get("data") - if not isinstance(video, tuple): - if isinstance(video, dict): - frames, meta = self._load_and_process_video(video["video"], video) - else: - frames, meta = self._load_and_process_video(video, {}) - self._add_video(frames, meta, outputs, uuid, token_len) - else: - self._add_processed_video(video, outputs, uuid, token_len) - else: - raise ValueError(f"Unsupported multimodal type: {item.get('type')}") - mm_idx += 1 - st = cur_idx - - if mm_idx != len(mm_items): - raise ValueError("number of multimodal items does not match prompt token ids") - - if self.enable_processor_cache: - missing_idx = set(missing_idx) - hashes_to_cache, items_to_cache = [], [] - for idx in range(len(mm_items)): - if idx in missing_idx: - continue - meta = {} - grid_thw = np.asarray(outputs["grid_thw"][idx]) - if grid_thw.ndim > 1: - t, h, w = grid_thw[0] - else: - t, h, w = grid_thw - meta["thw"] = (int(t), int(h), int(w)) - meta["fps"] = outputs["fps"][idx] - hashes_to_cache.append(outputs["mm_hashes"][idx]) - items_to_cache.append((outputs["images"][idx], meta)) - if hashes_to_cache: - self.update_processor_cache(dealer, hashes_to_cache, items_to_cache) - - return outputs - - def request2ids( - self, request: Request, tgts: List[str] = None - ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]: - """ - Convert chat request with multimodal messages into model inputs. - - Args: - request: Request containing: - - messages: List of chat messages with text/image/video content - - request_id: Unique identifier for logging - tgts: Optional target sequences - - Returns: - Dict with same structure as text2ids() output - """ - - messages = parse_chat_messages(request.messages) - mm_items = [] - for msg in messages: - role = msg.get("role") - assert role in self.role_prefixes, f"Unsupported role: {role}" - - content = msg.get("content") - if not isinstance(content, list): - content = [content] - for item in content: - if item.get("type") in ["image", "video"]: - mm_items.append(item) - - missing_hashes, missing_idx = [], [] - for idx, item in enumerate(mm_items): - if not item.get("data"): - missing_hashes.append(item.get("uuid")) - missing_idx.append(idx) - - if len(missing_hashes) > 0 and not self.enable_processor_cache: - raise ValueError("Missing items cannot be retrieved without processor cache.") - - if self.enable_processor_cache: - context = zmq.Context() - dealer = context.socket(zmq.DEALER) - dealer.connect("ipc:///dev/shm/processor_cache.ipc") - - missing_items = self.get_processor_cache(dealer, missing_hashes) - for idx in range(len(missing_items)): - if not missing_items[idx]: - raise ValueError(f"Missing item {idx} not found in processor cache") - mm_items[missing_idx[idx]]["data"] = missing_items[idx] - - images, videos = [], [] - image_uuid, video_uuid = [], [] - for item in mm_items: - if item.get("type") == "image": - images.append(item["data"]) - image_uuid.append(item["uuid"]) - elif item.get("type") == "video": - videos.append(item["data"]) - video_uuid.append(item["uuid"]) - else: - raise ValueError(f"Unsupported multimodal type: {item.get('type')}") - - if self.tokenizer.chat_template is None: - raise ValueError("This model does not support chat template.") - - chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {} - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True, - **chat_template_kwargs, - ) - request.prompt_tokens = prompt - - outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid) - - if self.enable_processor_cache: - missing_idx = set(missing_idx) - hashes_to_cache, items_to_cache = [], [] - for idx in range(len(mm_items)): - if idx in missing_idx: - continue - meta = {} - grid_thw = np.asarray(outputs["grid_thw"][idx]) - if grid_thw.ndim > 1: - t, h, w = grid_thw[0] - else: - t, h, w = grid_thw - meta["thw"] = (int(t), int(h), int(w)) - meta["fps"] = outputs["fps"][idx] - hashes_to_cache.append(outputs["mm_hashes"][idx]) - items_to_cache.append((outputs["images"][idx], meta)) - if hashes_to_cache: - self.update_processor_cache(dealer, hashes_to_cache, items_to_cache) - - return outputs - - def _add_text(self, tokens, outputs: Dict) -> None: - """ - Add text tokens to model inputs dictionary. - - Args: - tokens: Text string or already tokenized IDs - outputs: Dictionary accumulating model inputs - - Note: - - Handles both raw text and pre-tokenized inputs - - Updates position IDs for 3D embeddings - """ - if not tokens: - return None - - if isinstance(tokens, str): - tokens_str = self.tokenizer.tokenize(tokens) - tokens = self.tokenizer.convert_tokens_to_ids(tokens_str) - - num_tokens = len(tokens) - outputs["input_ids"].extend(tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens) - - pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray: - """ - Generate 3D positional embeddings for text tokens. - - Args: - start_pos: Starting position index - num_tokens: Number of tokens to generate positions for - - Returns: - numpy.ndarray: 3D position IDs shaped (3, num_tokens) - """ - text_array = np.arange(num_tokens).reshape(1, -1) - text_index = np.broadcast_to(text_array, (3, num_tokens)) - position = text_index + start_pos - return position - - def _add_image(self, img, outputs: Dict, uuid: Optional[str], token_len: Optional[int] = None) -> None: - """ - Add image data to model inputs dictionary. - - Args: - img: PIL Image to process - outputs: Dictionary accumulating model inputs - - Note: - - Preprocesses image and calculates spatial dimensions - - Adds image token IDs and type markers - - Generates appropriate position embeddings - """ - ret = self.image_processor.preprocess(images=[img.convert("RGB")]) - num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2 - grid_thw = ret["grid_thw"].tolist() - if token_len is not None and token_len != num_tokens: - raise ValueError("image tokens num not match the size") - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_token_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - outputs["num_input_image_tokens"] += int(num_tokens) - - outputs["images"].append(ret["pixel_values"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(grid_thw) - outputs["image_type_ids"].append(0) - - t, h, w = grid_thw - pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0) - - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["fps"].append(0) - - def _add_processed_image( - self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: Optional[int] = None - ) -> None: - img, meta = img_cache - num_tokens = img.shape[0] // self.image_processor.merge_size**2 - if token_len is not None and token_len != num_tokens: - raise ValueError("image tokens num not match the size") - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_token_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - - _, h, w = meta["thw"] - pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["images"].append(img) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[1, h, w]])) - outputs["image_type_ids"].append(0) - - outputs["fps"].append(0) - - def _add_video( - self, frames, meta: Dict, outputs: Dict, uuid: Optional[str], token_len: Optional[int] = None - ) -> None: - """ - Add video data to model inputs dictionary. - - Args: - frames: Video frames as numpy array - meta: Video metadata containing fps/duration - outputs: Dictionary accumulating model inputs - - Note: - - Handles temporal dimension in position embeddings - - Uses video-specific token IDs and type markers - """ - ret = self.image_processor.preprocess( - images=frames, - min_pixels=VIDEO_MIN_PIXELS, - max_pixels=VIDEO_MAX_PIXELS, - ) - - num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2 - grid_thw = ret["grid_thw"].tolist() - if token_len is not None and token_len != num_tokens: - raise ValueError("video tokens num not match the size") - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - # Hack code. In order to adapt to the framework, only image_token can be passed - # The correct way should be to use [self.video_token_id] * num_tokens - outputs["input_ids"].extend([self.image_token_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["num_input_video_tokens"] += int(num_tokens) - - outputs["images"].append(ret["pixel_values"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(grid_thw) - outputs["image_type_ids"].extend([1] * grid_thw[0]) - - fps = meta["fps"] - second_per_grid_t = self.temporal_conv_size / fps - t, h, w = grid_thw - pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t) - - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["fps"].append(fps) - - def _add_processed_video( - self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str, token_len: Optional[int] = None - ) -> None: - frames, meta = frames_cache - num_tokens = frames.shape[0] // self.image_processor.merge_size**2 - if token_len is not None and token_len != num_tokens: - raise ValueError("video tokens num not match the size") - - t, h, w = meta["thw"] - outputs["images"].append(frames) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[t, h, w]])) - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_token_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["image_type_ids"].extend([1] * t) - - fps = meta["fps"] - second_per_grid_t = self.temporal_conv_size / fps - pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["fps"].append(fps) - - def _compute_vision_positions( - self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float - ) -> np.ndarray: - """ - Generate 3D position IDs for visual inputs. - - Args: - start_pos: Base position in sequence - t: Temporal patches (1 for images) - h: Height in patches - w: Width in patches - second_per_grid_t: Time per temporal patch - - Returns: - np.ndarray: Position IDs for [t,h,w] dimensions - """ - h //= self.spatial_conv_size - w //= self.spatial_conv_size - - tn = np.arange(t).reshape(-1, 1) - tn = np.broadcast_to(tn, (t, h * w)) - tn = tn * int(second_per_grid_t) * self.tokens_per_second - t_index = tn.flatten() - - hn = np.arange(h).reshape(1, -1, 1) - h_index = np.broadcast_to(hn, (t, h, w)).flatten() - - wn = np.arange(w).reshape(1, 1, -1) - w_index = np.broadcast_to(wn, (t, h, w)).flatten() - - position = np.stack([t_index, h_index, w_index]) + start_pos - return position - - def _load_and_process_video(self, url: str, item: Dict) -> Tuple[np.ndarray, Dict]: - """ - Load and preprocess video into frames. - - Args: - url: Video file path or bytes - item: Dictionary containing processing parameters - - Returns: - tuple: (frames, metadata) where: - - frames: Processed video frames as numpy array - - metadata: Updated video metadata dictionary - """ - reader, meta, _ = read_video_decord(url, save_to_disk=False) - - # Apply frame sampling if fps or target_frames specified - fps = item.get("fps", self.fps) - num_frames = item.get("target_frames", self.target_frames) - - frame_indices = list(range(meta["num_of_frame"])) - if fps > 0 or num_frames > 0: - # Get frame sampling constraints - min_frames = item.get("min_frames", self.min_frames) - max_frames = item.get("max_frames", self.max_frames) - - # Sample frames according to specifications - frame_indices = sample_frames( - frame_factor=self.frame_factor, # Ensure divisible by temporal patch size - min_frames=min_frames, - max_frames=max_frames, - metadata=meta, - fps=fps, - num_frames=num_frames, - ) - - # Update metadata with new frame count and fps - meta["num_of_frame"] = len(frame_indices) - if fps is not None: - meta["fps"] = fps # Use specified fps - meta["duration"] = len(frame_indices) / fps - else: - meta["fps"] = len(frame_indices) / meta["duration"] # Calculate fps from sampled frames - - frames = [] - for idx in frame_indices: - frame = reader[idx].asnumpy() - image = Image.fromarray(frame, "RGB") - frames.append(image) - frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) - - return frames, meta - - def get_processor_cache(self, socket, mm_hashes: list[str]) -> list: - """ - get cache correspond to given hash values - """ - req = pickle.dumps(mm_hashes) - socket.send_multipart([b"", req]) - _, resp = socket.recv_multipart() - mm_items = pickle.loads(resp) - data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}") - - return mm_items - - def update_processor_cache(self, socket, mm_hashes: list[str], mm_items): - """ - update cache data - """ - req = pickle.dumps((mm_hashes, mm_items)) - socket.send_multipart([b"", req]) - data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}") diff --git a/fastdeploy/input/v1/qwen3_vl_processor/qwen3_vl_processor.py b/fastdeploy/input/v1/qwen3_vl_processor/qwen3_vl_processor.py deleted file mode 100644 index c72de49bcc3..00000000000 --- a/fastdeploy/input/v1/qwen3_vl_processor/qwen3_vl_processor.py +++ /dev/null @@ -1,341 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import numpy as np - -from fastdeploy.engine.request import Request -from fastdeploy.input.v1.text_processor import DataProcessor as TextProcessor -from fastdeploy.utils import data_processor_logger - -from .process import DataProcessor - - -class Qwen3VLProcessor(TextProcessor): - """ - Qwen Vision-Language processor for handling multimodal inputs. - - This processor extends TextProcessor to support: - - Image and video processing - - Multimodal feature extraction - - Tokenization and position encoding - - Request processing and model input generation - - Attributes: - processor (DataProcessor): Underlying data processor instance - tokenizer: Text tokenizer instance - limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt - """ - - def __init__( - self, - config, - model_name_or_path, - limit_mm_per_prompt=None, - mm_processor_kwargs=None, - reasoning_parser_obj=None, - tool_parser_obj=None, - enable_processor_cache=False, - ): - """ - Initialize QwenVLProcessor instance. - - Args: - config: Model configuration object - model_name_or_path (str): Pretrained model name or path - limit_mm_per_prompt (dict, optional): Limits for multimodal inputs - mm_processor_kwargs (dict, optional): Multimodal processor arguments - reasoning_parser_obj: Reasoning parser instance - tool_parser_obj: Tool parser instance - """ - super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj) - - data_processor_logger.info(f"model_name_or_path: {model_name_or_path}") - processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs) - self.processor = DataProcessor( - model_path=model_name_or_path, - enable_processor_cache=enable_processor_cache, - # tokens_per_second=config.vision_config.tokens_per_second, - tokenizer=self.tokenizer, - **processor_kwargs, - ) - self.image_patch_id = self.processor.image_token_id - self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt) - - def _parse_processor_kwargs(self, kwargs): - """ - Parse and validate multimodal processor arguments. - - Args: - kwargs (dict): Processor configuration arguments - - Returns: - dict: Validated processor arguments - - Raises: - ValueError: If arguments format is invalid - """ - if not kwargs: - return {} - - try: - if not isinstance(kwargs, dict): - raise ValueError("mm-processor-kwargs must be a dictionary") - - # Validate kwargs types against expected schema - data_processor_logger.info(f"Processing kwargs: {kwargs}") - expected_types = { - "video_max_frames": int, # Maximum video frames parameter - "video_min_frames": int, # Minimum video frames parameter - } - - for key, value in kwargs.items(): - if key in expected_types and not isinstance(value, expected_types[key]): - raise ValueError( - f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}" - ) - - return kwargs - - except Exception as e: - data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}") - return {} - - def _parse_limits(self, limits): - """ - Parse and validate multimodal input limits. - - Args: - limits (dict): Input limits configuration - - Returns: - dict: Validated limits with defaults - - Raises: - ValueError: If limits format is invalid - """ - DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1} - - if not limits: - return DEFAULT_LIMITS - - try: - if not isinstance(limits, dict): - raise ValueError("limit-mm-per-prompt must be a dictionary") - data_processor_logger.info(f"_parse_limits:{limits}") - return {**DEFAULT_LIMITS, **limits} - except Exception as e: - data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits") - return DEFAULT_LIMITS - - def _check_mm_limits(self, item): - """ - Validate multimodal inputs against configured limits. - - Args: - item: Input request item to validate - - Raises: - ValueError: If input exceeds configured limits - """ - if isinstance(item, dict): - # 请求包含prompt和multi_modal_data - mm_data = item - else: - # 请求包含messages - mm_data = {"image": [], "video": []} - - for message in item: - if isinstance(message.get("content"), list): - for part in message["content"]: - if part.get("type") in ["image_url", "image"]: - mm_data["image"].append(part) - elif part.get("type") in ["video_url", "video"]: - mm_data["video"].append(part) - - for modality, data in mm_data.items(): - if modality in self.limit_mm_per_prompt: - limit = self.limit_mm_per_prompt[modality] - if len(data) > limit: - raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}") - - def process_request(self, request, max_model_len=None, **kwargs): - """ - Process incoming request and generate model inputs. - - Args: - request: Input request object - max_model_len (int, optional): Maximum context length - **kwargs: Additional processing parameters - - Returns: - Request: Processed request with model inputs - """ - task = request.to_dict() - task["enable_thinking"] = kwargs.get("enable_thinking", False) - self.process_request_dict(task, max_model_len) - request = Request.from_dict(task) - request = self._apply_default_parameters(request) - return request - - def process_request_dict(self, request, max_model_len=None, **kwargs): - """ - Process request dictionary into model inputs. - - Args: - request Request: Input request dictionary - max_model_len (int, optional): Maximum context length - - Returns: - Request: Processed request with model inputs - - Raises: - ValueError: If request format is invalid - """ - - request = self._apply_default_parameters(request) - if not request.eos_token_ids: - request.eos_token_ids = self.eos_token_ids - - stop_sequences = request.sampling_params.stop - if stop_sequences: - stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences) - request.sampling_params.stop_token_ids = stop_seqs - request.sampling_params.stop_seqs_len = stop_seqs_len - - bad_words = request.sampling_params.bad_words - bad_words_token_ids = request.sampling_params.bad_words_token_ids - if bad_words: - bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) - request.sampling_params.bad_words_token_ids = bad_words_token_ids - - if request.prompt_token_ids: - messages = request.messages - if messages: - self._check_mm_limits(messages) - if request.enable_thinking is None: - request.enable_thinking = kwargs.get("enable_thinking", False) - outputs = self.processor.prompt_token_ids2outputs(request) - - elif request.prompt: - multimodal_data = request.multimodal_data - if multimodal_data is None: - multimodal_data = {} - self._check_mm_limits(multimodal_data) - images = multimodal_data.get("image", None) - videos = multimodal_data.get("video", None) - outputs = self.processor.text2ids(request.prompt, images, videos) - - elif request.messages: - messages = request.messages - self._check_mm_limits(messages) - chat_template_kwargs = request.chat_template_kwargs - if chat_template_kwargs: - if isinstance(chat_template_kwargs, dict): - for k, v in chat_template_kwargs.items(): - if getattr(request, k, None) is None: - setattr(request, k, v) - else: - raise ValueError("Invalid input: chat_template_kwargs must be a dict") - if request.enable_thinking is None: - request.enable_thinking = kwargs.get("enable_thinking", False) - outputs = self.processor.request2ids(request) - delattr(request, "chat_template_kwargs") - else: - raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") - - # Handle continuation of previous generation by appending existing tokens - if request.completion_token_ids: - self.append_completion_tokens(outputs, request.completion_token_ids) - - # qwen25_vl not support thinking - request.enable_thinking = False - - outputs = self.pack_outputs(outputs) - - request.prompt_token_ids = ( - outputs["input_ids"].tolist() - if not getattr(request, "prompt_token_ids", None) - else request.prompt_token_ids - ) - request.prompt_token_ids_len = len(request.prompt_token_ids) - request.multimodal_inputs = outputs - - # Handle prompt truncation if exceeds model context length - if max_model_len is not None and len(request.prompt_token_ids) > max_model_len: - request.prompt_token_ids = request.prompt_token_ids[ - : max_model_len - 1 - ] # Leave space for at least 1 new token - - # Set default max_tokens if not specified - max_tokens = max_model_len - len(request.prompt_token_ids) - if getattr(request.sampling_params, "max_tokens", None) is None: - request.sampling_params.max_tokens = max(1, max_tokens) - else: - request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens) - data_processor_logger.info(f"Processed request {request}") - - return request - - def append_completion_tokens(self, multimodal_inputs, completion_token_ids): - """ - Append completion tokens to existing outputs. - - Args: - outputs: Current model outputs - completion_token_ids: completion tokens to append - """ - - num_tokens = len(completion_token_ids) - multimodal_inputs["input_ids"].extend(completion_token_ids) - multimodal_inputs["token_type_ids"].extend([0] * num_tokens) - - pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens) - multimodal_inputs["position_ids"].append(pos_ids) - multimodal_inputs["cur_position"] += num_tokens - - def pack_outputs(self, outputs): - """ - Prepare final output dictionary for model. - - Args: - outputs: Intermediate processing outputs - - Returns: - dict: Packed output dictionary with all required fields - """ - if not outputs["images"]: - outputs["images"] = None # No images case - outputs["grid_thw"] = None # No spatial dimensions - outputs["image_type_ids"] = None # No type IDs - else: - outputs["images"] = np.vstack(outputs["images"]) # Stack image features vertically - outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) # Stack spatial dimensions - outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) # Convert to numpy array - - # Convert all outputs to numpy arrays with appropriate types - outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) # Token IDs as int64 - outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) # Type IDs as int64 - outputs["position_ids"] = np.concatenate( - outputs["position_ids"], axis=1, dtype=np.int64 - ) # Concatenate position ID - - outputs["image_patch_id"] = self.processor.image_token_id - outputs["video_patch_id"] = self.processor.video_token_id - outputs["position_ids"] = outputs["position_ids"].transpose(1, 0) - - outputs["mm_num_token_func"] = self.processor.mm_num_tokens - - return outputs diff --git a/fastdeploy/input/v1/qwen_vl_processor/__init__.py b/fastdeploy/input/v1/qwen_vl_processor/__init__.py deleted file mode 100644 index c876cde7125..00000000000 --- a/fastdeploy/input/v1/qwen_vl_processor/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from .process import DataProcessor -from .qwen_vl_processor import QwenVLProcessor - -__all__ = [ - "DataProcessor", - "QwenVLProcessor", -] diff --git a/fastdeploy/input/v1/qwen_vl_processor/image_processor.py b/fastdeploy/input/v1/qwen_vl_processor/image_processor.py deleted file mode 100644 index b6a1db19bc5..00000000000 --- a/fastdeploy/input/v1/qwen_vl_processor/image_processor.py +++ /dev/null @@ -1,442 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import math -from typing import List, Optional, Union - -import numpy as np -import paddle -import PIL -from paddleformers.transformers.feature_extraction_utils import BatchFeature -from paddleformers.transformers.image_processing_utils import BaseImageProcessor -from paddleformers.transformers.image_transforms import ( - normalize, - rescale, - resize, - to_channel_dimension_format, -) -from paddleformers.transformers.image_utils import ( - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - make_list_of_images, - to_numpy_array, - valid_images, -) -from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType -from PIL import Image - -from fastdeploy.utils import data_processor_logger - -OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] -OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] - -MIN_PIXELS = 4 * 28 * 28 -MAX_PIXELS = 16384 * 28 * 28 - - -VideoInput = Union[ - List["PIL.Image.Image"], - "np.ndarray", - "paddle.Tensor", - List["np.ndarray"], - List["paddle.Tensor"], - List[List["PIL.Image.Image"]], - List[List["np.ndarray"]], - List[List["paddle.Tensor"]], -] - - -def round_by_factor(number: int, factor: int) -> int: - """ - Round number to nearest multiple of factor. - - Args: - number: Input number to round - factor: Rounding factor - - Returns: - int: Rounded number - """ - return round(number / factor) * factor - - -def ceil_by_factor(number: int, factor: int) -> int: - """ - Round number up to nearest multiple of factor. - - Args: - number: Input number to round - factor: Rounding factor - - Returns: - int: Rounded number - """ - return math.ceil(number / factor) * factor - - -def floor_by_factor(number: int, factor: int) -> int: - """ - Round number down to nearest multiple of factor. - - Args: - number: Input number to round - factor: Rounding factor - - Returns: - int: Rounded number - """ - return math.floor(number / factor) * factor - - -def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200): - """ - Smart image resizing that maintains aspect ratio and respects constraints. - - Args: - height: Original image height - width: Original image width - factor: Patch size factor - min_pixels: Minimum allowed pixels - max_pixels: Maximum allowed pixels - max_ratio: Maximum allowed aspect ratio - - Returns: - tuple: (new_height, new_width) - - Raises: - ValueError: If calculated dimensions are invalid - """ - if max(height, width) / min(height, width) > max_ratio: - if height > width: - new_width = max(factor, round_by_factor(width, factor)) - new_height = floor_by_factor(new_width * max_ratio, factor) - else: - new_height = max(factor, round_by_factor(height, factor)) - new_width = floor_by_factor(new_height * max_ratio, factor) - - data_processor_logger.info( - f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\ - resize to {max(new_height, new_width) / min(new_height, new_width)}" - ) - - height = new_height - width = new_width - - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - - if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: - raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") - - return h_bar, w_bar - - -def is_scaled_image(image: np.ndarray) -> bool: - """ - Check if image pixel values are already normalized to [0, 1] range. - - Args: - image: Input image array - - Returns: - bool: True if image is already scaled - """ - if image.dtype == np.uint8: - return False - - # It's possible the image has pixel values in [0, 255] but is of floating type - return np.min(image) >= 0 and np.max(image) <= 1 - - -class ImageProcessor(BaseImageProcessor): - """ - Adaptive image processor for dynamic image resizing and preprocessing. - - This processor handles image resizing, rescaling, normalization and format conversion. - It dynamically adjusts image dimensions based on original size and specified constraints. - """ - - def __init__( - self, - patch_size: int = 14, - merge_size: int = 2, - temporal_patch_size: int = 2, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, - image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN, - image_std: Union[float, List[float]] = OPENAI_CLIP_STD, - rescale_factor: float = 1 / 255, - do_rescale: bool = True, - do_normalize: bool = True, - resample: PILImageResampling = PILImageResampling.BICUBIC, - **kwargs, - ) -> None: - """ - Initialize image processor with configuration parameters. - - Args: - patch_size (int): Spatial patch size for vision encoder - merge_size (int): Merge size between vision and LLM encoders - temporal_patch_size (int): Temporal patch size for video processing - min_pixels (int): Minimum allowed pixels in resized image - max_pixels (int): Maximum allowed pixels in resized image - image_mean (float/list): Mean values for normalization per channel - image_std (float/list): Std values for normalization per channel - rescale_factor (float): Scaling factor for pixel values (default 1/255) - do_rescale (bool): Whether to rescale images - do_normalize (bool): Whether to normalize images - resample: Resampling method for image resizing - **kwargs: Additional base class arguments - """ - super().__init__(**kwargs) - self.patch_size = patch_size - self.merge_size = merge_size - self.temporal_patch_size = temporal_patch_size - - self.min_pixels = min_pixels - self.max_pixels = max_pixels - - self.image_mean = image_mean - self.image_std = image_std - self.rescale_factor = rescale_factor - self.do_rescale = do_rescale - self.do_normalize = do_normalize - - self.resample = resample - - def _preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: int, - max_pixels: int, - image_mean: Optional[Union[float, List[float]]], - image_std: Optional[Union[float, List[float]]], - rescale_factor: float, - do_rescale: bool, - do_normalize: bool, - resample: PILImageResampling, - data_format: Optional[ChannelDimension], - input_data_format: Optional[Union[str, ChannelDimension]], - ): - """ - Internal method for image preprocessing pipeline. - - Args: - images: Input image or batch of images - min_pixels: Minimum allowed pixels in output - max_pixels: Maximum allowed pixels in output - image_mean: Normalization mean values - image_std: Normalization std values - rescale_factor: Pixel value scaling factor - do_rescale: Whether to rescale pixel values - do_normalize: Whether to normalize pixel values - resample: Resampling method - data_format: Output channel format - input_data_format: Input channel format - - Returns: - tuple: (flatten_patches, grid_dimensions) - - flatten_patches: Flattened image patches - - grid_dimensions: Grid dimensions [t, h, w] - """ - images = make_list_of_images(images) - - # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] - - if is_scaled_image(images[0]) and do_rescale: - data_processor_logger.warning( - "It looks like you are trying to rescale already rescaled images. If the input" - " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." - ) - if input_data_format is None: - # We assume that all images have the same channel dimension format. - input_data_format = infer_channel_dimension_format(images[0]) - - # Get original dimensions and calculate optimal resize dimensions - height, width = get_image_size(images[0], channel_dim=input_data_format) - resized_height, resized_width = smart_resize( - height, - width, - factor=self.patch_size * self.merge_size, # Combine patch and merge factors - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - - processed_images = [] - for image in images: - if height != resized_height or width != resized_width: - # Convert to uint8 before resizing to avoid double scaling - image = image.astype("uint8") - # Convert to PIL Image and resize - image = Image.fromarray(image) - image = resize( - image, - size=(resized_height, resized_width), - resample=resample, - data_format=input_data_format, - ) - - if do_rescale and do_normalize: - # Adjust mean and std for combined rescale+normalize - image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor) - image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor) - do_rescale = False # Skip separate rescale step - - if do_rescale: - image = image.astype(np.float32) - image = rescale(image, scale=rescale_factor, data_format=input_data_format) - - if do_normalize: - image = image.astype(np.float32) - image = normalize( - image=image, - mean=image_mean, - std=image_std, - data_format=input_data_format, - ) - - image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] - processed_images.append(image) - - # Convert processed images to numpy array - patches = np.array(processed_images) - - # Pad temporal dimension if needed - if patches.shape[0] % self.temporal_patch_size != 0: - repeats = np.repeat( - patches[-1][np.newaxis], - self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size), - axis=0, - ) - patches = np.concatenate([patches, repeats], axis=0) - - # Convert to channels-first format if needed - if data_format == ChannelDimension.LAST: - patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W] - - grid_t, channel = patches.shape[:2] - grid_t = grid_t // self.temporal_patch_size - - grid_h, grid_w = ( - resized_height // self.patch_size, - resized_width // self.patch_size, - ) - # Reshape into hierarchical patch structure - patches = patches.reshape( - [ - grid_t, - self.temporal_patch_size, - channel, - grid_h // self.merge_size, - self.merge_size, - self.patch_size, - grid_w // self.merge_size, - self.merge_size, - self.patch_size, - ] - ) - # Reorder dimensions for better memory access pattern - # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz] - patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8]) - - flatten_patches = patches.reshape( - [ - grid_t * grid_h * grid_w, - channel * self.temporal_patch_size * self.patch_size * self.patch_size, - ] - ) - - return flatten_patches, np.array([grid_t, grid_h, grid_w]) - - def preprocess( - self, - images: Union[ImageInput, VideoInput], - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - rescale_factor: Optional[float] = None, - do_rescale: Optional[bool] = None, - do_normalize: Optional[bool] = None, - resample: Optional[PILImageResampling] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST, - ): - """ - Main preprocessing method for images/videos. - - Args: - images: Input image/video data - min_pixels: Override for minimum pixels - max_pixels: Override for maximum pixels - image_mean: Override for normalization mean - image_std: Override for normalization std - rescale_factor: Override for rescaling factor - do_rescale: Override for rescaling flag - do_normalize: Override for normalization flag - resample: Override for resampling method - return_tensors: Desired output tensor format - data_format: Output channel dimension format - input_data_format: Input channel dimension format - - Returns: - BatchFeature: Processed features containing: - - pixel_values: Preprocessed pixel data - - grid_thw: Grid dimensions [temporal, height, width] - - Raises: - ValueError: For invalid image types or dimensions - """ - min_pixels = min_pixels if min_pixels is not None else self.min_pixels - max_pixels = max_pixels if max_pixels is not None else self.max_pixels - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - resample = resample if resample is not None else self.resample - - if images is not None and not valid_images(images): - raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") - - pixel_values, grid_thw = self._preprocess( - images, - min_pixels=min_pixels, - max_pixels=max_pixels, - image_mean=image_mean, - image_std=image_std, - rescale_factor=rescale_factor, - do_rescale=do_rescale, - do_normalize=do_normalize, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - ) - data = {"pixel_values": pixel_values, "grid_thw": grid_thw} - return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/fastdeploy/input/v1/qwen_vl_processor/process.py b/fastdeploy/input/v1/qwen_vl_processor/process.py deleted file mode 100644 index 3aaade025cf..00000000000 --- a/fastdeploy/input/v1/qwen_vl_processor/process.py +++ /dev/null @@ -1,591 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import pickle -from typing import Dict, List, Optional, Tuple, Union - -import numpy as np -import paddle -import zmq -from paddleformers.transformers import AutoTokenizer -from PIL import Image - -from fastdeploy.engine.request import ImagePosition, Request -from fastdeploy.entrypoints.chat_utils import parse_chat_messages -from fastdeploy.input.ernie4_5_vl_processor import read_video_decord -from fastdeploy.input.mm_data_processor import MMBaseDataProcessor -from fastdeploy.input.utils import IDS_TYPE_FLAG -from fastdeploy.multimodal.hasher import MultimodalHasher -from fastdeploy.utils import data_processor_logger - -from .image_processor import ImageProcessor -from .process_video import sample_frames - -FRAME_FACTOR = 2 -FPS = 2.0 -FPS_MIN_FRAMES = 4 -FPS_MAX_FRAMES = 768 - - -class DataProcessor(MMBaseDataProcessor): - """ - Processes multimodal inputs (text, images, videos) into model-ready formats. - - Handles: - - Tokenization of text with special tokens for visual content - - Image and video preprocessing - - Generation of 3D positional embeddings - - Conversion of chat messages to model inputs - - Attributes: - tokenizer: Text tokenizer instance - image_processor: Image/video preprocessor - image_token: Special token for image placeholders - video_token: Special token for video placeholders - vision_start: Token marking start of visual content - """ - - def __init__( - self, - model_path: str, - enable_processor_cache: bool = False, - video_min_frames: int = FPS_MIN_FRAMES, - video_max_frames: int = FPS_MAX_FRAMES, - video_target_frames: int = -1, - video_fps: int = FPS, - tokens_per_second: int = 2, - tokenizer=None, - **kwargs, - ) -> None: - """ - Initialize the data processor. - - Args: - model_path: Path to pretrained model - video_min_frames: Minimum frames to sample from videos - video_max_frames: Maximum frames to sample from videos - tokens_per_second: Temporal resolution for positional embeddings - **kwargs: Additional configuration - """ - super().__init__() - self.min_frames = video_min_frames - self.max_frames = video_max_frames - self.target_frames = video_target_frames - self.fps = video_fps - self.frame_factor = FRAME_FACTOR - - # Initialize tokenizer with left padding and fast tokenizer - if tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left", use_fast=True) - self.tokenizer.ignored_index = -100 # Set ignored index for loss calculation - else: - self.tokenizer = tokenizer - self.image_processor = ImageProcessor.from_pretrained(model_path) # Initialize image processor - self.enable_processor_cache = enable_processor_cache - - # Convolution sizes for patch aggregation - self.spatial_conv_size = self.image_processor.merge_size - self.temporal_conv_size = self.image_processor.temporal_patch_size - - # Special tokens and IDs - self.image_token = "<|image_pad|>" - self.video_token = "<|video_pad|>" - - self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token) - self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token) - - self.vision_start = "<|vision_start|>" - self.vision_start_id = self.tokenizer.convert_tokens_to_ids(self.vision_start) - - self.tokens_per_second = tokens_per_second - - self.role_prefixes = { - "system": "", - "user": "User: ", - "bot": "Assistant: ", - "assistant": "Assistant: ", - } - - @staticmethod - def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]: - """ - Calculate the number of tokens in the multimodal input. - """ - if isinstance(grid_thw, paddle.Tensor): - grid_thw = grid_thw.numpy() - - if len(grid_thw) == 0: - return 0 - - def calc_one(thw): - t, h, w = map(int, thw) - return t * h * w // 4 - - if isinstance(grid_thw[0], (list, tuple, np.ndarray)): - return [calc_one(x) for x in grid_thw] - - return calc_one(grid_thw) - - def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None): - """ - Convert text with image/video placeholders into model inputs. - - Args: - text: Input text with <|image@placeholder|> and <|video@placeholder|> markers - images: List of PIL Images corresponding to image placeholders - videos: List of video data corresponding to video placeholders - image_uuid: List of unique identifiers for each image, used for caching or hashing. - video_uuid: List of unique identifiers for each video, used for caching or hashing. - - Returns: - Dict containing: - - input_ids: Token IDs - - token_type_ids: Type identifiers (text/image/video) - - position_ids: 3D positional embeddings - - images: Preprocessed visual features - - grid_thw: Spatial/temporal dimensions - - image_type_ids: Visual content type (0=image, 1=video) - """ - - outputs = { - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "images": [], - "grid_thw": [], - "image_type_ids": [], - "labels": [], - "cur_position": 0, - "video_cnt": 0, - "num_input_image_tokens": 0, - "num_input_video_tokens": 0, - "fps": [], - "mm_positions": [], - "mm_hashes": [], - } - - # Define placeholders and their lengths - IMAGE_PLACEHOLDER = "<|image_pad|>" - VIDEO_PLACEHOLDER = "<|video_pad|>" - IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER) - VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER) - - # Initialize tracking variables for text parsing - st, image_idx, video_idx = 0, 0, 0 # Start position, image counter, video counter - while st < len(text): - # Find next image or video placeholder in text - image_pos = text.find(IMAGE_PLACEHOLDER, st) - image_pos = len(text) if image_pos == -1 else image_pos # Set to end if not found - video_pos = text.find(VIDEO_PLACEHOLDER, st) - video_pos = len(text) if video_pos == -1 else video_pos # Set to end if not found - ed = min(image_pos, video_pos) # End position is first placeholder found - - self._add_text(text[st:ed], outputs) - if ed == len(text): - break - - if ed == image_pos: - image = images[image_idx] - uuid = image_uuid[image_idx] if image_uuid else None - if not isinstance(image, tuple): - self._add_image(image, outputs, uuid) - else: - self._add_processed_image(image, outputs, uuid) - image_idx += 1 - st = ed + IMAGE_PLACEHOLDER_LEN - else: - item = videos[video_idx] - uuid = video_uuid[video_idx] if video_uuid else None - if not isinstance(item, tuple): - if isinstance(item, dict): - frames, meta = self._load_and_process_video(item["video"], item) - else: - frames, meta = self._load_and_process_video(item, {}) - self._add_video(frames, meta, outputs, uuid) - else: - # cached frames are already processed - self._add_processed_video(item, outputs, uuid) - video_idx += 1 - st = ed + VIDEO_PLACEHOLDER_LEN - - return outputs - - def request2ids( - self, request: Request, tgts: List[str] = None - ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]: - """ - Convert chat request with multimodal messages into model inputs. - - Args: - request: Dictionary containing: - - messages: List of chat messages with text/image/video content - - request_id: Unique identifier for logging - tgts: Optional target sequences - - Returns: - Dict with same structure as text2ids() output - """ - - # Parse and validate chat messages - messages = parse_chat_messages(request.messages) - mm_items = [] - for msg in messages: - role = msg.get("role") - assert role in self.role_prefixes, f"Unsupported role: {role}" - - # Normalize content to list format - content = msg.get("content") - if not isinstance(content, list): - content = [content] - # Collect all visual content items - for item in content: - if item.get("type") in ["image", "video"]: - mm_items.append(item) - - missing_hashes, missing_idx = [], [] - for idx, item in enumerate(mm_items): - if not item.get("data"): - # raw data not provided, should be retrieved from processor cache - missing_hashes.append(item.get("uuid")) - missing_idx.append(idx) - - if len(missing_hashes) > 0 and not self.enable_processor_cache: - raise ValueError("Missing items cannot be retrieved without processor cache.") - - if self.enable_processor_cache: - context = zmq.Context() - dealer = context.socket(zmq.DEALER) - dealer.connect("ipc:///dev/shm/processor_cache.ipc") - - missing_items = self.get_processor_cache(dealer, missing_hashes) - for idx in range(len(missing_items)): - if not missing_items[idx]: - raise ValueError(f"Missing item {idx} not found in processor cache") - mm_items[missing_idx[idx]]["data"] = missing_items[idx] - - images, videos = [], [] - image_uuid, video_uuid = [], [] - for item in mm_items: - if item.get("type") == "image": - images.append(item["data"]) - image_uuid.append(item["uuid"]) - elif item.get("type") == "video": - videos.append(item["data"]) - video_uuid.append(item["uuid"]) - else: - raise ValueError(f"Unsupported multimodal type: {item.get('type')}") - - if self.tokenizer.chat_template is None: - raise ValueError("This model does not support chat template.") - - chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {} - prompt = self.tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=request.add_generation_prompt if request.add_generation_prompt is not None else True, - **chat_template_kwargs, - ) - request.prompt_tokens = prompt - - outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid) - - if self.enable_processor_cache: - missing_idx = set(missing_idx) - hashes_to_cache, items_to_cache = [], [] - for idx in range(len(mm_items)): - if idx in missing_idx: - continue - meta = {} - t, h, w = outputs["grid_thw"][idx] - meta["thw"] = (t, h, w) - meta["fps"] = outputs["fps"][idx] - hashes_to_cache.append(outputs["mm_hashes"][idx]) - items_to_cache.append((outputs["images"][idx], meta)) - self.update_processor_cache(dealer, hashes_to_cache, items_to_cache) - - return outputs - - def _add_text(self, tokens, outputs: Dict) -> None: - """ - Add text tokens to model inputs dictionary. - - Args: - tokens: Text string or already tokenized IDs - outputs: Dictionary accumulating model inputs - - Note: - - Handles both raw text and pre-tokenized inputs - - Updates position IDs for 3D embeddings - """ - if not tokens: - return None - - if isinstance(tokens, str): - tokens_str = self.tokenizer.tokenize(tokens) - tokens = self.tokenizer.convert_tokens_to_ids(tokens_str) - - num_tokens = len(tokens) - outputs["input_ids"].extend(tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens) - - pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - def _compute_text_positions(self, start_pos: int, num_tokens: int) -> np.ndarray: - """ - Generate 3D positional embeddings for text tokens. - - Args: - start_pos: Starting position index - num_tokens: Number of tokens to generate positions for - - Returns: - numpy.ndarray: 3D position IDs shaped (3, num_tokens) - """ - text_array = np.arange(num_tokens).reshape(1, -1) - text_index = np.broadcast_to(text_array, (3, num_tokens)) - position = text_index + start_pos - return position - - def _add_image(self, img, outputs: Dict, uuid: Optional[str]) -> None: - """ - Add image data to model inputs dictionary. - - Args: - img: PIL Image to process - outputs: Dictionary accumulating model inputs - - Note: - - Preprocesses image and calculates spatial dimensions - - Adds image token IDs and type markers - - Generates appropriate position embeddings - """ - ret = self.image_processor.preprocess(images=[img.convert("RGB")]) - num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2 - grid_thw = ret["grid_thw"].tolist() - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_token_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - outputs["num_input_image_tokens"] += int(num_tokens) - - outputs["images"].append(ret["pixel_values"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(grid_thw) - outputs["image_type_ids"].append(0) - - t, h, w = grid_thw - pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0) - - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["fps"].append(0) - - def _add_processed_image(self, img_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None: - img, meta = img_cache - num_tokens = img.shape[0] // self.image_processor.merge_size**2 - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens) - - _, h, w = meta["thw"] - pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["images"].append(img) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[1, h, w]])) - outputs["image_type_ids"].append(0) - - outputs["fps"].append(0) - - def _add_video(self, frames, meta: Dict, outputs: Dict, uuid: Optional[str]) -> None: - """ - Add video data to model inputs dictionary. - - Args: - frames: Video frames as numpy array - meta: Video metadata containing fps/duration - outputs: Dictionary accumulating model inputs - - Note: - - Handles temporal dimension in position embeddings - - Uses video-specific token IDs and type markers - """ - ret = self.image_processor.preprocess(images=frames) - - num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2 - grid_thw = ret["grid_thw"].tolist() - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - # Hack code. In order to adapt to the framework, only image_token can be passed - # The correct way should be to use [self.video_token_id] * num_tokens - outputs["input_ids"].extend([self.image_token_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["num_input_video_tokens"] += int(num_tokens) - - outputs["images"].append(ret["pixel_values"]) - if not uuid: - outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"])) - else: - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(grid_thw) - outputs["image_type_ids"].extend([1] * grid_thw[0]) - - fps = meta["fps"] - second_per_grid_t = self.temporal_conv_size / fps - t, h, w = grid_thw - pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t) - - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["fps"].append(fps) - - def _add_processed_video(self, frames_cache: Tuple[np.ndarray, dict], outputs: Dict, uuid: str) -> None: - frames, meta = frames_cache - num_tokens = frames.shape[0] // self.image_processor.merge_size**2 - - t, h, w = meta["thw"] - outputs["images"].append(frames) - outputs["mm_hashes"].append(uuid) - outputs["grid_thw"].append(np.array([[t, h, w]])) - - outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens)) - outputs["input_ids"].extend([self.image_patch_id] * num_tokens) - outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens) - outputs["image_type_ids"].extend([1] * t) - - fps = meta["fps"] - second_per_grid_t = self.temporal_conv_size / fps - pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t) - outputs["position_ids"].append(pos_ids) - outputs["cur_position"] = pos_ids.max() + 1 - - outputs["fps"].append(fps) - - def _compute_vision_positions( - self, start_pos: int, t: int, h: int, w: int, second_per_grid_t: float - ) -> np.ndarray: - """ - Generate 3D position IDs for visual inputs. - - Args: - start_pos: Base position in sequence - t: Temporal patches (1 for images) - h: Height in patches - w: Width in patches - second_per_grid_t: Time per temporal patch - - Returns: - np.ndarray: Position IDs for [t,h,w] dimensions - """ - h //= self.spatial_conv_size - w //= self.spatial_conv_size - - tn = np.arange(t).reshape(-1, 1) - tn = np.broadcast_to(tn, (t, h * w)) - tn = tn * int(second_per_grid_t) * self.tokens_per_second - t_index = tn.flatten() - - hn = np.arange(h).reshape(1, -1, 1) - h_index = np.broadcast_to(hn, (t, h, w)).flatten() - - wn = np.arange(w).reshape(1, 1, -1) - w_index = np.broadcast_to(wn, (t, h, w)).flatten() - - position = np.stack([t_index, h_index, w_index]) + start_pos - return position - - def _load_and_process_video(self, url: str, item: Dict) -> Tuple[np.ndarray, Dict]: - """ - Load and preprocess video into frames. - - Args: - url: Video file path or bytes - item: Dictionary containing processing parameters - - Returns: - tuple: (frames, metadata) where: - - frames: Processed video frames as numpy array - - metadata: Updated video metadata dictionary - """ - reader, meta, _ = read_video_decord(url, save_to_disk=False) - - # Apply frame sampling if fps or target_frames specified - fps = item.get("fps", self.fps) - num_frames = item.get("target_frames", self.target_frames) - - frame_indices = list(range(meta["num_of_frame"])) - if fps > 0 or num_frames > 0: - # Get frame sampling constraints - min_frames = item.get("min_frames", self.min_frames) - max_frames = item.get("max_frames", self.max_frames) - - # Sample frames according to specifications - frame_indices = sample_frames( - frame_factor=self.frame_factor, # Ensure divisible by temporal patch size - min_frames=min_frames, - max_frames=max_frames, - metadata=meta, - fps=fps, - num_frames=num_frames, - ) - - # Update metadata with new frame count and fps - meta["num_of_frame"] = len(frame_indices) - if fps is not None: - meta["fps"] = fps # Use specified fps - meta["duration"] = len(frame_indices) / fps - else: - meta["fps"] = len(frame_indices) / meta["duration"] # Calculate fps from sampled frames - - frames = [] - for idx in frame_indices: - frame = reader[idx].asnumpy() - image = Image.fromarray(frame, "RGB") - frames.append(image) - frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0) - - return frames, meta - - def get_processor_cache(self, socket, mm_hashes: list[str]) -> list: - """ - get cache correspond to given hash values - """ - req = pickle.dumps(mm_hashes) - socket.send_multipart([b"", req]) - _, resp = socket.recv_multipart() - mm_items = pickle.loads(resp) - data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}") - - return mm_items - - def update_processor_cache(self, socket, mm_hashes: list[str], mm_items): - """ - update cache data - """ - req = pickle.dumps((mm_hashes, mm_items)) - socket.send_multipart([b"", req]) - data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}") diff --git a/fastdeploy/input/v1/qwen_vl_processor/process_video.py b/fastdeploy/input/v1/qwen_vl_processor/process_video.py deleted file mode 100644 index 891f272033b..00000000000 --- a/fastdeploy/input/v1/qwen_vl_processor/process_video.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from typing import Optional, Union - -import numpy as np - -from fastdeploy.utils import data_processor_logger - -from .image_processor import ceil_by_factor, floor_by_factor - - -def sample_frames( - frame_factor: int, - min_frames: int, - max_frames: int, - metadata: Optional[dict] = None, - fps: Optional[Union[int, float]] = -1, - num_frames: Optional[int] = -1, -): - """ - Sample frames from video according to specified criteria. - - Args: - frame_factor: Ensure sampled frames are multiples of this factor - min_frames: Minimum number of frames to sample - max_frames: Maximum number of frames to sample - metadata: Video metadata containing fps information - fps: Target frames per second for sampling - num_frames: Exact number of frames to sample - - Returns: - np.ndarray: Sampled video frames - - Raises: - ValueError: If both fps and num_frames are specified, - or if required metadata is missing, - or if requested frames exceed available frames - """ - if fps > 0 and num_frames > 0: - raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!") - - total_num_frames = metadata["num_of_frame"] - - # If num_frames is not given but fps is, calculate num_frames from fps - if num_frames > 0: - num_frames = round(num_frames / frame_factor) * frame_factor - elif fps > 0: - if metadata is None: - raise ValueError( - "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. " - "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video" - ) - # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor - min_frames = ceil_by_factor(min_frames, frame_factor) - max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor) - - num_frames = total_num_frames / metadata["fps"] * fps - - if num_frames > total_num_frames: - data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]") - - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - num_frames = floor_by_factor(num_frames, frame_factor) - - if num_frames > total_num_frames: - raise ValueError( - f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. " - "Decrease `num_frames` or `fps` for sampling." - ) - - # Hack code ensures that num_frames can always be divided by 4 - # due to sched/resource_manager_v1.py 中 grid_thw.extend([[2, h, w]] * (t // 2)) - if num_frames > 2 and num_frames % 4 != 0: - num_frames = (num_frames // 4) * 4 # 向下取整到 4 的倍数 - total_num_frames = (total_num_frames // 4) * 4 - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - - # Calculate frame indices based on sampling strategy - if num_frames > 0: - # Evenly spaced sampling for target frame count - indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32) - else: - # Keep all frames if no sampling requested - indices = np.arange(0, total_num_frames).astype(np.int32) - - return indices diff --git a/fastdeploy/input/v1/qwen_vl_processor/qwen_vl_processor.py b/fastdeploy/input/v1/qwen_vl_processor/qwen_vl_processor.py deleted file mode 100644 index e0d846d53a2..00000000000 --- a/fastdeploy/input/v1/qwen_vl_processor/qwen_vl_processor.py +++ /dev/null @@ -1,338 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import numpy as np - -from fastdeploy.engine.request import Request -from fastdeploy.input.utils import process_stop_token_ids -from fastdeploy.input.v1.text_processor import DataProcessor as TextProcessor -from fastdeploy.utils import data_processor_logger - -from .process import DataProcessor - - -class QwenVLProcessor(TextProcessor): - """ - Qwen Vision-Language processor for handling multimodal inputs. - - This processor extends TextProcessor to support: - - Image and video processing - - Multimodal feature extraction - - Tokenization and position encoding - - Request processing and model input generation - - Attributes: - processor (DataProcessor): Underlying data processor instance - tokenizer: Text tokenizer instance - limit_mm_per_prompt (dict): Limits for multimodal inputs per prompt - """ - - def __init__( - self, - config, - model_name_or_path, - limit_mm_per_prompt=None, - mm_processor_kwargs=None, - reasoning_parser_obj=None, - tool_parser_obj=None, - enable_processor_cache=False, - ): - """ - Initialize QwenVLProcessor instance. - - Args: - config: Model configuration object - model_name_or_path (str): Pretrained model name or path - limit_mm_per_prompt (dict, optional): Limits for multimodal inputs - mm_processor_kwargs (dict, optional): Multimodal processor arguments - reasoning_parser_obj: Reasoning parser instance - tool_parser_obj: Tool parser instance - """ - super().__init__(model_name_or_path, reasoning_parser_obj, tool_parser_obj) - - data_processor_logger.info(f"model_name_or_path: {model_name_or_path}") - processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs) - self.processor = DataProcessor( - model_path=model_name_or_path, - enable_processor_cache=enable_processor_cache, - tokens_per_second=config.vision_config.tokens_per_second, - tokenizer=self.tokenizer, - **processor_kwargs, - ) - self.image_patch_id = self.processor.image_token_id - self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt) - - def _parse_processor_kwargs(self, kwargs): - """ - Parse and validate multimodal processor arguments. - - Args: - kwargs (dict): Processor configuration arguments - - Returns: - dict: Validated processor arguments - - Raises: - ValueError: If arguments format is invalid - """ - if not kwargs: - return {} - - try: - if not isinstance(kwargs, dict): - raise ValueError("mm-processor-kwargs must be a dictionary") - - # Validate kwargs types against expected schema - data_processor_logger.info(f"Processing kwargs: {kwargs}") - expected_types = { - "video_max_frames": int, # Maximum video frames parameter - "video_min_frames": int, # Minimum video frames parameter - } - - for key, value in kwargs.items(): - if key in expected_types and not isinstance(value, expected_types[key]): - raise ValueError( - f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}" - ) - - return kwargs - - except Exception as e: - data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}") - return {} - - def _parse_limits(self, limits): - """ - Parse and validate multimodal input limits. - - Args: - limits (dict): Input limits configuration - - Returns: - dict: Validated limits with defaults - - Raises: - ValueError: If limits format is invalid - """ - DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1} - - if not limits: - return DEFAULT_LIMITS - - try: - if not isinstance(limits, dict): - raise ValueError("limit-mm-per-prompt must be a dictionary") - data_processor_logger.info(f"_parse_limits:{limits}") - return {**DEFAULT_LIMITS, **limits} - except Exception as e: - data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits") - return DEFAULT_LIMITS - - def _check_mm_limits(self, item): - """ - Validate multimodal inputs against configured limits. - - Args: - item: Input request item to validate - - Raises: - ValueError: If input exceeds configured limits - """ - if isinstance(item, dict): - # 请求包含prompt和multi_modal_data - mm_data = item - else: - # 请求包含messages - mm_data = {"image": [], "video": []} - - for message in item: - if isinstance(message.get("content"), list): - for part in message["content"]: - if part.get("type") in ["image_url", "image"]: - mm_data["image"].append(part) - elif part.get("type") in ["video_url", "video"]: - mm_data["video"].append(part) - - for modality, data in mm_data.items(): - if modality in self.limit_mm_per_prompt: - limit = self.limit_mm_per_prompt[modality] - if len(data) > limit: - raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}") - - def process_request(self, request, max_model_len=None, **kwargs): - """ - Process incoming request and generate model inputs. - - Args: - request: Input request object - max_model_len (int, optional): Maximum context length - **kwargs: Additional processing parameters - - Returns: - Request: Processed request with model inputs - """ - task = request.to_dict() - task["enable_thinking"] = kwargs.get("enable_thinking", False) - self.process_request_dict(task, max_model_len) - request = Request.from_dict(task) - request = self._apply_default_parameters(request) - return request - - def process_request_dict(self, request, max_model_len=None, **kwargs): - """ - Process request dictionary into model inputs. - - Args: - request (dict): Input request dictionary - max_model_len (int, optional): Maximum context length - - Returns: - dict: Processed request with model inputs - - Raises: - ValueError: If request format is invalid - """ - - request = self._apply_default_parameters(request) - if not request.eos_token_ids: - request.eos_token_ids = self.eos_token_ids - - # processing stop_sequences and stop_token_ids - process_stop_token_ids(request, self.update_stop_seq) - - bad_words = request.sampling_params.bad_words - bad_words_token_ids = request.sampling_params.bad_words_token_ids - if bad_words: - bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) - request.sampling_params.bad_words_token_ids = bad_words_token_ids - - if request.prompt: - multimodal_data = request.multimodal_data - if multimodal_data is None: - multimodal_data = {} - self._check_mm_limits(multimodal_data) - images = multimodal_data.get("image", None) - videos = multimodal_data.get("video", None) - outputs = self.processor.text2ids(request.prompt, images, videos) - - elif request.messages: - messages = request.messages - self._check_mm_limits(messages) - chat_template_kwargs = request.chat_template_kwargs - if chat_template_kwargs: - if isinstance(chat_template_kwargs, dict): - for k, v in chat_template_kwargs.items(): - if getattr(request, k, v): - setattr(request, k, v) - else: - raise ValueError("Invalid input: chat_template_kwargs must be a dict") - if getattr(request, "enable_thinking") is None: - setattr(request, "enable_thinking", True) - outputs = self.processor.request2ids(request) - delattr(request, "chat_template_kwargs") - else: - raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") - - # Handle continuation of previous generation by appending existing tokens - if request.completion_token_ids: - self.append_completion_tokens(outputs, request.completion_token_ids) - - # qwen25_vl not support thinking - request.enable_thinking = False - - outputs = self.pack_outputs(outputs) - - request.prompt_token_ids = outputs["input_ids"].tolist() - request.prompt_token_ids_len = len(request.prompt_token_ids) - request.multimodal_inputs = outputs - - # Handle prompt truncation if exceeds model context length - if max_model_len is not None and len(request.prompt_token_ids) > max_model_len: - request.prompt_token_ids = request.prompt_token_ids[ - : max_model_len - 1 - ] # Leave space for at least 1 new token - - # Set default max_tokens if not specified - max_tokens = max_model_len - len(request.prompt_token_ids) - if getattr(request.sampling_params, "max_tokens", None) is None: - request.sampling_params.max_tokens = max(1, max_tokens) - else: - request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens) - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - parts = request.request_id.split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.sampling_params.n or 1 - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request.request_id] = model_status - request.enable_thinking = model_status == "think_start" - data_processor_logger.info(f"Processed request {request}") - - return request - - def append_completion_tokens(self, multimodal_inputs, completion_token_ids): - """ - Append completion tokens to existing outputs. - - Args: - outputs: Current model outputs - completion_token_ids: completion tokens to append - """ - - num_tokens = len(completion_token_ids) - multimodal_inputs["input_ids"].extend(completion_token_ids) - multimodal_inputs["token_type_ids"].extend([0] * num_tokens) - - pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens) - multimodal_inputs["position_ids"].append(pos_ids) - multimodal_inputs["cur_position"] += num_tokens - - def pack_outputs(self, outputs): - """ - Prepare final output dictionary for model. - - Args: - outputs: Intermediate processing outputs - - Returns: - dict: Packed output dictionary with all required fields - """ - if not outputs["images"]: - outputs["images"] = None # No images case - outputs["grid_thw"] = None # No spatial dimensions - outputs["image_type_ids"] = None # No type IDs - else: - outputs["images"] = np.vstack(outputs["images"]) # Stack image features vertically - outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) # Stack spatial dimensions - outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) # Convert to numpy array - - # Convert all outputs to numpy arrays with appropriate types - outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) # Token IDs as int64 - outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) # Type IDs as int64 - outputs["position_ids"] = np.concatenate( - outputs["position_ids"], axis=1, dtype=np.int64 - ) # Concatenate position ID - - outputs["image_patch_id"] = self.processor.image_token_id - outputs["video_patch_id"] = self.processor.video_token_id - outputs["position_ids"] = outputs["position_ids"].transpose(1, 0) - - outputs["mm_num_token_func"] = self.processor.mm_num_tokens - return outputs diff --git a/fastdeploy/input/v1/text_processor.py b/fastdeploy/input/v1/text_processor.py deleted file mode 100644 index f83a0e0f12e..00000000000 --- a/fastdeploy/input/v1/text_processor.py +++ /dev/null @@ -1,925 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from abc import ABC, abstractmethod -from collections import OrderedDict - -import numpy as np -from paddleformers.generation import GenerationConfig -from paddleformers.transformers import Llama3Tokenizer, LlamaTokenizer - -from fastdeploy import envs -from fastdeploy.input.utils import process_stop_token_ids -from fastdeploy.utils import data_processor_logger - -_SAMPLING_EPS = 1e-5 - - -class BaseDataProcessor(ABC): - """base class for data processor""" - - def __init__(self): - """ - Returns: - None - """ - self.tokenizer = self._load_tokenizer() - self.tokenizer.bos_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.bos_token) - self.tokenizer.cls_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.cls_token) - self.tokenizer.sep_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.sep_token) - self.tokenizer.eos_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.eos_token) - self.tokenizer.mask_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.mask_token) - data_processor_logger.info( - ( - f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, ", - f"cls_token is {self.tokenizer.cls_token}, {self.tokenizer.cls_token_id}, " - f"sep_token is {self.tokenizer.sep_token}, {self.tokenizer.sep_token_id}, " - f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}, " - f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}", - ) - ) - self._tokenize_cache = OrderedDict() - self._tokenize_cache_capacity = 128 - - def _apply_default_parameters(self, request): - """ - Apply default value for parameters in request - """ - - def set_value(req, key, value): - value = getattr(self.generation_config, key, value) - if getattr(req.sampling_params, key) is None: - setattr(req.sampling_params, key, value) - - set_value(request, "top_p", 0.7) - set_value(request, "temperature", 1.0) - set_value(request, "repetition_penalty", 1.0) - set_value(request, "frequency_penalty", 0.0) - set_value(request, "presence_penalty", 0.0) - return request - - @abstractmethod - def process_request_dict(self, request, **kwargs): - """ - Preprocess the request - - Args: - request Request: may contain text and messages fields - **kwargs: others - - Returns: - bool: Whether preprocessing is successful - str: error message - """ - raise NotImplementedError - - @abstractmethod - def process_response_dict(self, response_obj): - """ - Preprocess the response - - Args: - response_obj RequestOutput: response for engine, contain ids fields - - Returns: - RequestOutput: response contain text fields - """ - raise NotImplementedError - - def text2ids(self, text, max_model_len=None): - """ - text to token ids - - Args: - text (str): text - - Returns: - List[int]: token ids list - """ - raise NotImplementedError - - def encode_with_cache(self, text, max_model_len=None, add_special_tokens=False): - """ - Encode text into token ids with a small LRU cache. - """ - if not hasattr(self, "_tokenize_cache"): - self._tokenize_cache = OrderedDict() - self._tokenize_cache_capacity = getattr(self, "_tokenize_cache_capacity", 128) - key = (text, bool(add_special_tokens)) - cached = self._tokenize_cache.get(key) - if cached is not None: - self._tokenize_cache.move_to_end(key) - return cached - token_ids = self.text2ids(text, max_model_len, add_special_tokens=add_special_tokens) - if hasattr(token_ids, "tolist"): - token_ids = token_ids.tolist() - elif not isinstance(token_ids, list): - token_ids = list(token_ids) - self._tokenize_cache[key] = token_ids - if len(self._tokenize_cache) > self._tokenize_cache_capacity: - self._tokenize_cache.popitem(last=False) - return token_ids - - def _encode_literal_text_with_cache(self, text): - if not hasattr(self, "_tokenize_cache"): - self._tokenize_cache = OrderedDict() - self._tokenize_cache_capacity = getattr(self, "_tokenize_cache_capacity", 128) - key = ("literal_text", text) - cached = self._tokenize_cache.get(key) - if cached is not None: - self._tokenize_cache.move_to_end(key) - return cached - token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) - if hasattr(token_ids, "tolist"): - token_ids = token_ids.tolist() - elif not isinstance(token_ids, list): - token_ids = list(token_ids) - self._tokenize_cache[key] = token_ids - if len(self._tokenize_cache) > self._tokenize_cache_capacity: - self._tokenize_cache.popitem(last=False) - return token_ids - - def messages2ids(self, messages): - """ - Convert multi-turn messages into ID sequences. - - Args: - messages (List[List[Dict[str, Any]]]): multi-turn messages. - - Returns: - List[int]: ID sequences - """ - raise NotImplementedError - - def _get_think_token_ids(self): - think_token_ids = getattr(self, "_think_token_ids", None) - if think_token_ids is not None: - return think_token_ids - tokenizer = getattr(self, "tokenizer", None) - vocab = tokenizer.get_vocab() if tokenizer is not None else {} - think_start_id = vocab.get("", -1) - think_end_id = vocab.get("", -1) - self._think_token_ids = (think_start_id, think_end_id) - return self._think_token_ids - - def _prepare_think_stop_sentence(self, logits_processors_args, max_model_len=None): - if not isinstance(logits_processors_args, dict): - return logits_processors_args - think_stop_sentence = logits_processors_args.get("think_stop_sentence") - if isinstance(think_stop_sentence, str) and think_stop_sentence: - sentence_token_ids = self._encode_literal_text_with_cache(think_stop_sentence) - logits_processors_args["think_stop_sentence_token_ids"] = sentence_token_ids - logits_processors_args.pop("think_stop_sentence", None) - return logits_processors_args - - def _update_thinking_prompt_state(self, prompt_token_ids, logits_processors_args): - if not isinstance(logits_processors_args, dict): - return logits_processors_args - thinking_budget = logits_processors_args.get("thinking_budget") - if thinking_budget is None or not isinstance(thinking_budget, int) or thinking_budget < 0: - return logits_processors_args - if logits_processors_args.get("think_prompt_checked"): - return logits_processors_args - if prompt_token_ids is None: - return logits_processors_args - token_len = getattr(prompt_token_ids, "size", None) or len(prompt_token_ids) - if token_len == 0: - return logits_processors_args - think_start_id, think_end_id = self._get_think_token_ids() - if think_start_id < 0 or think_end_id < 0: - return logits_processors_args - - if hasattr(prompt_token_ids, "tolist"): - token_list = prompt_token_ids.tolist() - else: - token_list = list(prompt_token_ids) - - started = False - ended = False - tokens_after_start = 0 - last_token_id = None - in_thinking = False - for token_id in token_list: - if token_id == think_start_id: - started = True - ended = False - in_thinking = True - elif token_id == think_end_id and in_thinking: - ended = True - in_thinking = False - if started and token_list: - # Align with operator-level reasoning_max_tokens: prompt-side tokens - # inside do not consume thinking budget. - last_token_id = int(token_list[-1]) - - logits_processors_args["think_prompt_checked"] = True - logits_processors_args["think_prompt_started"] = started - logits_processors_args["think_prompt_ended"] = ended - logits_processors_args["think_prompt_tokens_after_start"] = tokens_after_start - if last_token_id is not None: - logits_processors_args["think_prompt_last_token_id"] = last_token_id - else: - logits_processors_args.pop("think_prompt_last_token_id", None) - return logits_processors_args - - def ids2tokens(self, token_id, task_id=None): - """ - token ids to strings - - Args: - token_id (List[int]): token id - task_id (str): task id - - Returns: - List[str]: strings - """ - raise NotImplementedError - - @abstractmethod - def _load_tokenizer(self): - """ - load tokenizer - - Returns: - tokenizer (AutoTokenizer) - """ - raise NotImplementedError - - -class DataProcessor(BaseDataProcessor): - def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_obj=None): - """ - Initializes the DecodeStatus object. - - Args: - model_name_or_path (str): The name or path of the pre-trained model to be loaded. - Can also be a path to a directory containing the pre-trained model file. - - Returns: - None. - - Raises: - None. - """ - - self.model_name_or_path = model_name_or_path - - # Generation config - try: - self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path) - except Exception as e: - data_processor_logger.warning( - f"Can't find generation config: {e}, so it will not use generation_config field in the model config" - ) - self.generation_config = None - - self.decode_status = dict() - self.model_status_dict = dict() - self.tool_parser_dict = dict() - self.tokenizer = self._load_tokenizer() - self._tokenize_cache = OrderedDict() - self._tokenize_cache_capacity = 128 - data_processor_logger.info( - f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, \ - eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} " - ) - - try: - from paddleformers.trl.llm_utils import get_eos_token_id - except Exception: - from paddleformers.cli.utils.llm_utils import get_eos_token_id - - self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config) - data_processor_logger.info( - f"The eos_token_ids obtained by merging tokenizer and generation_config is {self.eos_token_ids}" - ) - self.eos_token_id_len = len(self.eos_token_ids) - self.pad_token_id = self.get_pad_id() - self.reasoning_parser = None - self.tool_parser_obj = tool_parser_obj - if reasoning_parser_obj: - self.reasoning_parser = reasoning_parser_obj(self.tokenizer) - self.tokenizer.pad_token_id = self.pad_token_id - - def process_request(self, request, max_model_len=None, **kwargs): - """ - Preprocess the request - - Args: - request (Dict): may contain text and messages fields - - Returns: - bool: Whether preprocessing is successful - str: error message - """ - data_processor_logger.info(f"Start processing request: {request}") - request = self._apply_default_parameters(request) - if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0: - request.eos_token_ids = self.eos_token_ids - - # processing stop_sequences and stop_token_ids - process_stop_token_ids(request, self.update_stop_seq) - - # processing bad_words - bad_words = request.get("bad_words") - bad_words_token_ids = request.get("bad_words_token_ids") - if bad_words: - bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) - request["bad_words_token_ids"] = bad_words_token_ids - - logits_processors_args = self._prepare_think_stop_sentence( - request.get("logits_processors_args") or {}, max_model_len - ) - request["logits_processors_args"] = logits_processors_args - - # processing prompt_token_ids - if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0: - if request.prompt is not None: - prompt = request.prompt - add_special_tokens = request.get("add_special_tokens", False) - assert isinstance(prompt, str) or ( - isinstance(prompt, list) and all([isinstance(t, int) for t in prompt]) - ), f"prompt must be a string or a list of integers, but got {type(prompt)}" - if isinstance(prompt, list): # if prompt is a token id list - request.prompt_token_ids = prompt - else: - request.prompt_token_ids = self.text2ids( - request.prompt, max_model_len, add_special_tokens=add_special_tokens - ) - elif request.messages is not None: - if self.tokenizer.chat_template is None: - raise ValueError("This model does not support chat_template.") - task = request.to_dict() - chat_template_kwargs = kwargs.get("chat_template_kwargs", {}) - if chat_template_kwargs: - if isinstance(chat_template_kwargs, dict): - for k, v in chat_template_kwargs.items(): - if k not in task or task[k] is None: - task[k] = v - else: - raise ValueError("Invalid input: chat_template_kwargs must be a dict") - task.setdefault("enable_thinking", True) - request.prompt_token_ids = self.messages2ids(task, **chat_template_kwargs) - else: - raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.") - - if len(request.prompt_token_ids) == 0: - raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs") - - # truncate prompts that exceed the length limit - if max_model_len is not None and len(request.prompt_token_ids) > max_model_len: - request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1] - - logits_processors_args = request.get("logits_processors_args") or {} - logits_processors_args = self._update_thinking_prompt_state(request.prompt_token_ids, logits_processors_args) - request["logits_processors_args"] = logits_processors_args - - max_tokens = max_model_len - len(request.prompt_token_ids) - if request.get("max_tokens") is None: - request.set("max_tokens", max(1, max_tokens)) - else: - request.set("max_tokens", min(max_tokens, request.get("max_tokens"))) - if request.get("temperature") < _SAMPLING_EPS: - # zero temperature means greedy decoding: set top_k=1 to force argmax - request.set("temperature", 1) - request.set("top_k", 1) - if request.get("top_p") < _SAMPLING_EPS: - request.set("top_p", _SAMPLING_EPS) - request.set("top_k", 1) - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - parts = request.request_id.split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.get("n", 1) - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request.request_id] = model_status - request.enable_thinking = model_status == "think_start" - - data_processor_logger.info(f"Processed request: {request}") - return request - - def process_request_dict(self, request, max_model_len=None, **kwargs): - """ - Preprocess the request - - Args: - request Request: may contain text and messages fields - - Returns: - bool: Whether preprocessing is successful - str: error message - """ - data_processor_logger.info(f"Start processing request: {request}") - request = self._apply_default_parameters(request) - if not request.eos_token_ids: - request.eos_token_ids = self.eos_token_ids - - # processing stop_sequences and stop_token_ids - process_stop_token_ids(request, self.update_stop_seq) - - # processing bad_words - bad_words = request.sampling_params.bad_words - bad_words_token_ids = request.sampling_params.bad_words_token_ids - if bad_words: - bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids) - request.sampling_params.bad_words_token_ids = bad_words_token_ids - - logits_processors_args = self._prepare_think_stop_sentence( - getattr(request.sampling_params, "logits_processors_args", None) or {}, max_model_len - ) - request.sampling_params.logits_processors_args = logits_processors_args - - # processing prompt_token_ids - if not request.prompt_token_ids: - if request.prompt: - prompt = request.prompt - add_special_tokens = getattr(request, "add_special_tokens", None) or False - assert isinstance(prompt, str) or ( - isinstance(prompt, list) and all([isinstance(t, int) for t in prompt]) - ), f"prompt must be a string or a list of integers, but got {type(prompt)}" - if isinstance(prompt, list): # if prompt is a token id list - request.prompt_token_ids = prompt - else: - request.prompt_token_ids = self.text2ids( - request.prompt, max_model_len, add_special_tokens=add_special_tokens - ).tolist() - elif request.messages: - if self.tokenizer.chat_template is None: - raise ValueError("This model does not support chat_template.") - chat_template_kwargs = kwargs.get("chat_template_kwargs", {}) - if not chat_template_kwargs: - chat_template_kwargs = request.chat_template_kwargs if request.chat_template_kwargs else {} - if chat_template_kwargs: - if isinstance(chat_template_kwargs, dict): - for k, v in chat_template_kwargs.items(): - if not getattr(request, k, None): - setattr(request, k, v) - else: - raise ValueError("Invalid input: chat_template_kwargs must be a dict") - if getattr(request, "enable_thinking") is None: - setattr(request, "enable_thinking", True) - request.prompt_token_ids = self.messages2ids(request, **chat_template_kwargs) - delattr(request, "chat_template_kwargs") - else: - raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}") - - if len(request.prompt_token_ids) == 0: - raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs") - - # truncate prompts that exceed the length limit - if max_model_len is not None and len(request.prompt_token_ids) > max_model_len: - request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1] - logits_processors_args = getattr(request.sampling_params, "logits_processors_args", None) or {} - logits_processors_args = self._update_thinking_prompt_state(request.prompt_token_ids, logits_processors_args) - request.sampling_params.logits_processors_args = logits_processors_args - - max_tokens = max_model_len - len(request.prompt_token_ids) - if getattr(request.sampling_params, "max_tokens", None) is None: - request.sampling_params.max_tokens = max(1, max_tokens) - else: - request.sampling_params.max_tokens = min(max_tokens, request.sampling_params.max_tokens) - - if request.sampling_params.temperature < _SAMPLING_EPS: - # zero temperature means greedy decoding: set top_k=1 to force argmax - request.sampling_params.temperature = 1 - request.sampling_params.top_k = 1 - if request.sampling_params.top_p < _SAMPLING_EPS: - request.sampling_params.top_p = _SAMPLING_EPS - request.sampling_params.top_k = 1 - if self.reasoning_parser: - model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids) - parts = request.request_id.split("_") - if len(parts) > 1: - real_req_id = parts[0] - index = int(parts[1]) - n = request.sampling_params.n or 1 - for idx in range(index * n, (index + 1) * n): - self.model_status_dict[f"{real_req_id}_{idx}"] = model_status - else: - self.model_status_dict[request.request_id] = model_status - request.enable_thinking = model_status == "think_start" - - data_processor_logger.info(f"Processed request: {request}") - return request - - def process_logprob_response(self, token_ids, **kwargs): - full_text = self.tokenizer.decode(token_ids, **kwargs) - return full_text - - def process_response(self, response_dict, **kwargs): - """ - Preprocess the response - - Args: - response_dict (Dict): response for engine, contain ids fields - - Returns: - Dict: response contain text fields - """ - req_id = response_dict.request_id - token_ids = response_dict.outputs.token_ids - if token_ids[-1] == self.tokenizer.eos_token_id: - token_ids = token_ids[:-1] - full_text = self.tokenizer.decode(token_ids) - response_dict.outputs.text = full_text - if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict, self.model_status_dict[req_id] - ) - response_dict.outputs.text = text - response_dict.outputs.reasoning_content = reasoning_content - if self.tool_parser_obj: - tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, response_dict) - if tool_call_info.tools_called: - response_dict.outputs.tool_calls = tool_call_info.tool_calls - response_dict.outputs.text = tool_call_info.content - if req_id in self.model_status_dict: - del self.model_status_dict[req_id] - data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}") - - return response_dict - - def process_response_obj_normal(self, response_obj, **kwargs): - """ - Preprocess the response - - Args: - response_obj :response for engine, contain ids fields - - Returns: - RequestOutput: response contain text fields - """ - output = response_obj.outputs - token_ids = output.token_ids - is_end = response_obj.finished - req_id = response_obj.request_id - request = kwargs.get("request", None) - if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"): - if token_ids[-1] in self.eos_token_ids: - token_ids = token_ids[:-1] - delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id) - if is_end: - full_text = previous_texts + delta_text - response_obj.outputs.completion_tokens = full_text - response_obj.outputs.text = full_text - if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, - request, - self.model_status_dict[req_id], - ) - response_obj.outputs.text = text - response_obj.outputs.reasoning_content = reasoning_content - reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else [] - response_obj.outputs.reasoning_token_num = len(reasoning_tokens) - if self.tool_parser_obj: - tool_parser = self.tool_parser_obj(self.tokenizer) - tool_call_info = tool_parser.extract_tool_calls(full_text, request) - if tool_call_info.tools_called: - response_obj.outputs.tool_calls = tool_call_info.tool_calls - response_obj.outputs.text = tool_call_info.content - data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") - del self.decode_status[req_id] - if req_id in self.model_status_dict: - del self.model_status_dict[req_id] - return response_obj - - def process_response_obj_streaming(self, response_obj, **kwargs): - """ - Preprocess the response - - Args: - response_obj : response for engine, contain ids fields - - Returns: - RequestOutput: response contain text fields - """ - output = response_obj.outputs - token_ids = output.token_ids - is_end = response_obj.finished - req_id = response_obj.request_id - request = kwargs.get("request", None) - - if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"): - if token_ids[-1] in self.eos_token_ids: - token_ids = token_ids[:-1] - delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) - response_obj.outputs.completion_tokens = delta_text - if self.reasoning_parser: - reasoning_delta_message = self.reasoning_parser.extract_reasoning_content_streaming( - previous_texts, - previous_texts + delta_text, - delta_text, - previous_token_ids, - previous_token_ids + token_ids, - token_ids, - self.model_status_dict[req_id], - ) - response_obj.outputs.delta_message = reasoning_delta_message - reasoning_content = reasoning_delta_message.reasoning_content if reasoning_delta_message else None - reasoning_tokens = self.tokenizer.tokenize(reasoning_content) if reasoning_content else [] - response_obj.outputs.reasoning_token_num = len(reasoning_tokens) - if self.tool_parser_obj: - if req_id not in self.tool_parser_dict: - self.tool_parser_dict[req_id] = self.tool_parser_obj(self.tokenizer) - tool_parser = self.tool_parser_dict[req_id] - tool_call = tool_parser.extract_tool_calls_streaming( - previous_texts, - previous_texts + delta_text, - delta_text, - previous_token_ids, - previous_token_ids + token_ids, - token_ids, - request, - ) - if tool_call is None or tool_call.tool_calls: - response_obj.outputs.delta_message = tool_call - response_obj.outputs.text = delta_text - if is_end: - data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") - del self.decode_status[req_id] - if req_id in self.tool_parser_dict: - del self.tool_parser_dict[req_id] - if req_id in self.model_status_dict: - del self.model_status_dict[req_id] - return response_obj - - def process_response_dict(self, response_dict, **kwargs): - """ - Preprocess the response - - Args: - response_obj: response for engine, contain ids fields - - Returns: - Dict: response contain text fields - """ - stream = kwargs.get("stream", True) - if stream: - return self.process_response_obj_streaming(response_dict, **kwargs) - else: - return self.process_response_obj_normal( - response_dict, - **kwargs, - ) - - def text2ids(self, text, max_model_len, **kwargs): - """ - text to token ids - - Args: - text (str): text - - Returns: - List[int]: token ids list - """ - - add_special_tokens = kwargs.get("add_special_tokens") - if envs.FD_USE_HF_TOKENIZER: - tokens = self.tokenizer( - text, - return_tensors="np", - padding=True, - truncation=True, - ) - else: - text = [text] if isinstance(text, str) else text - - tokens = self.tokenizer( - text, - return_tensors="np", - padding=True, - truncation=True, - max_length=max_model_len, - add_special_tokens=add_special_tokens, - ) - - return tokens["input_ids"][0] - - def messages2ids(self, request, **kwargs): - """ - Convert multi-turn messages into ID sequences. - - Args: - messages (List[List[Dict[str, Any]]]): multi-turn messages. - - Returns: - List[int]: ID sequences - """ - message_dict = { - key: getattr(request, key, None) - for key in ["messages", "tools", "documents", "enable_thinking", "system"] - if getattr(request, key, None) is not None - } - if "add_generation_prompt" not in kwargs: - kwargs["add_generation_prompt"] = ( - request.add_generation_prompt if request.add_generation_prompt is not None else True - ) - spliced_message = self.tokenizer.apply_chat_template( - message_dict, - tokenize=False, - split_special_tokens=False, - add_special_tokens=False, - **kwargs, - ) - request.prompt_tokens = spliced_message - tokens = self.tokenizer.tokenize(spliced_message) - req_id = getattr(request, "request_id", None) - token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}") - return token_ids - - def ids2tokens(self, token_id, task_id): - """ - token ids to strings - - Args: - token_ids (List[int]): token ids - task_id (str): task id - - Returns: - List[str]: strings - """ - if envs.FD_USE_HF_TOKENIZER: - if task_id not in self.decode_status: - # history token ids & history token strings & befer decode str - self.decode_status[task_id] = [[], [], ""] - - previous_token_ids = self.decode_status[task_id][0] - decode_str = self.tokenizer.batch_decode( - [previous_token_ids + token_id], - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) - if isinstance(decode_str, list) and len(decode_str): - new_str = decode_str[0].replace(self.decode_status[task_id][2], "", 1) - self.decode_status[task_id][1].append(new_str) - self.decode_status[task_id][2] = decode_str[0] - else: - new_str = "" - self.decode_status[task_id][0] += token_id - return new_str - else: - if task_id not in self.decode_status: - # prefix offset & read offset & history token ids & history token strings - self.decode_status[task_id] = [0, 0, [], ""] - - prefix_offset = self.decode_status[task_id][0] - read_offset = self.decode_status[task_id][1] - previous_token_ids = self.decode_status[task_id][2] - previous_texts = self.decode_status[task_id][3] - decode_str, prefix_offset, read_offset = self.tokenizer.decode_token( - previous_token_ids + token_id, prefix_offset, read_offset - ) - self.decode_status[task_id][0] = prefix_offset - self.decode_status[task_id][1] = read_offset - self.decode_status[task_id][2] += token_id - self.decode_status[task_id][3] += decode_str - - return decode_str, previous_token_ids, previous_texts - - def _load_tokenizer(self): - """ - load tokenizer - - Returns: - tokenizer (AutoTokenizer) - """ - if envs.FD_USE_HF_TOKENIZER: - from transformers import AutoTokenizer - - return AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=False) - else: - from paddleformers.transformers import AutoTokenizer - - return AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True) - - def clear_request_status(self, task_id): - """ - clear request status - - Args: - task_id (str): task id - - Returns: - results_all (str): all token strings - """ - results_all = "" - if task_id in self.decode_status: - if envs.FD_USE_HF_TOKENIZER: - results_all = self.decode_status[task_id][2] - else: - results_all = "".join(self.decode_status[task_id][3]) - del self.decode_status[task_id] - return results_all - - def get_pad_id(self): - """ - get pad_token_id, if not pad_token_id, use eos_token - - Returns: - int: pad_token_id - """ - if isinstance(self.tokenizer, (LlamaTokenizer, Llama3Tokenizer)) and not self.tokenizer.pad_token_id: - return self.tokenizer.eos_token - return self.tokenizer.pad_token_id - - def pad_batch_data( - self, - insts, - pad_id=0, - return_seq_len=False, - return_array=True, - pad_style="right", - ): - """Pad the instances to the max sequence length in batch.""" - if len(insts) == 0: - padded_insts = np.array([[]], dtype=np.int64) if return_array else [[]] - if return_seq_len: - seq_len = np.array([], dtype=np.int64) if return_array else [] - return padded_insts, seq_len - return padded_insts - - max_len = max(map(len, insts)) - if pad_style == "left": - padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts] - else: - padded_insts = [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts] - if return_array: - padded_insts = np.array(padded_insts, dtype=np.int64).reshape([-1, max_len]) - - if return_seq_len: - seq_len = [len(inst) for inst in insts] - if return_array: - seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1) - return padded_insts, seq_len - return padded_insts - - def update_stop_seq(self, stop_sequences): - """ - Update stop sequences from request. - """ - stop_seqs = [] - for seq in stop_sequences: - if seq != self.tokenizer.eos_token_id: - stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq))) - stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False) - data_processor_logger.debug(f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}") - return stop_seqs, stop_seqs_len - - def update_bad_words(self, bad_words, bad_words_token_ids): - """Support bad words""" - - token_ids = bad_words_token_ids - - if token_ids is None: - token_ids = [] - for bad_word in bad_words: - # To prohibit words both at the beginning - # and in the middle of text - # (related to add_prefix_space tokenizer parameter) - for add_prefix_space in [False, True]: - prefix = " " if add_prefix_space else "" - prompt = prefix + bad_word.lstrip() - prompt_token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(prompt)) - - if len(prompt_token_ids) != 1: - if not add_prefix_space: - data_processor_logger.warning( - f"Skip bad_words: <{prompt}>." - f"Bad words should be a single token." - f"Got tokens: {prompt_token_ids}." - ) - continue - - if prompt_token_ids[0] > self.tokenizer.vocab_size: - if not add_prefix_space: - data_processor_logger.warning( - f"Skip bad_words: <{prompt}>." - f"All token id values should be satisfying:" - f" 0 <= token_id < {self.tokenizer.vocab_size}." - f"Got token: {prompt_token_ids}." - ) - continue - - if prompt_token_ids not in token_ids: - token_ids.extend(prompt_token_ids) - return token_ids diff --git a/fastdeploy/inter_communicator/zmq_server.py b/fastdeploy/inter_communicator/zmq_server.py index 7073edb48a5..c88fe96eb08 100644 --- a/fastdeploy/inter_communicator/zmq_server.py +++ b/fastdeploy/inter_communicator/zmq_server.py @@ -169,10 +169,7 @@ def pack_aggregated_data(self, data): if len(data) > 1: for response in data[1:]: result.add(response) - if not envs.ENABLE_V1_DATA_PROCESSOR: - result = ForkingPickler.dumps([result.to_dict()]) - else: - result = ForkingPickler.dumps([result]) + result = ForkingPickler.dumps([result.to_dict()]) return result def receive_json_once(self, block=False): @@ -303,10 +300,7 @@ def _send_response_per_query(self, req_id, data): if self.aggregate_send: result = self.pack_aggregated_data(new_data) else: - if not envs.ENABLE_V1_DATA_PROCESSOR: - result = ForkingPickler.dumps([response.to_dict() for response in new_data]) - else: - result = ForkingPickler.dumps(new_data) + result = ForkingPickler.dumps([response.to_dict() for response in new_data]) with self.response_token_lock: _zmq_metrics_stats = ZMQMetricsStats() @@ -349,13 +343,10 @@ def _send_batch_response(self, batch_data, worker_pid=None): metrics_address = self.address or self.worker_push_addresses.get(worker_pid, "unknown") try: - if not envs.ENABLE_V1_DATA_PROCESSOR: - result = msgpack.packb( - [[output.to_dict() for output in outputs] for outputs in batch_data], - default=_msgpack_default, - ) - else: - result = ForkingPickler.dumps(batch_data) + result = msgpack.packb( + [[output.to_dict() for output in outputs] for outputs in batch_data], + default=_msgpack_default, + ) result_len = len(result) # Only hold lock for the actual socket send diff --git a/tests/engine/test_common_engine.py b/tests/engine/test_common_engine.py index 5a6241c4433..69c6db2a753 100644 --- a/tests/engine/test_common_engine.py +++ b/tests/engine/test_common_engine.py @@ -1267,7 +1267,6 @@ def receive_json_once(self, block): with ( patch("fastdeploy.engine.common_engine.main_process_metrics", DummyMetrics()), - patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_DATA_PROCESSOR", False), patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", False), patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None), ): @@ -3277,7 +3276,6 @@ def __init__(self): with ( patch("fastdeploy.engine.common_engine.envs.ZMQ_SEND_BATCH_DATA", True), patch("fastdeploy.engine.common_engine.envs.FD_ENABLE_INTERNAL_ADAPTER", False), - patch("fastdeploy.engine.common_engine.envs.ENABLE_V1_DATA_PROCESSOR", False), patch("fastdeploy.engine.common_engine.main_process_metrics", DummyMetrics()), patch("fastdeploy.engine.common_engine.time.sleep", lambda *_: None), ): diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 8af369c396a..1b33405503f 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -20,7 +20,6 @@ from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock, Mock, patch -import numpy as np import paddle import fastdeploy.envs as envs @@ -307,51 +306,29 @@ async def test_create_chat_completion_request_id_and_v1_stream(self): self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock( side_effect=ParameterError("param", "bad") ) - with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start") as mock_trace: - resp = await self.chat_completion_handler.create_chat_completion( - ChatCompletionRequest( - messages=[{"role": "user", "content": "Hello"}], - request_id="abc", - stream=False, - ) + with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start") as mock_trace: + resp = await self.chat_completion_handler.create_chat_completion( + ChatCompletionRequest( + messages=[{"role": "user", "content": "Hello"}], + request_id="abc", + stream=False, ) + ) self.assertEqual(resp.error.param, "param") self.assertIn("bad", resp.error.message) self.assertEqual(mock_trace.call_args.kwargs["rid"], "chatcmpl-abc") self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(side_effect=RuntimeError("boom")) - with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start"): - resp = await self.chat_completion_handler.create_chat_completion( - ChatCompletionRequest( - messages=[{"role": "user", "content": "Hello"}], - request_id="err", - stream=False, - ) + with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start"): + resp = await self.chat_completion_handler.create_chat_completion( + ChatCompletionRequest( + messages=[{"role": "user", "content": "Hello"}], + request_id="err", + stream=False, ) + ) self.assertIn("generator error", resp.error.message) - self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(return_value=np.array([1, 2])) - stream_mock = Mock(return_value="streamed") - with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True): - with patch( - "fastdeploy.entrypoints.openai.serving_chat.Request.from_generic_request", - return_value={"metrics": {}, "prompt_tokens": "pt", "max_tokens": 3}, - ): - with patch("fastdeploy.entrypoints.openai.serving_chat.tracing.trace_req_start") as mock_trace: - with patch.object(self.chat_completion_handler, "chat_completion_stream_generator", stream_mock): - result = await self.chat_completion_handler.create_chat_completion( - ChatCompletionRequest( - messages=[{"role": "user", "content": "Hello"}], - user="user", - stream=True, - ) - ) - self.assertEqual(result, "streamed") - self.assertTrue(mock_trace.call_args.kwargs["rid"].startswith("chatcmpl-user-")) - self.assertEqual(stream_mock.call_args.args[3], [1, 2]) - async def test_create_chat_completion_full_and_waiting_errors(self): """Test full generator error and waiting error handling.""" self.chat_completion_handler.engine_client.is_master = True @@ -361,15 +338,14 @@ async def test_create_chat_completion_full_and_waiting_errors(self): self.chat_completion_handler.engine_client.semaphore.status = Mock(return_value="ok") self.chat_completion_handler.engine_client.format_and_add_data = AsyncMock(return_value=[1, 2]) - with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - with patch.object( - self.chat_completion_handler, - "chat_completion_full_generator", - AsyncMock(side_effect=RuntimeError("boom")), - ): - resp = await self.chat_completion_handler.create_chat_completion( - ChatCompletionRequest(messages=[{"role": "user", "content": "Hello"}], stream=False) - ) + with patch.object( + self.chat_completion_handler, + "chat_completion_full_generator", + AsyncMock(side_effect=RuntimeError("boom")), + ): + resp = await self.chat_completion_handler.create_chat_completion( + ChatCompletionRequest(messages=[{"role": "user", "content": "Hello"}], stream=False) + ) self.assertIn("full generator error", resp.error.message) with patch( diff --git a/tests/entrypoints/test_serving_completion.py b/tests/entrypoints/test_serving_completion.py index b76d798afc5..9c2beb678df 100644 --- a/tests/entrypoints/test_serving_completion.py +++ b/tests/entrypoints/test_serving_completion.py @@ -20,7 +20,6 @@ import numpy as np import paddle -import fastdeploy.envs as envs import fastdeploy.metrics.trace as tracing from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion from fastdeploy.utils import ErrorCode, ParameterError @@ -124,40 +123,21 @@ async def test_create_completion_branches(self): ec = _make_engine_client() ec.format_and_add_data = AsyncMock(side_effect=ParameterError("max_tokens", "bad")) serving = OpenAIServingCompletion(ec, None, "pid", None, -1) - with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - res = await _assert_error(self, serving, _make_request(prompt_token_ids=[1, 2]), param="max_tokens") + res = await _assert_error(self, serving, _make_request(prompt_token_ids=[1, 2]), param="max_tokens") ec.semaphore.release.assert_called_once() ec = _make_engine_client() - ec.format_and_add_data = AsyncMock(side_effect=ValueError("bad")) - serving = OpenAIServingCompletion(ec, None, "pid", None, -1) - - def fake_from_generic_request(_, request_id): - return {"prompt": "hi", "request_id": request_id, "prompt_tokens": [1], "max_tokens": 2, "metrics": {}} - - with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True): - with patch( - "fastdeploy.entrypoints.openai.serving_completion.Request.from_generic_request", - side_effect=fake_from_generic_request, - ): - await _assert_error(self, serving, _make_request(prompt="hi"), code=ErrorCode.INVALID_VALUE) - ec = _make_engine_client() ec.format_and_add_data = AsyncMock(return_value=np.array([1, 2])) serving = OpenAIServingCompletion(ec, None, "pid", None, -1) - with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - with patch.object(serving, "completion_full_generator", AsyncMock(side_effect=RuntimeError("boom"))): - await _assert_error( - self, serving, _make_request(prompt="hi"), contains="completion_full_generator error" - ) + with patch.object(serving, "completion_full_generator", AsyncMock(side_effect=RuntimeError("boom"))): + await _assert_error(self, serving, _make_request(prompt="hi"), contains="completion_full_generator error") serving = OpenAIServingCompletion(_make_engine_client(), None, "pid", None, -1) - with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - with patch.object(serving, "completion_stream_generator", return_value="streamed"): - res = await serving.create_completion(_make_request(request_id="req123", stream=True)) + with patch.object(serving, "completion_stream_generator", return_value="streamed"): + res = await serving.create_completion(_make_request(request_id="req123", stream=True)) self.assertEqual(res, "streamed") serving = OpenAIServingCompletion(_make_engine_client(), None, "pid", None, -1) - with patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - await _assert_error( - self, serving, _StreamRaiser(**_make_request().__dict__), contains="create_completion error" - ) + await _assert_error( + self, serving, _StreamRaiser(**_make_request().__dict__), contains="create_completion error" + ) async def test_completion_full_generator_branches(self): ec = _make_engine_client() diff --git a/tests/input/test_preprocess.py b/tests/input/test_preprocess.py index b4659261a8e..4196b729fbb 100644 --- a/tests/input/test_preprocess.py +++ b/tests/input/test_preprocess.py @@ -63,10 +63,8 @@ def test_create_processor_text_normal_path(self): mock_dp = MagicMock() with ( patch.dict("sys.modules", {"fastdeploy.plugins": None, "fastdeploy.plugins.input_processor": None}), - patch("fastdeploy.input.preprocess.envs") as mock_envs, patch("fastdeploy.input.text_processor.DataProcessor", return_value=mock_dp), ): - mock_envs.ENABLE_V1_DATA_PROCESSOR = False pp.create_processor() self.assertIs(pp.processor, mock_dp) diff --git a/tests/input/v1/test_ernie4_5_processor.py b/tests/input/v1/test_ernie4_5_processor.py deleted file mode 100644 index 13cfdb10747..00000000000 --- a/tests/input/v1/test_ernie4_5_processor.py +++ /dev/null @@ -1,448 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import unittest -from unittest.mock import MagicMock, patch - -import numpy as np - -from fastdeploy.engine.request import Request, RequestOutput - -MODULE_PATH = "fastdeploy.input.v1.ernie4_5_processor" - -from fastdeploy.input.v1.ernie4_5_processor import _SAMPLING_EPS, Ernie4_5Processor - - -class MockTokenizer: - """A simple mock tokenizer used to simulate tokenization behavior in unit tests.""" - - def __init__(self): - self.bos_token = "" - self.bos_token_id = 101 - self.eos_token = "" - self.eos_token_id = 102 - self.pad_token_id = 0 - self.vocab_size = 200 - # Non-None value indicates chat_template support - self.chat_template = "dummy" - - def tokenize(self, text): - """Return multi-token output for 'multi*' to test branching; otherwise return single-token.""" - if text.startswith("multi"): - return ["multi", "word"] - return [text] - - def convert_tokens_to_ids(self, tokens): - """Map tokens to synthetic IDs for branch coverage.""" - mapping = { - "bad": 5, - " bad": 6, - "multi": 7, - "word": 8, - "oov": 250, - " oov": 251, - "hello": 9, - "REASON": 42, - } - return [mapping.get(t, 1) for t in tokens] - - def decode(self, token_ids, **kwargs): - """Simple decode implementation returning a space-separated string.""" - return " ".join(str(t) for t in token_ids) - - def decode_token(self, token_ids, prefix_offset, read_offset): - """Incremental decode used to test streaming behavior.""" - new_tokens = token_ids[read_offset:] - decode_str = " ".join(str(t) for t in new_tokens) - new_read_offset = len(token_ids) - return decode_str, prefix_offset, new_read_offset - - def apply_chat_template(self, request_or_messages, tokenize, split_special_tokens, add_special_tokens, **kwargs): - """Minimal chat template implementation used by messages2ids.""" - if isinstance(request_or_messages, dict) and "messages" in request_or_messages: - return " | ".join(m["content"] for m in request_or_messages["messages"]) - return str(request_or_messages) - - -class ErnieX1ReasoningParser: - """Mock reasoning parser to trigger reasoning-related branches during streaming.""" - - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - def extract_reasoning_content(self, full_text, response_dict, model_status): - """Extract reasoning content for non-streaming responses.""" - - class ReasoningContent: - def __init__(self): - self.reasoning_content = "mock_reasoning" - self.content = "mock_content" - - return ReasoningContent() - - def extract_reasoning_content_streaming( - self, - previous_texts, - full_text, - delta_text, - previous_token_ids, - all_token_ids, - delta_token_ids, - model_status, - ): - """Return a simple object with reasoning_content to cover reasoning branch.""" - - class ReasoningDelta: - def __init__(self, content): - self.reasoning_content = content - - return ReasoningDelta(delta_text) - - -class MockToolParser: - """Mock tool parser to cover tool-related branches in both normal and streaming responses.""" - - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - class ToolDelta: - """Simple container representing detected tool calls.""" - - def __init__(self): - self.tool_calls = [{"name": "fake_tool"}] - self.tools_called = True - self.content = "tool_content" - - def extract_tool_calls(self, full_text, response_dict): - """Used in process_response and process_response_obj_normal.""" - return MockToolParser.ToolDelta() - - def extract_tool_calls_streaming( - self, - previous_texts, - full_text, - delta_text, - previous_token_ids, - all_token_ids, - delta_token_ids, - response_dict, - ): - """Used in process_response_obj_streaming.""" - return MockToolParser.ToolDelta() - - -class TestErnie4_5Processor(unittest.TestCase): - """Unit tests for Ernie4_5Processor focusing on preprocessing and postprocessing logic.""" - - def setUp(self): - """Patch external dependencies: tokenizer, generation config, eos token resolution.""" - self.gen_patcher = patch(f"{MODULE_PATH}.GenerationConfig.from_pretrained", return_value=MagicMock()) - self.tokenizer_patcher = patch( - f"{MODULE_PATH}.Ernie4_5Tokenizer.from_pretrained", side_effect=lambda path: MockTokenizer() - ) - self.eos_patcher = patch( - "paddleformers.cli.utils.llm_utils.get_eos_token_id", - side_effect=lambda tokenizer, cfg: [tokenizer.eos_token_id], - ) - - self.gen_patcher.start() - self.tokenizer_patcher.start() - self.eos_patcher.start() - - def tearDown(self): - """Stop all patches after each test.""" - self.gen_patcher.stop() - self.tokenizer_patcher.stop() - self.eos_patcher.stop() - - def _make_processor(self, reasoning=False, tool=False): - """Helper for creating a processor with optional reasoning/tool parser support.""" - reasoning_cls = ErnieX1ReasoningParser if reasoning else None - tool_cls = MockToolParser if tool else None - proc = Ernie4_5Processor("dummy-model", reasoning_parser_obj=reasoning_cls, tool_parser_obj=tool_cls) - proc._apply_default_parameters = lambda req: req - proc.model_status_dict = {"req-1": "think_start"} - return proc - - def test_update_bad_words(self): - """Verify filtering, multi-token skipping, and OOV behavior in update_bad_words.""" - proc = self._make_processor() - - bad_words = ["bad", "multi", "oov"] - token_ids = proc.update_bad_words(bad_words, bad_words_token_ids=None) - - self.assertEqual(token_ids, [5, 6, 1]) - - def test_process_request_dict_with_prompt_string(self): - """Test prompt-based tokenization, truncation, and temperature/top_p correction.""" - proc = self._make_processor() - req = { - "request_id": "test_0", - "prompt": "hello", - "temperature": 0.0, - "top_p": 0.0, - } - req = Request.from_dict(req) - - processed = proc.process_request_dict(req, max_model_len=10) - - self.assertTrue(hasattr(processed, "eos_token_ids")) - self.assertEqual(processed.eos_token_ids, [proc.tokenizer.eos_token_id]) - - expected_ids = proc.tokenizer.convert_tokens_to_ids(proc.tokenizer.tokenize("hello")) - self.assertEqual(processed.prompt_token_ids, expected_ids) - - self.assertEqual(processed.sampling_params.max_tokens, max(1, 10 - len(expected_ids))) - self.assertEqual(processed.sampling_params.temperature, 1) - self.assertEqual(processed.sampling_params.top_k, 1) - self.assertAlmostEqual(processed.sampling_params.top_p, _SAMPLING_EPS) - self.assertEqual(processed.prompt_tokens, "hello") - - def test_pad_batch_data_right_and_left_and_empty(self): - """Test left/right padding and empty input behavior.""" - proc = self._make_processor() - - insts = [[1, 2], [3]] - - padded, seq_len = proc.pad_batch_data( - insts, pad_id=0, return_seq_len=True, return_array=True, pad_style="right" - ) - np.testing.assert_array_equal(padded, np.array([[1, 2], [3, 0]], dtype=np.int64)) - np.testing.assert_array_equal(seq_len, np.array([[2], [1]], dtype=np.int64)) - - padded_left, seq_len_left = proc.pad_batch_data( - insts, pad_id=0, return_seq_len=True, return_array=True, pad_style="left" - ) - np.testing.assert_array_equal(padded_left, np.array([[1, 2], [0, 3]], dtype=np.int64)) - np.testing.assert_array_equal(seq_len_left, np.array([[2], [1]], dtype=np.int64)) - - padded_empty, seq_len_empty = proc.pad_batch_data( - [], pad_id=0, return_seq_len=True, return_array=True, pad_style="right" - ) - np.testing.assert_array_equal(padded_empty, np.array([[]], dtype=np.int64)) - np.testing.assert_array_equal(seq_len_empty, np.array([], dtype=np.int64)) - - def test_process_response_obj_streaming_with_reasoning_and_tool(self): - """Ensure streaming mode handles reasoning and tool-call parsing correctly.""" - proc = self._make_processor(reasoning=True, tool=True) - - response = { - "finished": True, - "request_id": "req-1", - "outputs": {"token_ids": [10, 11]}, - } - response = RequestOutput.from_dict(response) - - result = proc.process_response_obj_streaming(response, enable_thinking=False, include_stop_str_in_output=False) - - outputs = result.outputs - - self.assertTrue(hasattr(outputs, "completion_tokens")) - self.assertTrue(hasattr(outputs, "text")) - self.assertEqual(outputs.completion_tokens, outputs.reasoning_content) - - self.assertTrue(hasattr(outputs, "reasoning_token_num")) - self.assertGreaterEqual(outputs.reasoning_token_num, 0) - - self.assertTrue(hasattr(outputs, "delta_message")) - delta_msg = outputs.delta_message - self.assertTrue(hasattr(delta_msg, "tool_calls")) - - self.assertNotIn("req-1", proc.decode_status) - self.assertNotIn("req-1", proc.tool_parser_dict) - - def test_update_stop_seq(self): - """Test stop sequence tokenization and padding.""" - proc = self._make_processor() - - stop_seqs, stop_lens = proc.update_stop_seq("stop") - self.assertIsInstance(stop_seqs, list) - self.assertIsInstance(stop_lens, list) - - stop_seqs2, stop_lens2 = proc.update_stop_seq(["stop", "hello"]) - self.assertEqual(len(stop_seqs2), 2) - self.assertEqual(len(stop_lens2), 2) - - def test_process_request_chat_template_kwargs(self): - """Test chat_template_kwargs application inside process_request_dict.""" - - proc = self._make_processor() - - request = { - "request_id": "test_0", - "messages": [{"role": "user", "content": "hello"}], - "temperature": 0.5, - "top_p": 0.5, - } - request = Request.from_dict(request) - - processed = proc.process_request_dict(request, max_model_len=20, chat_template_kwargs={"extra": "VALUE"}) - - self.assertEqual(processed.eos_token_ids, [proc.tokenizer.eos_token_id]) - - expected_ids = proc.tokenizer.convert_tokens_to_ids(proc.tokenizer.tokenize("hello")) - self.assertIsNotNone(processed.prompt_token_ids) - self.assertEqual(processed.prompt_token_ids, expected_ids) - - self.assertTrue(hasattr(processed.sampling_params, "max_tokens")) - self.assertEqual(processed.sampling_params.max_tokens, max(1, 20 - len(expected_ids))) - - def test_process_request_dict_chat_template_kwargs(self): - """Test chat_template_kwargs insertion in process_request_dict.""" - proc = self._make_processor() - - req = { - "request_id": "test_0", - "messages": [{"role": "user", "content": "hey"}], - "chat_template_kwargs": {"A": "B"}, - "temperature": 0.5, - "top_p": 0.5, - } - req = Request.from_dict(req) - req.chat_template_kwargs = {"A": "B"} - - result = proc.process_request_dict(req, max_model_len=30) - - self.assertTrue(hasattr(result, "prompt_token_ids")) - self.assertEqual(getattr(result, "A"), "B") - - def test_init_generation_config_exception(self): - """Test fallback behavior when GenerationConfig loading fails.""" - with patch(f"{MODULE_PATH}.GenerationConfig.from_pretrained", side_effect=Exception("fail")): - proc = self._make_processor() - self.assertIsNone(proc.generation_config) - - # def test_process_response_with_tool_parser(self): - # """Verify tool_call extraction in process_response.""" - # proc = self._make_processor(tool=True) - - # class RespObj: - # """Mock response carrying token_ids and index for testing.""" - - # def __init__(self): - # self.request_id = "reqx" - # self.outputs = MagicMock() - # self.outputs.token_ids = [9, proc.tokenizer.eos_token_id] - # self.outputs.index = 0 - - # resp = RespObj() - # result = proc.process_response(resp) - - # self.assertTrue(hasattr(result.outputs, "tool_calls")) - # self.assertEqual(result.outputs.tool_calls[0]["name"], "fake_tool") - - def test_process_response_obj_normal_with_tool(self): - """Verify tool_call extraction in normal (non-streaming) response mode.""" - proc = self._make_processor(tool=True) - - resp = { - "finished": True, - "request_id": "task-99", - "outputs": {"token_ids": [10, 11], "text": ""}, - } - resp = RequestOutput.from_dict(resp) - - result = proc.process_response_obj_normal(resp, enable_thinking=False, include_stop_str_in_output=False) - - self.assertTrue(hasattr(result.outputs, "tool_calls")) - self.assertEqual(result.outputs.tool_calls[0]["name"], "fake_tool") - - def test_process_request_greedy_sets_top_k(self): - """process_request with temperature=0 should set top_k=1 for greedy decoding.""" - proc = self._make_processor() - proc.messages2ids = MagicMock(return_value=[9]) - - request = Request.from_dict( - { - "request_id": "test_greedy", - "prompt": "hello", - "temperature": 0.0, - "top_p": 0.5, - } - ) - result = proc.process_request(request, max_model_len=10) - self.assertEqual(result.get("temperature"), 1) - self.assertEqual(result.get("top_k"), 1) - - def test_process_request(self): - """Test process_request method with various input types.""" - proc = self._make_processor() - - # Test with prompt string - request = Request.from_dict( - { - "request_id": "test_1", - "prompt": "hello", - "temperature": 0.5, - "top_p": 0.5, - } - ) - result = proc.process_request(request, max_model_len=10) - self.assertEqual(result.prompt_token_ids, [9]) - - # Test with prompt token ids - request = Request.from_dict( - { - "request_id": "test_2", - "prompt_token_ids": [1, 2, 3], - "temperature": 0.5, - "top_p": 0.5, - } - ) - result = proc.process_request(request, max_model_len=5) - self.assertEqual(result.prompt_token_ids, [1, 2, 3]) - - # Test with messages - mock the messages2ids method to avoid dict attribute error - proc.messages2ids = MagicMock(return_value=[9]) - request = Request.from_dict( - { - "request_id": "test_3", - "messages": [{"role": "user", "content": "hello"}], - "temperature": 0.5, - "top_p": 0.5, - } - ) - result = proc.process_request(request, max_model_len=10) - self.assertEqual(result.prompt_token_ids, [9]) - proc.messages2ids.assert_called_once() - - def test_process_response(self): - """Test process_response method with various scenarios.""" - # Test without reasoning parser to avoid model_status_dict dependency - proc = self._make_processor(reasoning=False) - - # Test basic response - response = RequestOutput.from_dict( - {"request_id": "test_4", "outputs": {"token_ids": [10, 11, proc.tokenizer.eos_token_id], "index": 0}} - ) - result = proc.process_response(response) - self.assertEqual(result.outputs.text, "10 11") - - # Test another response to ensure consistency - response = RequestOutput.from_dict( - {"request_id": "test_5", "outputs": {"token_ids": [20, 21, proc.tokenizer.eos_token_id], "index": 0}} - ) - result = proc.process_response(response) - self.assertEqual(result.outputs.text, "20 21") - - # Test response without eos_token at the end - response = RequestOutput.from_dict({"request_id": "test_6", "outputs": {"token_ids": [30, 31], "index": 0}}) - result = proc.process_response(response) - self.assertEqual(result.outputs.text, "30 31") - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/v1/test_ernie_processor.py b/tests/input/v1/test_ernie_processor.py deleted file mode 100644 index 437e4029a5d..00000000000 --- a/tests/input/v1/test_ernie_processor.py +++ /dev/null @@ -1,162 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import unittest -from unittest.mock import MagicMock, patch - -from fastdeploy.engine.request import Request, RequestOutput -from fastdeploy.input.v1.ernie4_5_processor import Ernie4_5Processor - - -class MockReasoningParser: - def get_model_status(self, prompt_token_ids): - return "think_start" - - -class TestErnie4_5ProcessorProcessResponseDictStreaming(unittest.TestCase): - def setUp(self): - # 创建 Ernie4_5Processor 实例的模拟对象 - with patch.object(Ernie4_5Processor, "__init__", return_value=None) as mock_init: - self.processor = Ernie4_5Processor("model_path") - mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}") - - # 设置必要的属性 - self.processor.tokenizer = MagicMock() - self.processor.tokenizer.eos_token_id = 1 - self.processor.decode_status = {"test": []} - self.processor.reasoning_end_dict = {} - self.processor.tool_parser_dict = {} - self.processor.generation_config = MagicMock() - self.processor.eos_token_ids = [1] - self.processor.reasoning_parser = MockReasoningParser() - self.processor.model_status_dict = {"request-id_0": "think_start", "test": "think_start"} - - # 模拟 ids2tokens 方法 - def mock_ids2tokens(token_ids, task_id): - self.processor.decode_status[task_id] = "mock_decode_status" - return "delta_text", [2, 3], "previous_texts" - - self.processor.ids2tokens = mock_ids2tokens - - def mock_messages2ids(request, **kwargs): - if "chat_template" in kwargs: - return [1] - else: - return [0] - - def mock_apply_default_parameters(request): - return request - - self.processor.messages2ids = mock_messages2ids - self.processor._apply_default_parameters = mock_apply_default_parameters - - # 模拟推理解析器 - self.mock_reasoning_parser = MagicMock() - self.mock_reasoning_parser.__class__.__name__ = "ErnieX1ReasoningParser" - # self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = ("reasoning", "text") - self.processor.reasoning_parser = self.mock_reasoning_parser - - # 模拟工具解析器 - self.mock_tool_parser = MagicMock() - self.mock_tool_parser.extract_tool_calls_streaming.return_value = None - self.mock_tool_parser_obj = MagicMock() - self.mock_tool_parser_obj.return_value = self.mock_tool_parser - self.processor.tool_parser_obj = self.mock_tool_parser_obj - - def test_process_response_obj_streaming_normal_case(self): - """测试正常情况下的流式响应处理""" - # 准备输入 - response_dict = {"finished": False, "request_id": "test", "outputs": {"token_ids": [4, 5]}} - kwargs = {"enable_thinking": True} - response = RequestOutput.from_dict(response_dict) - - # 调用方法 - result = self.processor.process_response_obj_streaming(response, **kwargs) - - # 验证结果 - self.assertEqual(result.outputs.completion_tokens, "delta_text") - - def test_process_request_dict(self): - request_dict = { - "request_id": "123", - "messages": [{"role": "user", "content": "Hello!"}], - "chat_template_kwargs": {"chat_template": "Hello!"}, - "eos_token_ids": [1], - "temperature": 1, - "top_p": 1, - } - request = Request.from_dict(request_dict) - request.chat_template_kwargs = {"chat_template": "Hello!"} - result = self.processor.process_request_dict(request, 100) - self.assertEqual(result.prompt_token_ids, [1]) - - def test_process_response_obj_normal(self): - mock_tokens = ["reasoning", "token", "list"] - self.processor.tokenizer.tokenize = MagicMock(return_value=mock_tokens) - self.processor.reasoning_parser.extract_reasoning_content = MagicMock( - return_value=("Mock reasoning content", "Mock final text") - ) - - self.processor.tool_parser_obj = None - - response_dict = { - "request_id": "request-id_0", - "outputs": {"token_ids": [2, 3, 4, 5, 1], "text": "Initial text", "top_logprobs": []}, - # "finish_reason": "stop", - "finished": True, - } - response = RequestOutput.from_dict(response_dict) - kwargs = {"enable_thinking": True} - - with patch("fastdeploy.input.ernie4_5_processor.data_processor_logger"): - result = self.processor.process_response_obj_normal(response, **kwargs) - - self.mock_reasoning_parser.extract_reasoning_content.assert_called_once() - self.assertEqual(result.outputs.reasoning_content, "Mock reasoning content") - self.assertEqual(result.outputs.reasoning_token_num, len(mock_tokens)) - self.assertEqual(result.outputs.text, "Mock final text") - self.assertTrue(hasattr(result.outputs, "completion_tokens")) - - def test_think_status(self): - """测试 思考机制""" - request = { - "prompt": "hello", - "request_id": "test_1", - "prompt_token_ids": [1, 2, 3], - "temperature": 0.7, - "top_p": 0.9, - } - request = Request.from_dict(request) - self.processor.reasoning_parser = MagicMock() - self.processor.reasoning_parser.get_model_status.return_value = "think_start" - self.processor.model_status_dict = {} - self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(request.enable_thinking, True) - - request = { - "prompt": "hello", - "request_id": "test", - "prompt_token_ids": [1, 2, 3], - "temperature": 0.7, - "top_p": 0.9, - } - request = Request.from_dict(request) - self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(request.enable_thinking, True) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/v1/test_ernie_vl_processor.py b/tests/input/v1/test_ernie_vl_processor.py deleted file mode 100644 index 132b423f3cb..00000000000 --- a/tests/input/v1/test_ernie_vl_processor.py +++ /dev/null @@ -1,1460 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import unittest -from types import SimpleNamespace -from unittest.mock import MagicMock, patch - -import numpy as np -from PIL import Image - -from fastdeploy.engine.request import CompletionOutput, Request, RequestOutput -from fastdeploy.engine.sampling_params import SamplingParams -from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer -from fastdeploy.input.utils import IDS_TYPE_FLAG -from fastdeploy.input.v1.ernie4_5_vl_processor import Ernie4_5_VLProcessor -from fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import ( - AdaptiveImageProcessor, -) -from fastdeploy.input.v1.ernie4_5_vl_processor.process import DataProcessor - - -class MockReasoningParser: - def get_model_status(self, prompt_token_ids): - return "think_start" - - -class TestErnie4_5VLProcessorProcessResponseDictStreaming(unittest.TestCase): - def setUp(self): - # Create mock object for Ernie4_5Processor instance - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None) as mock_init: - self.processor = Ernie4_5_VLProcessor("model_path") - mock_init.side_effect = lambda *args, **kwargs: print(f"__init__ called with {args}, {kwargs}") - - # Set necessary attributes - self.processor.tokenizer = MagicMock() - self.processor.tokenizer.eos_token_id = 1 - self.processor.decode_status = {"test": []} - self.processor.reasoning_end_dict = {} - self.processor.tool_parser_dict = {} - self.processor.generation_config = MagicMock() - self.processor.eos_token_ids = [1] - self.processor.reasoning_parser = MockReasoningParser() - self.processor.model_status_dict = {"test": "think_start"} - self.processor.ernie4_5_processor = MagicMock() - - # Mock ids2tokens method - def mock_ids2tokens(token_ids, task_id): - return "delta_text", [2, 3], "previous_texts" - - self.processor.ids2tokens = mock_ids2tokens - - def mock_request2ids(request, **kwargs): - return {"input_ids": np.array([1, 2, 3]), "prompt_token_ids": [0]} - - def mock_check_mm_limits(item): - pass - - def mock_apply_default_parameters(request): - return request - - def mock_pack_outputs(outputs): - # Ensure input_ids is numpy array if it exists - result = outputs.copy() if isinstance(outputs, dict) else outputs - if isinstance(result, dict): - if "input_ids" in result and isinstance(result["input_ids"], list): - result["input_ids"] = np.array(result["input_ids"]) - if "token_type_ids" in result and isinstance(result["token_type_ids"], list): - result["token_type_ids"] = np.array(result["token_type_ids"]) - if "position_ids" in result and isinstance(result["position_ids"], list): - result["position_ids"] = np.array(result["position_ids"]) - return result - - def mock_prompt_token_ids2outputs(request): - return { - "input_ids": np.array([1, 1, 1]), - "token_type_ids": np.array([0, 0, 0]), - "position_ids": np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]]), - "images": [], - "grid_thw": [], - "image_type_ids": [], - "cur_position": 3, - } - - self.processor._apply_default_parameters = mock_apply_default_parameters - self.processor._check_mm_limits = mock_check_mm_limits - self.processor.ernie4_5_processor.request2ids = mock_request2ids - self.processor.ernie4_5_processor.prompt_token_ids2outputs = mock_prompt_token_ids2outputs - self.processor.pack_outputs = mock_pack_outputs - - # Mock reasoning parser - self.mock_reasoning_parser = MagicMock() - self.mock_reasoning_parser.extract_reasoning_content_streaming.return_value = None - self.processor.reasoning_parser = self.mock_reasoning_parser - - # Mock tool parser - self.mock_tool_parser = MagicMock() - self.mock_tool_parser.extract_tool_calls_streaming.return_value = None - self.mock_tool_parser_obj = MagicMock() - self.mock_tool_parser_obj.return_value = self.mock_tool_parser - self.processor.tool_parser_obj = self.mock_tool_parser_obj - - def test_think_status(self): - """测试 思考机制""" - request = { - "prompt": "hello", - "request_id": "test_1", - "prompt_token_ids": [1, 2, 3], - } - request = Request.from_dict(request) - self.processor.reasoning_parser = MagicMock() - self.processor.reasoning_parser.get_model_status.return_value = "think_start" - self.processor.model_status_dict = {} - self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(request.enable_thinking, True) - - request = { - "prompt": "hello", - "request_id": "test", - "prompt_token_ids": [1, 2, 3], - } - request = Request.from_dict(request) - self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(request.enable_thinking, True) - - def test_init(self): - """Test __init__ method""" - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.data_processor_logger"): - mock_dp = MagicMock() - mock_dp.image_patch_id = 1001 - mock_dp.spatial_conv_size = 14 - mock_dp.tokenizer = MagicMock() - mock_dp.tokenizer.pad_token_id = 0 - mock_dp.eval = MagicMock() - - with patch( - "fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.DataProcessor" - ) as mock_dp_class: - mock_dp_class.return_value = mock_dp - with patch( - "fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.GenerationConfig" - ) as mock_gen_config: - mock_gen_config.from_pretrained.return_value = MagicMock() - with patch("paddleformers.cli.utils.llm_utils.get_eos_token_id") as mock_get_eos: - mock_get_eos.return_value = [1, 2] - - # Test normal initialization - mock_reasoning_parser_class = MagicMock() - processor = Ernie4_5_VLProcessor( - "model_path", - limit_mm_per_prompt={"image": 2, "video": 1}, - mm_processor_kwargs={"spatial_conv_size": 14}, - reasoning_parser_obj=lambda tokenizer: mock_reasoning_parser_class, - tool_parser_obj=MagicMock(), - enable_processor_cache=True, - ) - - self.assertEqual(processor.image_patch_id, 1001) - self.assertEqual(processor.spatial_conv_size, 14) - self.assertIsNotNone(processor.tokenizer) - self.assertIsNotNone(processor.generation_config) - self.assertEqual(processor.eos_token_ids, [1, 2]) - self.assertEqual(processor.limit_mm_per_prompt["image"], 2) - self.assertEqual(processor.limit_mm_per_prompt["video"], 1) - mock_dp.eval.assert_called_once() - - # Test with generation config exception - mock_gen_config.from_pretrained.side_effect = Exception("Config not found") - processor2 = Ernie4_5_VLProcessor("model_path") - self.assertIsNone(processor2.generation_config) - - # Test with reasoning_parser_obj - mock_reasoning_parser = MagicMock() - processor3 = Ernie4_5_VLProcessor( - "model_path", reasoning_parser_obj=lambda tokenizer: mock_reasoning_parser - ) - self.assertIsNotNone(processor3.reasoning_parser) - - def test_parse_processor_kwargs(self): - """Test _parse_processor_kwargs with various inputs""" - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None): - processor = Ernie4_5_VLProcessor("model_path") - processor._parse_processor_kwargs = Ernie4_5_VLProcessor._parse_processor_kwargs.__get__( - processor, Ernie4_5_VLProcessor - ) - - # Test with valid kwargs - valid_kwargs = { - "spatial_conv_size": 14, - "temporal_conv_size": 2, - "image_min_pixels": 1000, - "image_max_pixels": 10000, - } - result = processor._parse_processor_kwargs(valid_kwargs) - self.assertEqual(result, valid_kwargs) - - # Test with invalid type (implementation catches exception and returns empty dict) - invalid_kwargs = {"spatial_conv_size": "invalid"} # Should be int - result = Ernie4_5_VLProcessor._parse_processor_kwargs(processor, invalid_kwargs) - self.assertEqual(result, {}) - - # Test with non-dict input (implementation catches exception and returns empty dict) - result = Ernie4_5_VLProcessor._parse_processor_kwargs(processor, "not a dict") - self.assertEqual(result, {}) - - # Test exception handling with None - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.data_processor_logger"): - result = processor._parse_processor_kwargs(None) - self.assertEqual(result, {}) - - def test_parse_limits(self): - """Test _parse_limits with various inputs""" - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None): - processor = Ernie4_5_VLProcessor("model_path") - processor._parse_limits = Ernie4_5_VLProcessor._parse_limits.__get__(processor, Ernie4_5_VLProcessor) - - # Test with valid limits - valid_limits = {"image": 5, "video": 3} - result = processor._parse_limits(valid_limits) - self.assertEqual(result["image"], 5) - self.assertEqual(result["video"], 3) - self.assertEqual(result["audio"], 1) # Default value - - # Test with empty input (None) - result = processor._parse_limits(None) - self.assertEqual(result["image"], 1) - self.assertEqual(result["video"], 1) - self.assertEqual(result["audio"], 1) - - # Test with invalid type (implementation catches exception and returns default limits) - result = Ernie4_5_VLProcessor._parse_limits(processor, "not a dict") - self.assertEqual(result["image"], 1) - self.assertEqual(result["video"], 1) - self.assertEqual(result["audio"], 1) - - def test_check_mm_limits(self): - """Test _check_mm_limits with various inputs""" - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None): - processor = Ernie4_5_VLProcessor("model_path") - processor._check_mm_limits = Ernie4_5_VLProcessor._check_mm_limits.__get__(processor, Ernie4_5_VLProcessor) - - # Test with dict input (should not raise) - processor.limit_mm_per_prompt = {"image": 2, "video": 1} - mm_data = {"image": [1, 2], "video": [1]} - processor._check_mm_limits(mm_data) - - # Test with messages input (should not raise) - messages = [ - {"role": "user", "content": [{"type": "image", "data": "img1"}]}, - {"role": "user", "content": [{"type": "video", "data": "vid1"}]}, - ] - processor._check_mm_limits(messages) - - # Test when limit is exceeded (should raise ValueError) - processor.limit_mm_per_prompt = {"image": 1, "video": 1} - mm_data = {"image": [1, 2, 3], "video": []} # 3 images, limit is 1 - with self.assertRaises(ValueError) as context: - processor._check_mm_limits(mm_data) - self.assertIn("Too many image items", str(context.exception)) - - def test_process_request_dict(self): - """Test process_request_dict method""" - # from fastdeploy.engine.request import Request - - # Mock the process_request_dict method - self.processor.process_request_dict = MagicMock() - - # Create a mock Request object - mock_request = MagicMock(spec=Request) - mock_request.to_dict.return_value = {"messages": [{"role": "user", "content": "Hello"}]} - - # Mock Request.from_dict to return a mock request - with patch.object(Request, "from_dict") as mock_from_dict: - mock_result_request = MagicMock(spec=Request) - mock_from_dict.return_value = mock_result_request - - self.processor.process_request_dict(mock_request, max_model_len=100, chat_template_kwargs={"key": "value"}) - - # Verify process_request_dict was called - self.processor.process_request_dict.assert_called_once() - - def test_get_pad_id(self): - """Test get_pad_id method""" - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None): - processor = Ernie4_5_VLProcessor("model_path") - processor.tokenizer = MagicMock() - processor.tokenizer.pad_token_id = 100 - processor.get_pad_id = Ernie4_5_VLProcessor.get_pad_id.__get__(processor, Ernie4_5_VLProcessor) - - result = processor.get_pad_id() - self.assertEqual(result, 100) - - def test_load_tokenizer(self): - """Test _load_tokenizer method""" - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None): - processor = Ernie4_5_VLProcessor("model_path") - mock_tokenizer = MagicMock() - processor.ernie4_5_processor = MagicMock() - processor.ernie4_5_processor.tokenizer = mock_tokenizer - processor._load_tokenizer = Ernie4_5_VLProcessor._load_tokenizer.__get__(processor, Ernie4_5_VLProcessor) - - processor._load_tokenizer() - self.assertEqual(processor.tokenizer, mock_tokenizer) - - def test_append_completion_tokens(self): - """Test append_completion_tokens method""" - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None): - processor = Ernie4_5_VLProcessor("model_path") - processor.append_completion_tokens = Ernie4_5_VLProcessor.append_completion_tokens.__get__( - processor, Ernie4_5_VLProcessor - ) - - multimodal_inputs = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]], - "cur_position": 3, - } - completion_token_ids = [10, 11, 12] - - processor.append_completion_tokens(multimodal_inputs, completion_token_ids) - - self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 10, 11, 12]) - self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0, 0]) - self.assertEqual(len(multimodal_inputs["position_ids"]), 6) - self.assertEqual(multimodal_inputs["cur_position"], 6) - - def test_pack_outputs(self): - """Test pack_outputs with and without images""" - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None): - processor = Ernie4_5_VLProcessor("model_path") - processor.image_patch_id = 1001 - processor.ernie4_5_processor = SimpleNamespace(mm_num_tokens=lambda **kwargs: 123) - processor.pack_outputs = Ernie4_5_VLProcessor.pack_outputs.__get__(processor, Ernie4_5_VLProcessor) - # Test with images - outs_with_images = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]], - "images": [np.array([[1, 2], [3, 4]])], - "grid_thw": [np.array([[1, 2, 2]])], - "image_type_ids": [0], - } - - result = processor.pack_outputs(outs_with_images) - self.assertIsNotNone(result["images"]) - self.assertIsNotNone(result["grid_thw"]) - self.assertIsNotNone(result["image_type_ids"]) - self.assertEqual(result["image_patch_id"], 1001) - self.assertIsInstance(result["input_ids"], np.ndarray) - self.assertIsInstance(result["token_type_ids"], np.ndarray) - self.assertIsInstance(result["position_ids"], np.ndarray) - - # Test without images - outs_without_images = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [[0, 0, 0], [1, 1, 1], [2, 2, 2]], - "images": [], - "grid_thw": [], - "image_type_ids": [], - } - - result = processor.pack_outputs(outs_without_images) - self.assertIsNone(result["images"]) - self.assertIsNone(result["grid_thw"]) - self.assertIsNone(result["image_type_ids"]) - - def test_process_response_dict(self): - """Test process_response_dict with different parameters""" - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None): - processor = Ernie4_5_VLProcessor("model_path") - processor.process_response_dict = Ernie4_5_VLProcessor.process_response_dict.__get__( - processor, Ernie4_5_VLProcessor - ) - - response = RequestOutput( - request_id="test_0", - outputs=CompletionOutput(text="response", index=0, send_idx=0, token_ids=[1, 2, 3]), - ) - # Test with stream=True - processor.process_response_obj_streaming = MagicMock(return_value=response) - response_obj = RequestOutput(request_id="test_0") - result = processor.process_response_dict(response_obj, stream=True) - processor.process_response_obj_streaming.assert_called_once() - self.assertEqual(result, response) - - # Test with stream=False - processor.process_response_obj_normal = MagicMock(return_value=response) - response_obj = RequestOutput(request_id="test_0") - result = processor.process_response_dict(response_obj, stream=False) - processor.process_response_obj_normal.assert_called_once() - self.assertEqual(result, response) - - def test_apply_default_parameters(self): - """Test _apply_default_parameters with dict and object request""" - with patch.object(Ernie4_5_VLProcessor, "__init__", return_value=None): - processor = Ernie4_5_VLProcessor("model_path") - processor.generation_config = MagicMock() - processor.generation_config.top_p = 0.8 - processor.generation_config.temperature = 0.9 - processor._apply_default_parameters = Ernie4_5_VLProcessor._apply_default_parameters.__get__( - processor, Ernie4_5_VLProcessor - ) - - # Test with dict request - request = Request(request_id="test_0") - request.sampling_params = SamplingParams() - result = processor._apply_default_parameters(request) - self.assertEqual(result.sampling_params.top_p, 0.8) - self.assertEqual(result.sampling_params.temperature, 0.9) - - # Test with object request - class MockRequest: - def __init__(self): - self.sampling_params = SamplingParams() - self.sampling_params.top_p = None - self.sampling_params.temperature = None - - def get(self, key): - return getattr(self.sampling_params, key, None) - - def set(self, key, value): - setattr(self.sampling_params, key, value) - - request = MockRequest() - result = processor._apply_default_parameters(request) - self.assertEqual(result.sampling_params.top_p, 0.8) - - -class TestDataProcessorTargetMethods(unittest.TestCase): - def setUp(self): - self.mock_tokenizer = MagicMock(spec=Ernie4_5Tokenizer) - self.mock_tokenizer.ignored_index = -100 - self.mock_tokenizer.convert_tokens_to_ids.side_effect = self._mock_convert_tokens_to_ids - self.mock_tokenizer.chat_template = "mock_template" - self.mock_tokenizer.apply_chat_template.return_value = "User: Hello<|image@placeholder|>" - # Mock encode method for _add_text - self.mock_tokenizer.encode = MagicMock(return_value={"input_ids": [1, 2, 3]}) - - def mock_load_tokenizer(dp_instance): - dp_instance.tokenizer = self.mock_tokenizer - - with patch.object(DataProcessor, "_load_tokenizer", side_effect=mock_load_tokenizer, autospec=True): - with patch.object(AdaptiveImageProcessor, "from_pretrained") as mock_image_preprocessor: - mock_image_preprocessor.return_value = MagicMock() - self.data_processor = DataProcessor( - tokenizer_name="mock_tokenizer", - image_preprocessor_name="mock_image_preprocessor", - enable_processor_cache=False, - ) - self.data_processor.image_patch_id = 1001 - self.data_processor.image_start_id = 1002 - self.data_processor.image_end_id = 1003 - self.data_processor.video_start_id = 1004 - self.data_processor.video_end_id = 1005 - self.data_processor.role_prefixes = {"user": "User: ", "assistant": "Assistant: "} - self.data_processor.enable_processor_cache = False - # Note: extract_mm_items is not mocked by default, only when needed - self.data_processor.extract_mm_items = MagicMock(return_value=([], [], [], [], None, [], [])) - - def _restore_real_extract_mm_items(self): - """Helper method to restore real extract_mm_items method for testing""" - from fastdeploy.input.v1.ernie4_5_vl_processor.process import DataProcessor - - original_extract_mm_items = DataProcessor.extract_mm_items - self.data_processor.extract_mm_items = original_extract_mm_items.__get__(self.data_processor, DataProcessor) - - def _mock_convert_tokens_to_ids(self, token): - token_id_map = { - "<|begin_of_sentence|>": 101, - "<|end_of_sentence|>": 102, - "": 103, - "<|IMAGE_PLACEHOLDER|>": 1001, - "<|IMAGE_START|>": 1002, - "<|IMAGE_END|>": 1003, - "<|VIDEO_START|>": 1004, - "<|VIDEO_END|>": 1005, - } - return token_id_map.get(token, 999) - - def test_prompt_token_ids2outputs_only_prompt_token_ids(self): - test_prompt_token_ids = [101, 999, 998, 997, 102] - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - } - request = Request.from_dict(request) - - outputs = self.data_processor.prompt_token_ids2outputs(request) - - prompt_len = len(test_prompt_token_ids) - - self.assertEqual( - outputs["input_ids"], - test_prompt_token_ids, - f"input_ids mismatch: actual {outputs['input_ids']}, expected {test_prompt_token_ids}", - ) - - self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len) - - expected_position_ids = [[i] * 3 for i in range(prompt_len)] - self.assertEqual(outputs["position_ids"], expected_position_ids) - - self.assertEqual(outputs["cur_position"], prompt_len) - - self.assertEqual(len(outputs["images"]), 0) - self.assertEqual(len(outputs["grid_thw"]), 0) - self.assertEqual(len(outputs["mm_positions"]), 0) - self.assertEqual(len(outputs["mm_hashes"]), 0) - self.assertEqual(outputs["video_cnt"], 0) - self.assertEqual(outputs["num_input_image_tokens"], 0) - self.assertEqual(outputs["num_input_video_tokens"], 0) - - def test_prompt_token_ids2outputs_with_messages_no_mm(self): - test_prompt_token_ids = [101, 999, 998, 997, 102] - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - "messages": [{"role": "user", "content": "Hello World"}], - } - request = Request.from_dict(request) - - self.data_processor.extract_mm_items.return_value = ([], [], [], [], None, [], []) - - outputs = self.data_processor.prompt_token_ids2outputs(request) - - prompt_len = len(test_prompt_token_ids) - - self.assertEqual(outputs["input_ids"], test_prompt_token_ids) - - self.assertEqual(outputs["token_type_ids"], [IDS_TYPE_FLAG["text"]] * prompt_len) - - expected_position_ids = [[i] * 3 for i in range(prompt_len)] - self.assertEqual(outputs["position_ids"], expected_position_ids) - - self.assertEqual(outputs["cur_position"], prompt_len) - - self.assertEqual(len(outputs["images"]), 0) - self.assertEqual(outputs["video_cnt"], 0) - self.assertEqual(outputs["num_input_image_tokens"], 0) - - def test_prompt_token_ids2outputs_add_image(self): - test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102] - mock_img = MagicMock() - mock_img.height = 224 - mock_img.width = 224 - mock_img.convert.return_value = mock_img - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - "messages": [ - {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]} - ], - } - request = Request.from_dict(request) - self.data_processor.extract_mm_items.return_value = ( - [mock_img], - [], - ["img_uuid"], - [], - None, - [], - [{"type": "image", "data": mock_img}], - ) - mock_resize = (None, (2, 4)) - self.data_processor.image_preprocessor.get_smarted_resize.return_value = mock_resize - mock_preprocess = {"pixel_values": np.random.randn(1, 16, 16, 3), "image_grid_thw": np.array([[2, 4]])} - self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess - # self.data_processor._compute_3d_positions = MagicMock(return_value=[[i]*3 for i in range(4)]) - outputs = self.data_processor.prompt_token_ids2outputs(request) - self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102]) - self.assertEqual( - outputs["token_type_ids"], - [ - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["image"], - IDS_TYPE_FLAG["image"], - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["text"], - ], - ) - self.assertEqual(len(outputs["position_ids"]), 6) - self.assertEqual(outputs["cur_position"], 6) - self.assertEqual(len(outputs["images"]), 1) - self.assertIsNotNone(outputs["images"][0]) - self.assertEqual(outputs["num_input_image_tokens"], 2) - self.assertEqual(len(outputs["mm_positions"]), 1) - self.assertEqual(len(outputs["mm_hashes"]), 1) - self.assertEqual(len(outputs["grid_thw"]), 1) - self.assertEqual(len(outputs["image_type_ids"]), 1) - - def test_prompt_token_ids2outputs_add_processed_image(self): - test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102] - mock_img_data = np.random.randn(8, 28, 28) - mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)}) - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - "messages": [ - {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]} - ], - } - request = Request.from_dict(request) - self.data_processor.extract_mm_items.return_value = ( - [mock_img_cache], - [], - ["img_uuid"], - [], - None, - [], - [{"type": "image", "data": mock_img_cache}], - ) - outputs = self.data_processor.prompt_token_ids2outputs(request) - self.assertEqual(outputs["input_ids"], [101, 1002, 1001, 1001, 1003, 102]) - self.assertEqual( - outputs["token_type_ids"], - [ - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["image"], - IDS_TYPE_FLAG["image"], - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["text"], - ], - ) - self.assertEqual(len(outputs["position_ids"]), 20) - self.assertEqual(outputs["cur_position"], 8) - self.assertEqual(len(outputs["images"]), 1) - self.assertIsNotNone(outputs["images"][0]) - self.assertEqual(len(outputs["mm_positions"]), 1) - self.assertEqual(outputs["mm_hashes"][0], "img_uuid") - self.assertEqual(len(outputs["grid_thw"]), 1) - self.assertEqual(len(outputs["image_type_ids"]), 1) - - def test_prompt_token_ids2outputs_add_video(self): - test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102] - mock_frame1 = MagicMock() - mock_frame1.height = 224 - mock_frame1.width = 224 - mock_frame1.convert.return_value = mock_frame1 - mock_frame2 = MagicMock() - mock_frame2.height = 224 - mock_frame2.width = 224 - mock_frame2.convert.return_value = mock_frame2 - frames = [mock_frame1, mock_frame2] - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - "messages": [ - {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]} - ], - } - request = Request.from_dict(request) - self.data_processor.extract_mm_items.return_value = ( - [], - [frames], - [], - ["vid_uuid"], - None, - [], - [{"type": "video", "data": frames}], - ) - self.data_processor._load_and_process_video = MagicMock(return_value=frames) - patches_h, patches_w = 4, 4 - self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w)) - mock_preprocess = { - "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3), - "video_grid_thw": np.array([[patches_h, patches_w]] * 2), - } - self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess - outputs = self.data_processor.prompt_token_ids2outputs(request) - self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]) - self.assertEqual( - outputs["token_type_ids"], - [ - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["video"], - IDS_TYPE_FLAG["video"], - IDS_TYPE_FLAG["video"], - IDS_TYPE_FLAG["video"], - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["text"], - ], - ) - self.assertEqual(len(outputs["position_ids"]), 8) - self.assertEqual(outputs["cur_position"], 6) - self.assertEqual(len(outputs["images"]), 1) - self.assertIsNotNone(outputs["images"][0]) - self.assertEqual(len(outputs["mm_positions"]), 1) - self.assertEqual(outputs["mm_hashes"][0], "vid_uuid") - self.assertEqual(len(outputs["grid_thw"]), 1) - self.assertEqual(len(outputs["image_type_ids"]), 2) - self.assertEqual(outputs["num_input_video_tokens"], 4) - - def test_prompt_token_ids2outputs_add_processed_video(self): - test_prompt_token_ids = [101, 1004, 1001, 1001, 1001, 1001, 1005, 102] - t, h, w = 2, 4, 4 - spatial_conv_size = self.data_processor.spatial_conv_size - temporal_conv_size = self.data_processor.temporal_conv_size - token_per_frame = (h // spatial_conv_size) * (w // spatial_conv_size) - num_tokens = (t // temporal_conv_size) * token_per_frame - mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28) - mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)}) - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - "messages": [ - {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]} - ], - } - request = Request.from_dict(request) - self.data_processor.extract_mm_items.return_value = ( - [], - [mock_frames_cache], - [], - ["vid_uuid"], - None, - [], - [{"type": "video", "data": mock_frames_cache}], - ) - outputs = self.data_processor.prompt_token_ids2outputs(request) - self.assertEqual(outputs["input_ids"], [101, 1004, 1001, 1001, 1001, 1001, 1005, 102]) - self.assertEqual( - outputs["token_type_ids"], - [ - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["video"], - IDS_TYPE_FLAG["video"], - IDS_TYPE_FLAG["video"], - IDS_TYPE_FLAG["video"], - IDS_TYPE_FLAG["text"], - IDS_TYPE_FLAG["text"], - ], - ) - self.assertEqual(len(outputs["position_ids"]), 8) - self.assertEqual(outputs["cur_position"], 6) - self.assertEqual(len(outputs["images"]), 1) - self.assertIsNotNone(outputs["images"][0]) - self.assertEqual(len(outputs["mm_positions"]), 1) - self.assertEqual(outputs["mm_hashes"][0], "vid_uuid") - self.assertEqual(len(outputs["grid_thw"]), 1) - self.assertEqual(len(outputs["image_type_ids"]), 2) - - def test_prompt_token_ids2outputs_add_image_token_len_mismatch(self): - test_prompt_token_ids = [101, 1002, 1001, 1001, 1001, 1003, 102] - mock_img = MagicMock() - mock_img.height = 224 - mock_img.width = 224 - mock_img.convert.return_value = mock_img - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - "messages": [ - {"role": "user", "content": [{"type": "image_url", "image_url": mock_img, "uuid": "img_uuid"}]} - ], - } - request = Request.from_dict(request) - self.data_processor.extract_mm_items.return_value = ( - [mock_img], - [], - ["img_uuid"], - [], - None, - [], - [{"type": "image", "data": mock_img}], - ) - patches_h, patches_w = 8, 8 - self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w)) - mock_preprocess = { - "pixel_values": np.random.randn(1, patches_h, patches_w, 3), - "image_grid_thw": np.array([[patches_h, patches_w]]), - } - self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess - with self.assertRaises(ValueError) as ctx: - self.data_processor.prompt_token_ids2outputs(request) - self.assertIn("image tokens num not match the size", str(ctx.exception)) - - def test_prompt_token_ids2outputs_add_processed_image_token_len_mismatch(self): - test_prompt_token_ids = [101, 1002, 1001, 1001, 1003, 102] - spatial_conv_size = self.data_processor.spatial_conv_size - num_tokens = 4 - mock_img_data = np.random.randn(num_tokens * (spatial_conv_size**2), 28, 28) - mock_img_cache = (mock_img_data, {"thw": (1, 8, 8)}) - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - "messages": [ - {"role": "user", "content": [{"type": "image_url", "image_url": mock_img_cache, "uuid": "img_uuid"}]} - ], - } - request = Request.from_dict(request) - self.data_processor.extract_mm_items.return_value = ( - [mock_img_cache], - [], - ["img_uuid"], - [], - None, - [], - [{"type": "image", "data": mock_img_cache}], - ) - with self.assertRaises(ValueError) as ctx: - self.data_processor.prompt_token_ids2outputs(request) - self.assertIn("image tokens num not match the size", str(ctx.exception)) - - def test_prompt_token_ids2outputs_add_video_token_len_mismatch(self): - test_prompt_token_ids = [101, 1004, 1001, 1001, 1005, 102] - mock_frame1 = MagicMock() - mock_frame1.height = 224 - mock_frame1.width = 224 - mock_frame1.convert.return_value = mock_frame1 - mock_frame2 = MagicMock() - mock_frame2.height = 224 - mock_frame2.width = 224 - mock_frame2.convert.return_value = mock_frame2 - frames = [mock_frame1, mock_frame2] - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - "messages": [ - {"role": "user", "content": [{"type": "video_url", "video_url": frames, "uuid": "vid_uuid"}]} - ], - } - request = Request.from_dict(request) - self.data_processor.extract_mm_items.return_value = ( - [], - [frames], - [], - ["vid_uuid"], - None, - [], - [{"type": "video", "data": frames}], - ) - self.data_processor._load_and_process_video = MagicMock(return_value=frames) - patches_h, patches_w = 8, 8 - self.data_processor.image_preprocessor.get_smarted_resize.return_value = (None, (patches_h, patches_w)) - mock_preprocess = { - "pixel_values_videos": np.random.randn(2, patches_h, patches_w, 3), - "video_grid_thw": np.array([[patches_h, patches_w]] * 2), - } - self.data_processor.image_preprocessor.preprocess.return_value = mock_preprocess - with self.assertRaises(ValueError) as ctx: - self.data_processor.prompt_token_ids2outputs(request) - self.assertIn("video tokens num not match the size", str(ctx.exception)) - - def test_prompt_token_ids2outputs_add_processed_video_token_len_mismatch(self): - test_prompt_token_ids = [101, 1004, 1001, 1005, 102] - t, h, w = 2, 8, 8 - spatial_conv_size = self.data_processor.spatial_conv_size - temporal_conv_size = self.data_processor.temporal_conv_size - - num_tokens = 4 - mock_frames_data = np.random.randn(num_tokens * spatial_conv_size**2 * temporal_conv_size, 28, 28) - mock_frames_cache = (mock_frames_data, {"thw": (t, h, w)}) - request = { - "request_id": "test_0", - "prompt_token_ids": test_prompt_token_ids, - "messages": [ - {"role": "user", "content": [{"type": "video", "data": mock_frames_cache, "uuid": "vid_uuid"}]} - ], - } - request = Request.from_dict(request) - self.data_processor.extract_mm_items.return_value = ( - [], - [mock_frames_cache], - [], - ["vid_uuid"], - None, - [], - [{"type": "video", "data": mock_frames_cache}], - ) - with self.assertRaises(ValueError) as ctx: - self.data_processor.prompt_token_ids2outputs(request) - self.assertIn("video tokens num not match the size", str(ctx.exception)) - - def test_extract_mm_items(self): - """Test extract_mm_items with various scenarios: basic items, video, and missing data error""" - self._restore_real_extract_mm_items() - - # Test basic multimodal items (image + video) - request = { - "request_id": "test_0", - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "Hello"}, - {"type": "image", "data": Image.new("RGB", (224, 224)), "uuid": "img1"}, - {"type": "video", "data": [Image.new("RGB", (224, 224))], "uuid": "vid1"}, - ], - } - ], - } - request = Request.from_dict(request) - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse: - mock_parse.return_value = request.messages - images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = ( - self.data_processor.extract_mm_items(request) - ) - self.assertEqual(len(images), 1) - self.assertEqual(len(videos), 1) - self.assertEqual(image_uuid[0], "img1") - self.assertEqual(video_uuid[0], "vid1") - self.assertEqual(len(mm_items), 2) - - # Test missing data error when cache is disabled - self.data_processor.enable_processor_cache = False - request = { - "request_id": "test_0", - "messages": [{"role": "user", "content": [{"type": "image", "uuid": "img1"}]}], - } - request = Request.from_dict(request) - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse: - mock_parse.return_value = request.messages - with self.assertRaises(ValueError) as ctx: - self.data_processor.extract_mm_items(request) - self.assertIn("Missing items cannot be retrieved", str(ctx.exception)) - - -class TestDataProcessor(unittest.TestCase): - def setUp(self): - """Set up test environment""" - self.mock_tokenizer = MagicMock() - - def mock_convert_tokens_to_ids(x): - if isinstance(x, list): - return [hash(str(token)) % 10000 for token in x] - return hash(str(x)) % 10000 - - self.mock_tokenizer.convert_tokens_to_ids = MagicMock(side_effect=mock_convert_tokens_to_ids) - self.mock_tokenizer.encode = MagicMock(return_value={"input_ids": [1, 2, 3]}) - self.mock_tokenizer.decode = MagicMock(return_value="decoded_text") - self.mock_tokenizer.tokenize = MagicMock(return_value=["token1", "token2"]) - self.mock_tokenizer.ignored_index = -100 - self.mock_tokenizer.chat_template = MagicMock() - self.mock_tokenizer.apply_chat_template = MagicMock(return_value="formatted_prompt") - - self.mock_image_preprocessor = MagicMock() - self.mock_image_preprocessor.get_smarted_resize = MagicMock(return_value=((224, 224), (16, 16))) - self.mock_image_preprocessor.preprocess = MagicMock( - return_value={ - "pixel_values": np.random.rand(256, 3 * 14 * 14).astype(np.float32), - "image_grid_thw": np.array([[1, 16, 16]]), - } - ) - self.mock_image_preprocessor.from_pretrained = MagicMock(return_value=self.mock_image_preprocessor) - - with patch( - "fastdeploy.input.v1.ernie4_5_vl_processor.process.AdaptiveImageProcessor", - self.mock_image_preprocessor, - ): - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.Ernie4_5Tokenizer") as mock_tokenizer_class: - mock_tokenizer_class.from_pretrained = MagicMock(return_value=self.mock_tokenizer) - mock_tokenizer_class.resource_files_names = {"vocab_file": "tokenizer.model"} - with patch("os.path.exists", return_value=True): - self.processor = DataProcessor( - tokenizer_name="test_model", - image_preprocessor_name="test_model", - ) - - def _create_outputs(self): - """Helper to create outputs dict""" - return { - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "images": [], - "grid_thw": [], - "image_type_ids": [], - "mm_positions": [], - "mm_hashes": [], - "cur_position": 0, - "num_input_image_tokens": 0, - "num_input_video_tokens": 0, - } - - def _mock_video_processing(self, mock_frames=None): - """Helper to mock video processing""" - if mock_frames is None: - mock_frames = [Image.new("RGB", (224, 224)) for _ in range(4)] - mock_read = patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.read_video_decord") - mock_frames_read = patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.read_frames_decord") - mock_render = patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.render_frame_timestamp") - return mock_read, mock_frames_read, mock_render, mock_frames - - def _setup_video_mocks(self, mock_read, mock_frames_read, mock_render, mock_frames): - """Setup video processing mocks""" - mock_read.return_value = (None, {"duration": 2.0}, "test_path") - mock_frames_read.return_value = ( - [np.array(f) for f in mock_frames], - None, - [0.0, 0.5, 1.0, 1.5] if len(mock_frames) == 4 else [float(i) * 0.5 for i in range(len(mock_frames))], - ) - mock_render.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img) - self.mock_image_preprocessor.preprocess.return_value = { - "pixel_values_videos": np.random.rand(len(mock_frames), 256, 3 * 14 * 14).astype(np.float32), - "video_grid_thw": np.array([[len(mock_frames), 16, 16]]), - } - - def test_train_and_eval(self): - """Test training and evaluation mode switching""" - self.assertTrue(self.processor.is_training) - self.processor.eval() - self.assertFalse(self.processor.is_training) - self.processor.train() - self.assertTrue(self.processor.is_training) - - def test_build_token_type_mapping(self): - """Test token type mapping construction""" - mapping = self.processor._build_token_type_mapping() - for token in [ - self.processor.IMG_START, - self.processor.IMG_END, - self.processor.VID_START, - self.processor.VID_END, - ]: - self.assertEqual(mapping[token], IDS_TYPE_FLAG["image"]) - self.assertEqual(mapping[self.processor.image_patch_id], IDS_TYPE_FLAG["image"]) - - def test_add_text_and_special_token(self): - """Test adding text and special tokens""" - outputs = self._create_outputs() - self.processor._add_text("hello", outputs) - self.assertEqual(len(outputs["input_ids"]), 3) - self.assertEqual(outputs["cur_position"], 3) - - outputs2 = self._create_outputs() - self.processor._add_text([1, 2, 3, 4, 5], outputs2) - self.assertEqual(len(outputs2["input_ids"]), 5) - - outputs3 = self._create_outputs() - self.processor._add_special_token("<|begin_of_sentence|>", outputs3) - self.processor._add_special_token(12345, outputs3) - self.assertEqual(len(outputs3["input_ids"]), 2) - - def test_compute_3d_positions(self): - """Test 3D position computation""" - pos_ids = self.processor._compute_3d_positions(t=2, h=16, w=16, start_idx=10) - self.assertIsInstance(pos_ids, list) - self.assertGreater(len(pos_ids), 0) - self.assertEqual(len(pos_ids[0]), 3) - - pos_ids2 = self.processor._compute_3d_positions(t=1, h=16, w=16, start_idx=0) - expected_len = 1 * (16 // self.processor.spatial_conv_size) ** 2 - self.assertEqual(len(pos_ids2), expected_len) - - def test_set_video_frame_args_comprehensive(self): - """Test _set_video_frame_args with various scenarios""" - # Valid cases - result = self.processor._set_video_frame_args( - { - "target_frames": 32, - "fps": -1, - "min_frames": 16, - "max_frames": 64, - "frames_sample": "leading", - }, - {"duration": 10.0}, - ) - self.assertEqual(result["target_frames"], 32) - - result = self.processor._set_video_frame_args( - { - "target_frames": -1, - "fps": 2, - "min_frames": 16, - "max_frames": 64, - "frames_sample": "leading", - }, - {"duration": 10.0}, - ) - self.assertIsNotNone(result) - - # Error cases - with self.assertRaises(ValueError): - self.processor._set_video_frame_args( - { - "target_frames": -1, - "fps": -1, - "min_frames": 16, - "max_frames": 64, - "frames_sample": "leading", - }, - {"duration": 10.0}, - ) - with self.assertRaises(ValueError): - self.processor._set_video_frame_args( - { - "target_frames": 10, - "fps": 2, - "min_frames": 1, - "max_frames": 100, - "frames_sample": "leading", - }, - {"duration": 10.0}, - ) - with self.assertRaises(ValueError): - self.processor._set_video_frame_args( - { - "target_frames": 5, - "fps": -1, - "min_frames": 10, - "max_frames": 100, - "frames_sample": "leading", - }, - {"duration": 10.0}, - ) - with self.assertRaises(ValueError): - self.processor._set_video_frame_args( - { - "target_frames": 200, - "fps": -1, - "min_frames": 1, - "max_frames": 100, - "frames_sample": "leading", - }, - {"duration": 10.0}, - ) - with self.assertRaises(ValueError): - self.processor._set_video_frame_args( - { - "target_frames": -1, - "fps": 2, - "min_frames": 100, - "max_frames": 10, - "frames_sample": "leading", - }, - {"duration": 10.0}, - ) - - # Adjustment cases - result = self.processor._set_video_frame_args( - { - "target_frames": -1, - "fps": 1, - "min_frames": 10, - "max_frames": 100, - "frames_sample": "leading", - }, - {"duration": 1.0}, - ) - self.assertEqual(result["target_frames"], 10) - self.assertEqual(result["fps"], -1) - - result = self.processor._set_video_frame_args( - { - "target_frames": -1, - "fps": 10, - "min_frames": 1, - "max_frames": 100, - "frames_sample": "leading", - }, - {"duration": 100.0}, - ) - self.assertEqual(result["target_frames"], 100) - self.assertEqual(result["fps"], -1) - - def test_text2ids_comprehensive(self): - """Test text2ids with various scenarios""" - # Text only - outputs = self.processor.text2ids("Hello world") - self.assertIn("input_ids", outputs) - self.assertEqual(len(outputs["images"]), 0) - - # Empty text - outputs = self.processor.text2ids("") - self.assertEqual(len(outputs["input_ids"]), 0) - - # With image placeholder - mock_image = Image.new("RGB", (224, 224)) - outputs = self.processor.text2ids("Hello <|image@placeholder|> world", images=[mock_image]) - self.assertGreater(len(outputs["input_ids"]), 0) - self.assertGreater(len(outputs["images"]), 0) - - # With cached image - cached_image = ( - np.random.rand(256, 3 * 14 * 14).astype(np.float32), - {"thw": (1, 16, 16)}, - ) - outputs = self.processor.text2ids( - "Hello <|image@placeholder|> world", - images=[cached_image], - image_uuid=["uuid"], - ) - self.assertGreater(len(outputs["input_ids"]), 0) - - # Multiple images - outputs = self.processor.text2ids( - "Hello <|image@placeholder|> world <|image@placeholder|> end", - images=[mock_image, mock_image], - ) - self.assertEqual(len(outputs["images"]), 2) - - # With video placeholder - mock_read, mock_frames_read, mock_render, mock_frames = self._mock_video_processing() - with mock_read as mr, mock_frames_read as mfr, mock_render as mren: - mr.return_value = (None, {"duration": 2.0}, "test_path") - mfr.return_value = ( - [np.array(f) for f in mock_frames], - None, - [0.0, 0.5, 1.0, 1.5], - ) - mren.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img) - self.mock_image_preprocessor.preprocess.return_value = { - "pixel_values_videos": np.random.rand(4, 256, 3 * 14 * 14).astype(np.float32), - "video_grid_thw": np.array([[4, 16, 16]]), - } - outputs = self.processor.text2ids("Hello <|video@placeholder|> world", videos=["test_video.mp4"]) - self.assertGreater(len(outputs["input_ids"]), 0) - - # Cached video - cached_video = ( - np.random.rand(256, 3 * 14 * 14).astype(np.float32), - {"thw": (4, 16, 16)}, - ) - outputs = self.processor.text2ids( - "Hello <|video@placeholder|> world", - videos=[cached_video], - video_uuid=["uuid"], - ) - self.assertGreater(len(outputs["input_ids"]), 0) - - # Video dict format - mock_read, mock_frames_read, mock_render, mock_frames = self._mock_video_processing() - with mock_read as mr, mock_frames_read as mfr, mock_render as mren: - mr.return_value = (None, {"duration": 2.0}, "test_path") - mfr.return_value = ( - [np.array(f) for f in mock_frames], - None, - [0.0, 0.5, 1.0, 1.5], - ) - mren.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img) - self.mock_image_preprocessor.preprocess.return_value = { - "pixel_values_videos": np.random.rand(4, 256, 3 * 14 * 14).astype(np.float32), - "video_grid_thw": np.array([[4, 16, 16]]), - } - outputs = self.processor.text2ids( - "Hello <|video@placeholder|> world", - videos=[{"video": "test.mp4", "fps": 2}], - ) - self.assertGreater(len(outputs["input_ids"]), 0) - - # Image and video together - mock_read, mock_frames_read, mock_render, mock_frames = self._mock_video_processing() - with mock_read as mr, mock_frames_read as mfr, mock_render as mren: - mr.return_value = (None, {"duration": 2.0}, "test_path") - mfr.return_value = ( - [np.array(f) for f in mock_frames], - None, - [0.0, 0.5, 1.0, 1.5], - ) - mren.side_effect = lambda img, ts: (Image.fromarray(img) if isinstance(img, np.ndarray) else img) - self.mock_image_preprocessor.preprocess.side_effect = [ - { - "pixel_values": np.random.rand(256, 3 * 14 * 14).astype(np.float32), - "image_grid_thw": np.array([[1, 16, 16]]), - }, - { - "pixel_values_videos": np.random.rand(4, 256, 3 * 14 * 14).astype(np.float32), - "video_grid_thw": np.array([[4, 16, 16]]), - }, - ] - outputs = self.processor.text2ids( - "Hello <|image@placeholder|> world <|video@placeholder|> end", - images=[mock_image], - videos=["test_video.mp4"], - ) - self.assertGreater(len(outputs["input_ids"]), 0) - self.mock_image_preprocessor.preprocess.side_effect = None - - def test_request2ids_comprehensive(self): - """Test request2ids with various scenarios""" - self.processor.is_training = False - - # Basic request with multimodal content - covers both text and image branches in one call - mock_image = Image.new("RGB", (224, 224)) - request = { - "request_id": "test_0", - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "What's in this image?"}, - {"type": "image", "data": mock_image, "uuid": "img1"}, - ], - } - ], - "add_generation_prompt": True, - } - request = Request.from_dict(request) - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse: - mock_parse.return_value = request.messages - outputs = self.processor.request2ids(request) - self.assertIn("input_ids", outputs) - - # Error case: missing chat_template - self.processor.tokenizer.chat_template = None - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse: - mock_parse.return_value = [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}] - with self.assertRaises(ValueError): - self.processor.request2ids(request) - self.processor.tokenizer.chat_template = MagicMock() - - # Error case: unsupported role - request = { - "request_id": "test_0", - "messages": [{"role": "invalid_role", "content": "Hello"}], - "add_generation_prompt": True, - } - request = Request.from_dict(request) - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse: - mock_parse.return_value = [{"role": "invalid_role", "content": [{"type": "text", "text": "Hello"}]}] - with self.assertRaises(AssertionError): - self.processor.request2ids(request) - - # Error case: missing cache when cache is disabled - self.processor.enable_processor_cache = False - request = { - "request_id": "test_0", - "messages": [{"role": "user", "content": [{"type": "image", "uuid": "img1"}]}], - } - request = Request.from_dict(request) - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse: - mock_parse.return_value = request.messages - with self.assertRaises(ValueError): - self.processor.request2ids(request) - - def test_extract_labels(self): - """Test label extraction""" - outputs = {"input_ids": [1, 2, 3, self.processor.sep_token_id, 4, 5], "labels": []} - self.processor.is_training = True - self.processor._extract_labels(outputs, ["target text"]) - self.assertEqual(len(outputs["labels"]), len(outputs["input_ids"])) - - # Multiple targets - outputs2 = { - "input_ids": [1, 2, 3, self.processor.sep_token_id, 4, 5, self.processor.sep_token_id, 6, 7], - "labels": [], - } - self.processor._extract_labels(outputs2, ["target1", "target2"]) - self.assertEqual(len(outputs2["labels"]), len(outputs2["input_ids"])) - - # Error case - outputs3 = {"input_ids": [1, 2, 3, self.processor.sep_token_id], "labels": []} - with self.assertRaises(AssertionError): - self.processor._extract_labels(outputs3, ["target1", "target2"]) - - def test_fancy_print(self): - """Test fancy_print function""" - from fastdeploy.input.v1.ernie4_5_vl_processor.process import fancy_print - - test_cases = [ - ([1, 2, 3, self.processor.image_patch_id, 4, 5], self.processor.image_patch_id, None), - ( - [ - 1, - 2, - self.processor.image_patch_id, - self.processor.image_patch_id, - self.processor.image_patch_id, - 4, - 5, - ], - self.processor.image_patch_id, - "<|IMAGE@", - ), - ([1, 2, 3, 4, 5], self.processor.image_patch_id, None), - ] - for input_ids, image_patch_id, expected_contains in test_cases: - result = fancy_print(input_ids, self.mock_tokenizer, image_patch_id) - self.assertIsInstance(result, str) - if expected_contains: - self.assertIn(expected_contains, result) - - def test_processor_cache_operations(self): - """Test processor cache get/update and request2ids with cache""" - # Test get_processor_cache - mock_socket = MagicMock() - mock_socket.recv_multipart = MagicMock(return_value=(b"", b"pickled_data")) - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.pickle") as mock_pickle: - mock_pickle.loads = MagicMock(return_value=[{"data": "cached_item"}]) - result = self.processor.get_processor_cache(mock_socket, ["hash1", "hash2"]) - self.assertEqual(len(result), 1) - - # Test update_processor_cache - mock_socket2 = MagicMock() - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.pickle"): - self.processor.update_processor_cache( - mock_socket2, - ["hash1"], - [(np.array([1, 2, 3]), {"meta": "data"})], - ) - mock_socket2.send_multipart.assert_called_once() - - # Test request2ids with processor cache update - self.processor.is_training = False - self.processor.enable_processor_cache = True - mock_image = Image.new("RGB", (224, 224)) - request = { - "request_id": "test_0", - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "Hello"}, - {"type": "image", "data": mock_image, "uuid": "img1"}, - ], - } - ], - "add_generation_prompt": True, - } - request = Request.from_dict(request) - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.zmq") as mock_zmq: - mock_context = MagicMock() - mock_socket = MagicMock() - mock_socket.recv_multipart = MagicMock(return_value=(b"", b"pickled_data")) - mock_context.socket.return_value = mock_socket - mock_zmq.Context.return_value = mock_context - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.parse_chat_messages") as mock_parse: - mock_parse.return_value = request.messages - with patch("fastdeploy.input.v1.ernie4_5_vl_processor.process.pickle") as mock_pickle: - mock_pickle.loads = MagicMock(return_value=[]) - with patch.object(self.processor, "text2ids") as mock_text2ids: - mock_text2ids.return_value = { - "input_ids": [1, 2, 3], - "token_type_ids": [0] * 3, - "position_ids": [[i] * 3 for i in range(3)], - "images": [np.random.rand(256, 3 * 14 * 14).astype(np.float32)], - "grid_thw": [np.array([[1, 16, 16]])], - "image_type_ids": [0], - "cur_position": 3, - "video_cnt": 0, - "num_input_image_tokens": 0, - "num_input_video_tokens": 0, - "mm_positions": [], - "mm_hashes": ["hash1"], - } - with patch.object(self.processor, "update_processor_cache") as mock_update: - self.processor.request2ids(request) - mock_update.assert_called_once() - self.processor.enable_processor_cache = False - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/v1/test_image_preprocessor_adaptive.py b/tests/input/v1/test_image_preprocessor_adaptive.py deleted file mode 100644 index 5a15244d1fd..00000000000 --- a/tests/input/v1/test_image_preprocessor_adaptive.py +++ /dev/null @@ -1,499 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import unittest -from unittest.mock import patch - -import numpy as np -from PIL import Image - -from fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import ( - AdaptiveImageProcessor, - ceil_by_factor, - floor_by_factor, - is_scaled_image, - make_batched_images, - make_batched_videos, - round_by_factor, - smart_resize, -) - - -class TestImagePreprocessorAdaptive(unittest.TestCase): - def setUp(self): - """Set up test environment""" - self.processor = AdaptiveImageProcessor( - min_pixels=56 * 56, - max_pixels=28 * 28 * 1280, - patch_size=14, - temporal_conv_size=2, - merge_size=2, - ) - - def test_init(self): - """Test initialization""" - self.assertEqual(self.processor.min_pixels, 56 * 56) - self.assertEqual(self.processor.max_pixels, 28 * 28 * 1280) - self.assertEqual(self.processor.patch_size, 14) - self.assertEqual(self.processor.temporal_conv_size, 2) - self.assertEqual(self.processor.merge_size, 2) - - def test_set_pixels(self): - """Test setting pixels with valid and invalid values (lines 205-214)""" - # Test setting only min_pixels - self.processor.set_pixels(min_pixels=100, msg="test") - self.assertEqual(self.processor.min_pixels, 100) - self.assertEqual(self.processor.size["min_pixels"], 100) - - # Test setting only max_pixels - self.processor.set_pixels(max_pixels=200, msg="test") - self.assertEqual(self.processor.max_pixels, 200) - self.assertEqual(self.processor.size["max_pixels"], 200) - - # Test setting both - self.processor.set_pixels(min_pixels=150, max_pixels=250, msg="test") - self.assertEqual(self.processor.min_pixels, 150) - self.assertEqual(self.processor.max_pixels, 250) - self.assertEqual(self.processor.size["min_pixels"], 150) - self.assertEqual(self.processor.size["max_pixels"], 250) - - # Invalid cases - with self.assertRaises(AssertionError): - self.processor.set_pixels(min_pixels=-1) - with self.assertRaises(AssertionError): - self.processor.set_pixels(max_pixels=0) - - def test_get_smarted_resize(self): - """Test get_smarted_resize with default and custom pixels""" - height, width = 224, 224 - # Test with default pixels - (resized_h, resized_w), (patches_h, patches_w) = self.processor.get_smarted_resize(height, width) - self.assertIsInstance(resized_h, int) - self.assertIsInstance(resized_w, int) - self.assertIsInstance(patches_h, int) - self.assertIsInstance(patches_w, int) - # Test with custom pixels - (resized_h, resized_w), (_, _) = self.processor.get_smarted_resize( - height, width, min_pixels=100, max_pixels=10000 - ) - self.assertIsInstance(resized_h, int) - self.assertIsInstance(resized_w, int) - - def test_round_by_factor(self): - """Test round_by_factor with various cases""" - self.assertEqual(round_by_factor(100, 28), 112) # 100/28 ≈ 3.57, round(3.57) = 4, 4*28 = 112 - self.assertEqual(round_by_factor(50, 10), 50) - self.assertEqual(round_by_factor(55, 10), 60) - # Edge cases - self.assertEqual(round_by_factor(0, 14), 0) - self.assertEqual(round_by_factor(14, 14), 14) - self.assertEqual(round_by_factor(13, 14), 14) # Round up - self.assertEqual(round_by_factor(15, 14), 14) # Round down - - def test_ceil_by_factor(self): - """Test ceil_by_factor with various cases""" - self.assertEqual(ceil_by_factor(100, 28), 112) # ceil(100/28)*28 = ceil(3.57)*28 = 4*28 = 112 - self.assertEqual(ceil_by_factor(50, 10), 50) - self.assertEqual(ceil_by_factor(55, 10), 60) - # Edge cases - self.assertEqual(ceil_by_factor(0, 14), 0) - self.assertEqual(ceil_by_factor(14, 14), 14) - self.assertEqual(ceil_by_factor(13, 14), 14) # Ceil up - self.assertEqual(ceil_by_factor(15, 14), 28) # Ceil up to next multiple - - def test_floor_by_factor(self): - """Test floor_by_factor with various cases""" - self.assertEqual(floor_by_factor(100, 28), 84) # floor(100/28)*28 = floor(3.57)*28 = 3*28 = 84 - self.assertEqual(floor_by_factor(50, 10), 50) - self.assertEqual(floor_by_factor(55, 10), 50) - # Edge cases - self.assertEqual(floor_by_factor(0, 14), 0) - self.assertEqual(floor_by_factor(14, 14), 14) - self.assertEqual(floor_by_factor(13, 14), 0) # Floor down - self.assertEqual(floor_by_factor(15, 14), 14) # Floor down to multiple - self.assertEqual(floor_by_factor(28, 14), 28) # Exact multiple - - def test_smart_resize(self): - """Test smart_resize with various scenarios (lines 557-587)""" - # Basic functionality - height, width = 224, 224 - new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280) - self.assertIsInstance(new_h, int) - self.assertIsInstance(new_w, int) - self.assertEqual(new_h % 28, 0) - self.assertEqual(new_w % 28, 0) - - # High aspect ratio (height > width) - tests lines 557-563 - height, width = 10000, 10 # aspect ratio > 200 - new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280) - self.assertIsInstance(new_h, int) - self.assertIsInstance(new_w, int) - self.assertLessEqual(max(new_h, new_w) / min(new_h, new_w), 200) - - # High aspect ratio (width > height) - tests lines 562-563 - height, width = 10, 10000 - new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280) - self.assertIsInstance(new_h, int) - self.assertIsInstance(new_w, int) - self.assertLessEqual(max(new_h, new_w) / min(new_h, new_w), 200) - - # Too large - tests lines 575-578 - height, width = 10000, 10000 - new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280) - self.assertLessEqual(new_h * new_w, 28 * 28 * 1280) - - # Too small - tests lines 579-582 - height, width = 10, 10 - new_h, new_w = smart_resize(height, width, factor=28, min_pixels=56 * 56, max_pixels=28 * 28 * 1280) - self.assertGreaterEqual(new_h * new_w, 56 * 56) - - # Exceeds max_pixels with custom parameters - height, width = 10000, 10000 - max_pixels = 10000 - min_pixels = 1000 - new_h, new_w = smart_resize(height, width, factor=14, min_pixels=min_pixels, max_pixels=max_pixels) - self.assertLessEqual(new_h * new_w, max_pixels) - self.assertGreaterEqual(new_h * new_w, min_pixels) - - # Below min_pixels with custom parameters - height, width = 10, 10 - min_pixels = 10000 - max_pixels = 100000 - new_h, new_w = smart_resize(height, width, factor=14, min_pixels=min_pixels, max_pixels=max_pixels) - self.assertGreaterEqual(new_h * new_w, min_pixels) - self.assertLessEqual(new_h * new_w, max_pixels) - - # Invalid result (extreme parameters) - tests lines 584-585 - with self.assertRaises(ValueError): - smart_resize(1, 1, factor=100000, min_pixels=100, max_pixels=1000) - - def test_is_scaled_image(self): - """Test is_scaled_image with various image types""" - # uint8 image - image = np.array([[0, 255], [128, 200]], dtype=np.uint8) - self.assertFalse(is_scaled_image(image)) - image = np.random.rand(224, 224, 3).astype(np.uint8) * 255 - self.assertFalse(is_scaled_image(image)) - - # Scaled float image (values in [0, 1]) - image = np.array([[0.0, 0.5], [0.3, 1.0]], dtype=np.float32) - self.assertTrue(is_scaled_image(image)) - image = np.random.rand(224, 224, 3).astype(np.float32) * 0.5 - self.assertTrue(is_scaled_image(image)) - - # Unscaled float image (values > 1) - image = np.array([[0.0, 255.0], [128.0, 300.0]], dtype=np.float32) - self.assertFalse(is_scaled_image(image)) - image = np.random.rand(224, 224, 3).astype(np.float32) * 255 - self.assertFalse(is_scaled_image(image)) - - # Edge cases - image = np.array([[0.0, 1.0]], dtype=np.float32) - self.assertTrue(is_scaled_image(image)) - image = np.array([[0.0, 1.1]], dtype=np.float32) - self.assertFalse(is_scaled_image(image)) - image = np.array([[-0.1, 1.0]], dtype=np.float32) - self.assertFalse(is_scaled_image(image)) - - def test_make_batched_images(self): - """Test make_batched_images with various input types""" - # Single image - img = Image.new("RGB", (224, 224)) - result = make_batched_images(img) - self.assertEqual(len(result), 1) - self.assertEqual(result[0], img) - - # List of images - imgs = [Image.new("RGB", (224, 224)) for _ in range(3)] - result = make_batched_images(imgs) - self.assertEqual(len(result), 3) - self.assertEqual(result, imgs) - - # Nested list - imgs = [[Image.new("RGB", (224, 224)) for _ in range(2)] for _ in range(2)] - result = make_batched_images(imgs) - self.assertEqual(len(result), 4) # 2*2 = 4 - - # Invalid inputs - with self.assertRaises(ValueError) as context: - make_batched_images("invalid") - self.assertIn("Could not make batched images", str(context.exception)) - with self.assertRaises(ValueError) as context: - make_batched_images([[1, 2, 3], [4, 5, 6]]) - self.assertIn("Could not make batched images", str(context.exception)) - - def test_make_batched_videos(self): - """Test make_batched_videos with various input types""" - # List of images - imgs = [Image.new("RGB", (224, 224)) for _ in range(3)] - result = make_batched_videos(imgs) - self.assertEqual(len(result), 1) - self.assertEqual(len(result[0]), 3) - - # Single image in list - img = Image.new("RGB", (224, 224)) - result = make_batched_videos([img]) - self.assertEqual(len(result), 1) - self.assertEqual(len(result[0]), 1) - - # Nested list - imgs = [[Image.new("RGB", (224, 224)) for _ in range(2)] for _ in range(2)] - result = make_batched_videos(imgs) - self.assertEqual(len(result), 2) - self.assertEqual(len(result[0]), 2) - - # 4D array (single) - video = np.random.rand(3, 224, 224, 3).astype(np.uint8) - result = make_batched_videos(video) - self.assertEqual(len(result), 1) - self.assertIsInstance(result[0], list) - - # 4D array in list (lines 119-120) - videos = [np.random.rand(3, 224, 224, 3).astype(np.uint8)] - result = make_batched_videos(videos) - self.assertEqual(len(result), 1) - self.assertIsInstance(result[0], list) - - # Invalid input - with self.assertRaises(ValueError) as context: - make_batched_videos("invalid") - self.assertIn("Could not make batched video", str(context.exception)) - - def test_preprocess_images(self): - """Test preprocess handling images""" - img = Image.new("RGB", (224, 224)) - result = self.processor.preprocess(images=img) - self.assertIn("pixel_values", result) - self.assertIn("image_grid_thw", result) - # Verify pixel_values shape - pixel_values = result["pixel_values"] - self.assertIsInstance(pixel_values, np.ndarray) - - def test_preprocess_videos(self): - """Test preprocess handling videos""" - frames = [Image.new("RGB", (224, 224)) for _ in range(4)] - result = self.processor.preprocess(images=None, videos=frames) - self.assertIn("pixel_values_videos", result) - self.assertIn("video_grid_thw", result) - - def test_preprocess_invalid_images(self): - """Test preprocess handling invalid image""" - with self.assertRaises(ValueError): - self.processor.preprocess(images="invalid") - - def test_preprocess_with_predetermined_grid_thw(self): - """Test preprocess using predetermined_grid_thw""" - img = Image.new("RGB", (224, 224)) - # predetermined_grid_thw should be (h, w) format, not [1, h, w] - predetermined_grid_thw = [(16, 16)] # For single image, should be (h, w) tuple - result = self.processor.preprocess(images=img, predetermined_grid_thw=predetermined_grid_thw) - self.assertIn("pixel_values", result) - - def test_preprocess_flags(self): - """Test preprocess with various flags disabled""" - img = Image.new("RGB", (224, 224)) - # Test without resize - result = self.processor.preprocess(images=img, do_resize=False) - self.assertIn("pixel_values", result) - # Test without rescale - result = self.processor.preprocess(images=img, do_rescale=False) - self.assertIn("pixel_values", result) - # Test without normalize - result = self.processor.preprocess(images=img, do_normalize=False) - self.assertIn("pixel_values", result) - - def test_preprocess_custom_mean_std(self): - """Test preprocess using custom mean and std""" - img = Image.new("RGB", (224, 224)) - # Test with simple custom mean/std - result = self.processor.preprocess(images=img, image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5]) - self.assertIn("pixel_values", result) - # Test with ImageNet-style mean/std - result = self.processor.preprocess( - images=img, image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225] - ) - self.assertIn("pixel_values", result) - - def test_preprocess_do_convert_rgb(self): - """Test preprocess with do_convert_rgb=True (line 289)""" - img = Image.new("L", (224, 224)) # Grayscale image - result = self.processor.preprocess(images=img, do_convert_rgb=True) - self.assertIn("pixel_values", result) - - def test_preprocess_scaled_image_warning(self): - """Test warning for scaled image in preprocess (lines 294-298)""" - # Create a scaled image (values between 0-1) - img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5 - # Use patch to capture warning - with patch( - "fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger" - ) as mock_logger: - # Directly call _preprocess, pass scaled image - self.processor._preprocess( - [img_array], # Pass scaled numpy array - do_rescale=True, - do_convert_rgb=False, - ) - # Verify warning is called when is_scaled_image returns True and do_rescale is True - mock_logger.warning.assert_called() - - def test_preprocess_invalid_images_check(self): - """Test invalid image check in preprocess (line 464)""" - # Test invalid image type - need to ensure valid_images returns False - # Use patch to make valid_images return False, but make_batched_images succeeds - with patch( - "fastdeploy.input.v1.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images" - ) as mock_valid: - mock_valid.return_value = False - valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False - with self.assertRaises(ValueError) as context: - self.processor.preprocess(images=valid_images_list) - self.assertIn("Invalid image type", str(context.exception)) - - def test_preprocess_predetermined_grid_thw_multiple_images(self): - """Test preprocess with predetermined_grid_thw for multiple images (lines 307-310)""" - imgs = [Image.new("RGB", (224, 224)) for _ in range(2)] - predetermined_grid_thw = [(16, 16), (20, 20)] - result = self.processor.preprocess(images=imgs, predetermined_grid_thw=predetermined_grid_thw) - self.assertIn("pixel_values", result) - - def test_preprocess_predetermined_grid_thw_length_mismatch(self): - """Test preprocess with predetermined_grid_thw length mismatch (lines 307-310, 470)""" - imgs = [Image.new("RGB", (224, 224)) for _ in range(2)] - predetermined_grid_thw = [(16, 16)] # Length mismatch - only 1 element for 2 images - # The function raises IndexError when accessing predetermined_grid_thw[img_idx] with img_idx=1 - with self.assertRaises(IndexError): - self.processor.preprocess(images=imgs, predetermined_grid_thw=predetermined_grid_thw) - - def test_preprocess_with_input_data_format(self): - """Test preprocess with input_data_format parameter (lines 299-301)""" - img = Image.new("RGB", (224, 224)) - from paddleformers.transformers.image_utils import ChannelDimension - - # Test with FIRST - result = self.processor.preprocess(images=img, input_data_format=ChannelDimension.FIRST) - self.assertIn("pixel_values", result) - # Test with None - result = self.processor.preprocess(images=img, input_data_format=None) - self.assertIn("pixel_values", result) - - def test_preprocess_do_resize_with_predetermined_grid_thw(self): - """Test preprocess with do_resize=True and predetermined_grid_thw (lines 314-317)""" - img = Image.new("RGB", (224, 224)) - predetermined_grid_thw = [(16, 16)] - result = self.processor.preprocess(images=img, predetermined_grid_thw=predetermined_grid_thw, do_resize=True) - self.assertIn("pixel_values", result) - - def test_preprocess_videos_with_predetermined_grid_thw(self): - """Test preprocess videos with predetermined_grid_thw (lines 511)""" - frames = [Image.new("RGB", (224, 224)) for _ in range(4)] - predetermined_grid_thw = [(16, 16)] * 4 - result = self.processor.preprocess(images=None, videos=frames, predetermined_grid_thw=predetermined_grid_thw) - self.assertIn("pixel_values_videos", result) - - def test_preprocess_return_tensors(self): - """Test preprocess with return_tensors parameter (lines 396, 523)""" - img = Image.new("RGB", (224, 224)) - # Use string instead of TensorType enum which may not be available - result = self.processor.preprocess(images=img, return_tensors="np") - self.assertIn("pixel_values", result) - - def test_preprocess_do_rescale_false_with_scaled_image(self): - """Test preprocess with do_rescale=False and scaled image (line 335)""" - # Create a scaled image - img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5 # Values in [0, 0.5] - img = Image.fromarray((img_array * 255).astype(np.uint8)) - result = self.processor.preprocess(images=img, do_rescale=False) - self.assertIn("pixel_values", result) - - def test_preprocess_custom_resample(self): - """Test preprocess with custom resample parameter (line 332)""" - img = Image.new("RGB", (224, 224)) - from PIL import Image as PILImage - - result = self.processor.preprocess(images=img, resample=PILImage.BILINEAR) - self.assertIn("pixel_values", result) - - def test_preprocess_custom_rescale_factor(self): - """Test preprocess with custom rescale_factor (line 336)""" - img = Image.new("RGB", (224, 224)) - result = self.processor.preprocess(images=img, rescale_factor=1.0 / 128.0) - self.assertIn("pixel_values", result) - - def test_preprocess_data_format(self): - """Test preprocess with different data_format values""" - img = Image.new("RGB", (224, 224)) - from paddleformers.transformers.image_utils import ChannelDimension - - # Test with FIRST - result = self.processor.preprocess(images=img, data_format=ChannelDimension.FIRST) - self.assertIn("pixel_values", result) - # Test with LAST - result = self.processor.preprocess(images=img, data_format=ChannelDimension.LAST) - self.assertIn("pixel_values", result) - - def test_preprocess_multiple_images_loop(self): - """Test preprocess loop with multiple images (lines 312-348, 468-488)""" - images = [Image.new("RGB", (224, 224)) for _ in range(3)] - result = self.processor.preprocess(images=images) - self.assertIn("pixel_values", result) - self.assertIn("image_grid_thw", result) - pixel_values = result["pixel_values"] - self.assertIsInstance(pixel_values, np.ndarray) - self.assertEqual(len(pixel_values.shape), 2) # Should be [grid_t * grid_h * grid_w, C * psz * psz] - - def test_preprocess_videos_loop(self): - """Test preprocess with videos in loop (lines 496-521)""" - # Test with multiple videos - videos = [ - [Image.new("RGB", (224, 224)) for _ in range(4)], - [Image.new("RGB", (224, 224)) for _ in range(4)], - ] - result = self.processor.preprocess(images=None, videos=videos) - self.assertIn("pixel_values_videos", result) - self.assertIn("video_grid_thw", result) - self.assertIsInstance(result["pixel_values_videos"], np.ndarray) - # Test with nested list format - videos = [[Image.new("RGB", (224, 224)) for _ in range(4)] for _ in range(2)] - result = self.processor.preprocess(images=None, videos=videos) - self.assertIn("pixel_values_videos", result) - self.assertIn("video_grid_thw", result) - self.assertIsInstance(result["pixel_values_videos"], np.ndarray) - - def test_preprocess_both_images_and_videos(self): - """Test preprocess with both images and videos (lines 458-523)""" - images = [Image.new("RGB", (224, 224))] - videos = [[Image.new("RGB", (224, 224)) for _ in range(4)]] - result = self.processor.preprocess(images=images, videos=videos) - # Due to implementation, only video results are returned when both are provided - self.assertIn("pixel_values_videos", result) - self.assertIn("video_grid_thw", result) - - def test_preprocess_invalid_images_check_list_input(self): - """Test preprocess with invalid images check (line 464) - - Note: The error is raised by make_batched_images before valid_images check, - so the error message is different. - """ - invalid_images = ["not an image", "also not an image"] - - with self.assertRaises(ValueError) as context: - self.processor.preprocess(images=invalid_images) - self.assertIn("Could not make batched images", str(context.exception)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/v1/test_paddleocr_vl_processor.py b/tests/input/v1/test_paddleocr_vl_processor.py deleted file mode 100644 index 3c1e83b42ff..00000000000 --- a/tests/input/v1/test_paddleocr_vl_processor.py +++ /dev/null @@ -1,1182 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import pickle -import unittest -from unittest.mock import ANY, MagicMock, patch - -import numpy as np -import zmq -from PIL import Image - -from fastdeploy.engine.request import Request -from fastdeploy.input.v1.paddleocr_vl_processor.image_processor import ( - ImageProcessor, - smart_resize, -) -from fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor import ( - PaddleOCRVLProcessor, -) -from fastdeploy.input.v1.paddleocr_vl_processor.process import DataProcessor -from fastdeploy.input.v1.paddleocr_vl_processor.process_video import sample_frames - -MODULE_PATH = "fastdeploy.input.v1.paddleocr_vl_processor.process" - - -class TestProcessVideo(unittest.TestCase): - def setUp(self): - self.metadata = {"num_of_frame": 100, "fps": 25} - self.frame_factor = 4 - self.min_frames = 8 - self.max_frames = 32 - - def test_sample_with_num_frames(self): - """测试使用num_frames参数采样(来自用户的原始测试)""" - num_frames = 16 - indices = sample_frames( - frame_factor=self.frame_factor, - min_frames=self.min_frames, - max_frames=self.max_frames, - num_frames=num_frames, - fps=0, # 确保 fps 不>0 - metadata=self.metadata, - ) - self.assertEqual(len(indices), 16) - self.assertEqual(indices[0], 0) - self.assertEqual(indices[-1], 93) - np.testing.assert_array_equal(indices, np.arange(0, 100, 100 / 16).astype(np.int32)) - - def test_error_num_frames_exceeds_total(self): - """测试 num_frames 超过总帧数的异常(来自用户的原始测试)""" - with self.assertRaises(ValueError) as context: - sample_frames( - frame_factor=self.frame_factor, - min_frames=self.min_frames, - max_frames=self.max_frames, - num_frames=200, # 超过总帧数100 - fps=0, - metadata=self.metadata, - ) - self.assertIn("exceeds", str(context.exception)) - - def test_error_mutual_exclusion(self): - """新增:测试 num_frames 和 fps 互斥""" - with self.assertRaises(ValueError) as context: - sample_frames( - frame_factor=self.frame_factor, - min_frames=self.min_frames, - max_frames=self.max_frames, - num_frames=16, # > 0 - fps=10, # > 0 - metadata=self.metadata, - ) - self.assertIn("mutually exclusive", str(context.exception)) - - def test_error_fps_without_metadata(self): - """新增:测试 fps > 0 但 metadata 为 None""" - with self.assertRaises(TypeError) as context: - sample_frames( - frame_factor=self.frame_factor, - min_frames=self.min_frames, - max_frames=self.max_frames, - num_frames=0, - fps=10, - metadata=None, # 缺失 - ) - # 验证是预期的 TypeError - self.assertIn("'NoneType' object is not subscriptable", str(context.exception)) - - def test_num_frames_rounding(self): - """新增:测试 num_frames 向 frame_factor 舍入""" - num_frames = 17 # 不是 4 的倍数 - # 逻辑: round(17 / 4) * 4 = round(4.25) * 4 = 4 * 4 = 16 - indices = sample_frames( - frame_factor=self.frame_factor, - min_frames=self.min_frames, - max_frames=self.max_frames, - num_frames=num_frames, - fps=0, - metadata=self.metadata, - ) - # 应舍入到 16 - self.assertEqual(len(indices), 16) - - def test_sample_with_fps_basic(self): - """新增:测试使用 fps 采样(基本路径,被 max_frames 限制)""" - # 逻辑: num_frames_calc = 100 / 25 * 10 = 40 - # num_frames_clamped = min(max(40, 8), 32) = 32 - # num_frames_factored = floor(32 / 4) * 4 = 32 - indices = sample_frames( - frame_factor=self.frame_factor, - min_frames=self.min_frames, - max_frames=self.max_frames, - num_frames=0, - fps=10, - metadata=self.metadata, - ) - # 应被 max_frames=32 限制 - self.assertEqual(len(indices), 32) - self.assertEqual(indices[-1], 96) - - def test_sample_with_fps_hits_min_frames(self): - """新增:测试使用 fps 采样(被 min_frames 限制)""" - # 逻辑: num_frames_calc = 100 / 25 * 1 = 4 - # num_frames_clamped = min(max(4, 8), 32) = 8 - # num_frames_factored = floor(8 / 4) * 4 = 8 - indices = sample_frames( - frame_factor=self.frame_factor, - min_frames=self.min_frames, - max_frames=self.max_frames, - num_frames=0, - fps=1, - metadata=self.metadata, - ) - # 应被 min_frames=8 限制 - self.assertEqual(len(indices), 8) - self.assertEqual(indices[-1], 87) - - def test_sample_with_fps_hits_total_frames(self): - """新增:测试使用 fps 采样(被 total_num_frames 限制)""" - local_max_frames = 200 - - # 逻辑: num_frames_calc = 100 / 25 * 50 = 200 - # num_frames_clamped = min(min(max(200, 8), 200), 100) = 100 - # num_frames_factored = floor(100 / 4) * 4 = 100 - indices = sample_frames( - frame_factor=self.frame_factor, - min_frames=self.min_frames, - max_frames=local_max_frames, - num_frames=0, - fps=50, - metadata=self.metadata, - ) - # 应被 total_num_frames=100 限制 - self.assertEqual(len(indices), 100) - self.assertEqual(indices[-1], 99) # 采样所有帧 - - def test_no_sampling(self): - """新增:测试不采样(fps=0, num_frames=0)""" - indices = sample_frames( - frame_factor=self.frame_factor, - min_frames=self.min_frames, - max_frames=self.max_frames, - num_frames=0, - fps=0, - metadata=self.metadata, - ) - # 应返回所有帧 - self.assertEqual(len(indices), self.metadata["num_of_frame"]) - self.assertEqual(len(indices), 100) - self.assertEqual(indices[-1], 99) - np.testing.assert_array_equal(indices, np.arange(0, 100).astype(np.int32)) - - -class Test_DataProcessor(unittest.TestCase): - """ - 针对 process.py 中 DataProcessor 类的单元测试。 - """ - - def setUp(self): - - # 1. 手动启动 Patcher - patcher1 = patch(f"{MODULE_PATH}.AutoTokenizer.from_pretrained") - patcher2 = patch(f"{MODULE_PATH}.ImageProcessor.from_pretrained") - patcher_zmq_context = patch(f"{MODULE_PATH}.zmq.Context") - - self.mock_auto_tokenizer_constructor = patcher1.start() - self.mock_image_processor_constructor = patcher2.start() - self.mock_zmq_context_constructor = patcher_zmq_context.start() - - self.addCleanup(patcher1.stop) - self.addCleanup(patcher2.stop) - self.addCleanup(patcher_zmq_context.stop) - - # 2. 创建模拟对象 - self.mock_tokenizer = MagicMock() - self.mock_image_processor = MagicMock() - self.mock_zmq_context = MagicMock() - self.mock_zmq_socket = MagicMock() - - # 3. 配置 from_pretrained 和 zmq - self.mock_auto_tokenizer_constructor.return_value = self.mock_tokenizer - self.mock_image_processor_constructor.return_value = self.mock_image_processor - self.mock_zmq_context_constructor.return_value = self.mock_zmq_context - self.mock_zmq_context.socket.return_value = self.mock_zmq_socket - - # 4. 配置模拟对象的属性和方法 - self._configure_mocks() - - # 5. 实例化 DataProcessor (默认不启用 cache) - self.processor = DataProcessor(model_path="dummy_model_path") - self._configure_processor_ids() - - # 6. 准备测试用的虚拟数据 - self.dummy_image = Image.fromarray(np.uint8(np.random.rand(224, 224, 3) * 255)) - self.dummy_video_frames = np.uint8(np.random.rand(16, 224, 224, 3) * 255) - self.dummy_video_data = "path/to/dummy_video.mp4" - self.dummy_processed_image_cache = ( - np.random.rand(64, 3, 14, 14).astype(np.float32), - {"thw": (1, 8, 8), "fps": 0}, - ) - self.dummy_processed_video_cache = ( - np.random.rand(256, 3, 14, 14).astype(np.float32), - {"thw": (4, 8, 8), "fps": 30}, - ) - - def _configure_mocks(self): - def mock_convert_tokens_to_ids(tokens): - if tokens == "<|IMAGE_PLACEHOLDER|>": - return 100 - if tokens == "<|video_pad|>": - return 101 - if tokens == "<|IMAGE_START|>": - return 102 - if isinstance(tokens, list): - if tokens == ["Hello", "world"]: - return [983, 984] - if tokens == ["Prompt", "text"]: - return [606, 511] - if tokens == ["Prompt", "", "text"]: - return [606, 511] # 模拟 "Prompt text".split() - return [hash(t) % 1000 for t in tokens] - return hash(tokens) % 1000 - - self.mock_tokenizer.convert_tokens_to_ids.side_effect = mock_convert_tokens_to_ids - self.mock_tokenizer.tokenize.side_effect = lambda s: s.split() - self.mock_tokenizer.ignored_index = -100 - self.mock_tokenizer.chat_template = "dummy_template_string" - - self.mock_image_processor.merge_size = 2 - self.mock_image_processor.temporal_patch_size = 1 - - def _configure_processor_ids(self): - self.processor.image_token_id = 100 - self.processor.video_token_id = 101 - self.processor.image_patch_id = 100 - self.processor.vision_start_id = 102 - - def _get_init_outputs(self): - return { - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "images": [], - "grid_thw": [], - "image_type_ids": [], - "labels": [], - "cur_position": 0, - "video_cnt": 0, - "num_input_image_tokens": 0, - "num_input_video_tokens": 0, - "fps": [], - "mm_positions": [], - "mm_hashes": [], - "vit_seqlen": [], - "vit_position_ids": [], - } - - def test_init(self): - """测试 DataProcessor 的初始化""" - self.mock_auto_tokenizer_constructor.assert_called_with("dummy_model_path", padding_side="left", use_fast=True) - self.mock_image_processor_constructor.assert_called_with("dummy_model_path") - self.assertEqual(self.processor.image_token, "<|IMAGE_PLACEHOLDER|>") - self.assertEqual(self.processor.video_token_id, 101) - - def test_compute_text_positions(self): - """测试 _compute_text_positions 纯函数""" - pos_ids = self.processor._compute_text_positions(start_pos=5, num_tokens=3) - expected = np.array([[5, 6, 7], [5, 6, 7], [5, 6, 7]]) - np.testing.assert_array_equal(pos_ids, expected) - - def test_compute_vision_positions(self): - """测试 _compute_vision_positions 纯函数""" - pos_ids = self.processor._compute_vision_positions(start_pos=10, t=2, h=4, w=4, second_per_grid_t=1.0) - self.assertEqual(pos_ids.shape, (3, 8)) - expected_t = np.array([0, 0, 0, 0, 2, 2, 2, 2]) - expected_h = np.array([0, 0, 1, 1, 0, 0, 1, 1]) - expected_w = np.array([0, 1, 0, 1, 0, 1, 0, 1]) - expected = np.stack([expected_t, expected_h, expected_w]) + 10 - np.testing.assert_array_equal(pos_ids, expected) - - @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2}) - def test_add_text(self): - """测试 _add_text 辅助函数""" - outputs = self._get_init_outputs() - self.mock_tokenizer.tokenize.return_value = ["Hello", "world"] - self.mock_tokenizer.convert_tokens_to_ids.side_effect = None - self.mock_tokenizer.convert_tokens_to_ids.return_value = [10, 11] - - self.processor._add_text("Hello world", outputs) - - self.assertEqual(outputs["input_ids"], [10, 11]) - self.assertEqual(outputs["token_type_ids"], [0, 0]) - self.assertEqual(outputs["cur_position"], 2) - - @patch(f"{MODULE_PATH}.MultimodalHasher.hash_features", return_value="dummy_hash_123") - @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2}) - def test_add_image_autohash(self, mock_hasher): - """测试 _add_image 辅助函数 (自动哈希)""" - outputs = self._get_init_outputs() - outputs["cur_position"] = 5 - - num_patches_hw = 8 * 8 - num_tokens = 16 - mock_preprocess_return = { - "pixel_values": np.random.rand(num_patches_hw, 3, 14, 14), - "grid_thw": np.array([1, 8, 8]), - } - self.mock_image_processor.preprocess.return_value = mock_preprocess_return - - self.processor._add_image(self.dummy_image, outputs, uuid=None) - - self.assertEqual(len(outputs["input_ids"]), num_tokens) - self.assertEqual(outputs["num_input_image_tokens"], num_tokens) - mock_hasher.assert_called_once_with(mock_preprocess_return["pixel_values"]) - self.assertEqual(outputs["mm_hashes"][0], "dummy_hash_123") - self.assertEqual(outputs["cur_position"], 9) - - @patch(f"{MODULE_PATH}.MultimodalHasher.hash_features") - @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2}) - def test_add_video_with_uuid(self, mock_hasher): - """测试 _add_video 辅助函数 (使用 uuid)""" - outputs = self._get_init_outputs() - outputs["cur_position"] = 10 - meta = {"fps": 30} - - num_patches_total = 256 - num_tokens = 64 - - mock_preprocess_return = { - "pixel_values": np.random.rand(num_patches_total, 3, 14, 14), - "image_grid_thw": np.array([4, 8, 8]), - } - self.mock_image_processor.preprocess.return_value = mock_preprocess_return - - self.processor._add_video(self.dummy_video_frames, meta, outputs, uuid="custom_vid_uuid") - - self.assertEqual(len(outputs["input_ids"]), num_tokens) - self.assertEqual(outputs["token_type_ids"], [2] * num_tokens) - mock_hasher.assert_not_called() - self.assertEqual(outputs["mm_hashes"][0], "custom_vid_uuid") - self.assertEqual(outputs["image_type_ids"], [1, 1, 1, 1]) - - @patch.object(DataProcessor, "_add_text", MagicMock()) - @patch.object(DataProcessor, "_add_image", MagicMock()) - @patch.object(DataProcessor, "_add_video", MagicMock()) - @patch.object(DataProcessor, "_load_and_process_video") - def test_text2ids_parsing(self, mock_load_video): - """测试 text2ids 的解析和分支逻辑""" - mock_load_video.return_value = (self.dummy_video_frames, {"fps": 30}) - text = "Text1 <|IMAGE_PLACEHOLDER|> Text2 <|video_pad|> Text3" - images = [self.dummy_image] - videos = [self.dummy_video_data] - image_uuid = ["img_uuid_1"] - video_uuid = ["vid_uuid_1"] - - outputs = self.processor.text2ids(text, images, videos, image_uuid, video_uuid) - - self.processor._add_text.assert_any_call("Text1 ", outputs) - self.processor._add_image.assert_called_once_with(self.dummy_image, outputs, "img_uuid_1") - self.processor._add_video.assert_called_once_with(self.dummy_video_frames, {"fps": 30}, outputs, "vid_uuid_1") - - @patch(f"{MODULE_PATH}.parse_chat_messages") - @patch.object(DataProcessor, "text2ids", return_value="final_output") - def test_request2ids(self, mock_text2ids, mock_parse_chat): - """测试 request2ids 的 chat 模板逻辑""" - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": "Hello"}, - {"type": "image", "data": self.dummy_image, "uuid": "img1"}, - ], - } - ] - request = {"request_id": "test_0", "messages": messages, "add_generation_prompt": True} - request = Request.from_dict(request) - mock_parse_chat.return_value = messages - parsed_prompt = "User: Hello <|IMAGE_PLACEHOLDER|> Assistant:" - self.mock_tokenizer.apply_chat_template.return_value = parsed_prompt - - result = self.processor.request2ids(request) - - self.mock_tokenizer.apply_chat_template.assert_called_once() - mock_text2ids.assert_called_once_with(parsed_prompt, [self.dummy_image], [], ["img1"], []) - self.assertEqual(result, "final_output") - - @patch(f"{MODULE_PATH}.sample_frames") - @patch(f"{MODULE_PATH}.read_video_decord") - def test_load_and_process_video(self, mock_read_video, mock_sample_frames): - """测试 _load_and_process_video 的帧采样逻辑""" - mock_reader = MagicMock() - mock_reader.__getitem__.return_value.asnumpy.return_value = np.random.randint( - 0, 255, (100, 100, 3), dtype=np.uint8 - ) - mock_meta = {"num_of_frame": 100, "duration": 10.0, "fps": 10.0} - mock_read_video.return_value = (mock_reader, mock_meta, None) - mock_sample_frames.return_value = [0, 10, 20, 30, 40] - self.processor.fps = 1 - - frames, meta = self.processor._load_and_process_video("dummy_url", {"min_frames": 2, "max_frames": 10}) - - mock_sample_frames.assert_called_once_with( - frame_factor=ANY, - min_frames=2, - max_frames=10, - metadata=mock_meta, - fps=self.processor.fps, - num_frames=self.processor.target_frames, - ) - self.assertEqual(frames.shape, (5, 100, 100, 3)) - self.assertEqual(meta["fps"], 1) - - def test_init_with_external_tokenizer(self): - """新增:测试使用外部传入的 tokenizer 初始化""" - self.mock_auto_tokenizer_constructor.reset_mock() - - external_tokenizer = MagicMock() - processor = DataProcessor(model_path="dummy", tokenizer=external_tokenizer) - - self.mock_auto_tokenizer_constructor.assert_not_called() - self.assertIs(processor.tokenizer, external_tokenizer) - - def test_add_text_empty(self): - """新增:测试 _add_text 传入空字符串""" - outputs = self._get_init_outputs() - self.processor._add_text("", outputs) - self.assertEqual(outputs["input_ids"], []) - self.assertEqual(outputs["cur_position"], 0) - - @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0}) - def test_add_text_pre_tokenized(self): - """新增:测试 _add_text 传入已 tokenized 的 IDs""" - outputs = self._get_init_outputs() - token_ids = [10, 11, 12] - self.processor._add_text(token_ids, outputs) - - self.mock_tokenizer.tokenize.assert_not_called() - self.assertEqual(outputs["input_ids"], [10, 11, 12]) - self.assertEqual(outputs["token_type_ids"], [0, 0, 0]) - self.assertEqual(outputs["cur_position"], 3) - - @patch(f"{MODULE_PATH}.MultimodalHasher.hash_features", return_value="dummy_hash_456") - @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2}) - def test_add_video_no_uuid(self, mock_hasher): - """新增:测试 _add_video 在 uuid 为 None 时自动哈希""" - outputs = self._get_init_outputs() - meta = {"fps": 30} - mock_preprocess_return = { - "pixel_values": np.random.rand(256, 3, 14, 14), - "image_grid_thw": np.array([4, 8, 8]), - } - self.mock_image_processor.preprocess.return_value = mock_preprocess_return - - self.processor._add_video(self.dummy_video_frames, meta, outputs, uuid=None) - - mock_hasher.assert_called_once_with(mock_preprocess_return["pixel_values"]) - self.assertEqual(outputs["mm_hashes"][0], "dummy_hash_456") - - @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2}) - def test_add_processed_image(self): - """新增:测试 _add_processed_image 处理缓存数据""" - outputs = self._get_init_outputs() - outputs["cur_position"] = 3 - - self.processor._add_processed_image(self.dummy_processed_image_cache, outputs, "cached_img_uuid") - - num_tokens = 16 - self.assertEqual(len(outputs["input_ids"]), num_tokens) - self.assertEqual(outputs["input_ids"][0], self.processor.image_patch_id) - - np.testing.assert_array_equal(outputs["images"][0], self.dummy_processed_image_cache[0]) - - self.assertEqual(outputs["mm_hashes"][0], "cached_img_uuid") - self.assertEqual(outputs["cur_position"], 7) - - @patch(f"{MODULE_PATH}.IDS_TYPE_FLAG", {"text": 0, "image": 1, "video": 2}) - def test_add_processed_video(self): - """新增:测试 _add_processed_video 处理缓存数据""" - outputs = self._get_init_outputs() - outputs["cur_position"] = 5 - - self.processor._add_processed_video(self.dummy_processed_video_cache, outputs, "cached_vid_uuid") - - num_tokens = 64 - t, h, w = self.dummy_processed_video_cache[1]["thw"] - - self.assertEqual(len(outputs["input_ids"]), num_tokens) - self.assertEqual(outputs["token_type_ids"], [2] * num_tokens) - - np.testing.assert_array_equal(outputs["images"][0], self.dummy_processed_video_cache[0]) - - self.assertEqual(outputs["mm_hashes"][0], "cached_vid_uuid") - self.assertEqual(outputs["image_type_ids"], [1] * t) - self.assertGreater(outputs["cur_position"], 5) - - def test_text2ids_with_processed_data(self): - """新增:测试 text2ids 调用 _add_processed_image 和 _add_processed_video""" - with ( - patch.object(self.processor, "_add_processed_image") as mock_add_proc_img, - patch.object(self.processor, "_add_processed_video") as mock_add_proc_vid, - ): - - text = "<|IMAGE_PLACEHOLDER|><|video_pad|>" - images = [self.dummy_processed_image_cache] - videos = [self.dummy_processed_video_cache] - image_uuid = ["img1"] - video_uuid = ["vid1"] - - self.processor.text2ids(text, images, videos, image_uuid, video_uuid) - - mock_add_proc_img.assert_called_once_with(self.dummy_processed_image_cache, ANY, "img1") - mock_add_proc_vid.assert_called_once_with(self.dummy_processed_video_cache, ANY, "vid1") - - @patch(f"{MODULE_PATH}.sample_frames") - @patch(f"{MODULE_PATH}.read_video_decord") - def test_load_and_process_video_no_sampling(self, mock_read_video, mock_sample_frames): - """新增:测试 _load_and_process_video 不采样(fps=-1)""" - mock_reader = MagicMock() - mock_reader.__getitem__.return_value.asnumpy.return_value = np.random.randint( - 0, 255, (100, 100, 3), dtype=np.uint8 - ) - mock_meta = {"num_of_frame": 10, "duration": 1.0, "fps": 10.0} - mock_read_video.return_value = (mock_reader, mock_meta, None) - - self.processor.fps = -1 - self.processor.target_frames = -1 - - frames, meta = self.processor._load_and_process_video("dummy_url", {}) - - mock_sample_frames.assert_not_called() - self.assertEqual(frames.shape, (10, 100, 100, 3)) - self.assertEqual(meta["num_of_frame"], 10) - - def test_get_processor_cache(self): - """新增:测试 get_processor_cache (zmq)""" - hashes = ["hash1", "hash2"] - expected_items = ["item1", "item2"] - mock_resp = pickle.dumps(expected_items) - self.mock_zmq_socket.recv_multipart.return_value = (b"", mock_resp) - - items = self.processor.get_processor_cache(self.mock_zmq_socket, hashes) - - self.mock_zmq_socket.send_multipart.assert_called_once_with([b"", pickle.dumps(hashes)]) - self.assertEqual(items, expected_items) - - def test_update_processor_cache(self): - """新增:测试 update_processor_cache (zmq)""" - hashes = ["hash1"] - items = ["item1"] - - self.processor.update_processor_cache(self.mock_zmq_socket, hashes, items) - - expected_req = pickle.dumps((hashes, items)) - self.mock_zmq_socket.send_multipart.assert_called_once_with([b"", expected_req]) - - def test_apply_chat_template(self): - """新增:测试 apply_chat_template 核心逻辑""" - request = {"messages": ["msg1"], "add_generation_prompt": True, "request_id": "req123"} - self.mock_tokenizer.apply_chat_template.return_value = "Prompt <|IMAGE_PLACEHOLDER|> text" - self.mock_tokenizer.tokenize.return_value = ["Prompt", "text"] - - self.mock_tokenizer.convert_tokens_to_ids.side_effect = None - self.mock_tokenizer.convert_tokens_to_ids.return_value = [10, 11] - - token_ids = self.processor.apply_chat_template(request) - - self.assertEqual(token_ids, [10, 11]) - self.assertEqual(request["text_after_process"], "Prompt <|IMAGE_PLACEHOLDER|> text") - - self.mock_tokenizer.tokenize.assert_called_with("Prompt text") - - def test_apply_chat_template_raises_error(self): - """新增:测试 apply_chat_template 在模板不存在时引发 ValueError""" - self.mock_tokenizer.chat_template = None - with self.assertRaises(ValueError) as context: - self.processor.apply_chat_template({"messages": []}) - self.assertIn("does not support chat_template", str(context.exception)) - - @patch(f"{MODULE_PATH}.parse_chat_messages") - def test_request2ids_cache_miss_raises_error(self, mock_parse_chat): - """新增:测试 request2ids 在缓存关闭时缺少数据引发 ValueError""" - messages = [{"role": "user", "content": [{"type": "image", "uuid": "img1"}]}] - request = {"request_id": "test_0", "messages": messages} - request = Request.from_dict(request) - - mock_parse_chat.return_value = messages - - with self.assertRaises(ValueError) as context: - self.processor.request2ids(request) - - self.assertIn("Missing items cannot be retrieved without processor cache.", str(context.exception)) - - @patch(f"{MODULE_PATH}.DataProcessor.get_processor_cache") - @patch(f"{MODULE_PATH}.DataProcessor.update_processor_cache") - @patch(f"{MODULE_PATH}.DataProcessor.text2ids") - @patch(f"{MODULE_PATH}.parse_chat_messages") - def test_request2ids_cache_hit_and_update(self, mock_parse_chat, mock_text2ids, mock_update_cache, mock_get_cache): - """新增:测试 request2ids 缓存命中和缓存更新""" - self.processor = DataProcessor(model_path="dummy_model_path", enable_processor_cache=True) - self._configure_processor_ids() - - messages = [ - { - "role": "user", - "content": [ - {"type": "image", "uuid": "img_cache_hit"}, - {"type": "image", "data": self.dummy_image, "uuid": "img_to_update"}, - ], - } - ] - request = {"request_id": "test_0", "messages": messages} - request = Request.from_dict(request) - - mock_parse_chat.return_value = messages - mock_get_cache.return_value = [self.dummy_processed_image_cache] - - mock_text2ids_output = { - "grid_thw": [(1, 8, 8), (1, 8, 8)], - "fps": [0, 0], - "mm_hashes": ["img_cache_hit", "img_to_update"], - "images": [self.dummy_processed_image_cache[0], self.dummy_processed_image_cache[0]], - } - mock_text2ids.return_value = mock_text2ids_output - self.mock_tokenizer.apply_chat_template.return_value = "<|IMAGE_PLACEHOLDER|><|IMAGE_PLACEHOLDER|>" - - self.processor.request2ids(request) - - self.mock_zmq_context.socket.assert_called_with(zmq.DEALER) - mock_get_cache.assert_called_once_with(self.mock_zmq_socket, ["img_cache_hit"]) - - parsed_images = mock_text2ids.call_args[0][1] - self.assertIs(parsed_images[0], self.dummy_processed_image_cache) - self.assertIs(parsed_images[1], self.dummy_image) - - expected_hash_to_cache = ["img_to_update"] - expected_item_to_cache = (self.dummy_processed_image_cache[0], {"thw": (1, 8, 8), "fps": 0}) - mock_update_cache.assert_called_once() - self.assertEqual(mock_update_cache.call_args[0][1], expected_hash_to_cache) - self.assertEqual(mock_update_cache.call_args[0][2][0][1], expected_item_to_cache[1]) - np.testing.assert_array_equal(mock_update_cache.call_args[0][2][0][0], expected_item_to_cache[0]) - - @patch(f"{MODULE_PATH}.DataProcessor.text2ids") - @patch(f"{MODULE_PATH}.parse_chat_messages") - def test_request2ids_unsupported_type(self, mock_parse_chat, mock_text2ids): - """新增:测试 request2ids 静默忽略不支持的类型""" - messages = [ - { - "role": "user", - "content": [{"type": "text", "text": "Hello"}, {"type": "audio", "data": "...", "uuid": "audio1"}], - } - ] - request = {"request_id": "test_0", "messages": messages} - request = Request.from_dict(request) - - mock_parse_chat.return_value = messages - self.mock_tokenizer.apply_chat_template.return_value = "User: Hello " - - self.processor.request2ids(request) - - mock_text2ids.assert_called_once() - call_args = mock_text2ids.call_args[0] - self.assertEqual(call_args[1], []) # images - self.assertEqual(call_args[2], []) # videos - self.assertEqual(call_args[3], []) # image_uuid - self.assertEqual(call_args[4], []) # video_uuid - - -class TestPaddleOCR_VL_ImageProcessor(unittest.TestCase): - def setUp(self): - # 初始化默认参数 - self.default_params = { - "do_resize": True, - "resample": 3, - "do_rescale": True, - "rescale_factor": 1 / 255, - "do_normalize": True, - "image_mean": [0.48145466, 0.4578275, 0.40821073], - "image_std": [0.26862954, 0.26130258, 0.27577711], - "do_convert_rgb": True, - "min_pixels": 28 * 28 * 130, - "max_pixels": 28 * 28 * 1280, - "patch_size": 14, - "temporal_patch_size": 1, - "merge_size": 2, - } - - # 创建测试图像 - self.test_image = Image.fromarray(np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)) - - def test_initialization(self): - """测试初始化参数是否正确设置""" - processor = ImageProcessor(**self.default_params) - - for param, value in self.default_params.items(): - self.assertEqual(getattr(processor, param), value) - - def test_smart_resize(self): - """测试智能调整图像大小功能""" - # 测试正常尺寸调整 - h, w = smart_resize(224, 224, factor=28) - self.assertEqual(h % 28, 0) - self.assertEqual(w % 28, 0) - - # 测试小尺寸调整 - h, w = smart_resize(20, 20, factor=28) - self.assertGreaterEqual(h, 28) - self.assertGreaterEqual(w, 28) - - # 测试超大尺寸调整 - h, w = smart_resize(2000, 2000, factor=28) - self.assertLess(h * w, 28 * 28 * 1280) - - def test_preprocess_single_image(self): - """测试单张图像预处理流程""" - processor = ImageProcessor(**self.default_params) - - # 测试正常预处理 - result = processor.preprocess(self.test_image) - self.assertIn("pixel_values", result) - self.assertIn("grid_thw", result) - self.assertEqual(result["pixel_values"].ndim, 4) # [N, C, H, W] - - # 测试关闭某些预处理步骤 - result = processor.preprocess(self.test_image, do_resize=False, do_normalize=False) - self.assertIn("pixel_values", result) - - def test_preprocess_batch_images(self): - """测试批量图像预处理""" - processor = ImageProcessor(**self.default_params) - batch_images = [self.test_image, self.test_image] - - result = processor.preprocess(batch_images) - expected_shape = 1152 - self.assertEqual(result["pixel_values"].shape[0], expected_shape) - - def test_invalid_input(self): - """测试无效输入处理""" - processor = ImageProcessor(**self.default_params) - - # 测试无效图像 - with self.assertRaises(ValueError): - processor.preprocess("invalid_image") - - # 测试视频输入(暂不支持) - with self.assertRaises(NotImplementedError): - processor.preprocess(self.test_image, videos=["video"]) - - def test_from_pretrained(self): - """测试从预训练模型加载配置""" - with patch("builtins.open", unittest.mock.mock_open(read_data='{"do_resize": false}')) as mock_file: - processor = ImageProcessor.from_pretrained("dummy_path") - self.assertFalse(processor.do_resize) - mock_file.assert_called_once() - - -class TestPaddleOCRVLProcessor(unittest.TestCase): - def setUp(self): - # 创建 PaddleOCRVLProcessor 实例的模拟对象 - with patch.object(PaddleOCRVLProcessor, "__init__", return_value=None): - self.processor = PaddleOCRVLProcessor("model_path") - - # 设置必要的属性 - self.processor.tokenizer = MagicMock() - self.processor.tokenizer.eos_token_id = 1 - self.processor.processor = MagicMock() - self.processor.limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} - self.processor.eos_token_ids = [1] - self.processor.reasoning_parser = None - self.processor.model_status_dict = {} - - # 模拟 _apply_default_parameters - def mock_apply_default_parameters(request_or_dict): - if isinstance(request_or_dict, dict): - if "top_p" not in request_or_dict: - request_or_dict["top_p"] = 0.9 - return request_or_dict - - if not hasattr(request_or_dict, "top_p"): - request_or_dict.top_p = 0.9 - return request_or_dict - - self.processor._apply_default_parameters = mock_apply_default_parameters - - # 模拟 pack_outputs - def mock_pack_outputs(outputs): - # 简化 position_ids 的处理 - position_ids_list = outputs["position_ids"] - if not position_ids_list: - position_ids = np.array([], dtype=np.int64) - elif isinstance(position_ids_list[0], list): - position_ids = np.array(position_ids_list, dtype=np.int64) - else: - position_ids = np.concatenate(position_ids_list, axis=1, dtype=np.int64) - - if position_ids.ndim == 1: - position_ids = position_ids.reshape(1, -1) - - # 源码的 pack_outputs 会 transpose - position_ids = position_ids.transpose(1, 0) - - return { - "input_ids": np.array(outputs["input_ids"], dtype=np.int64), - "token_type_ids": np.array(outputs["token_type_ids"], dtype=np.int64), - "position_ids": position_ids, - "images": np.vstack(outputs["images"]) if outputs.get("images") else None, - "grid_thw": np.vstack(outputs["grid_thw"]) if outputs.get("grid_thw") else None, - "image_type_ids": np.array(outputs["image_type_ids"]) if outputs.get("image_type_ids") else None, - } - - self.processor.pack_outputs = mock_pack_outputs - self.processor.np = np - - # 模拟 _SAMPLING_EPS 常量 - self.processor._SAMPLING_EPS = 1e-5 - - # 模拟 processor 返回 (position_ids 必须是 2D array 的 list) - self.processor.processor.text2ids.return_value = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)], # 修正 - "images": ["image_feature"], - "grid_thw": ["grid_feature"], - "image_type_ids": [0], - "cur_position": 3, - } - - self.processor.processor.request2ids.return_value = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)], # 修正 - "images": ["image_feature"], - "grid_thw": ["grid_feature"], - "image_type_ids": [0], - "cur_position": 3, - } - - # 模拟 _compute_text_positions 方法 (返回 2D array) - self.processor.processor._compute_text_positions = lambda pos, num: np.array( - [list(range(pos, pos + num))], dtype=np.int64 - ) - - # 模拟 update_stop_seq - self.processor.update_stop_seq = MagicMock(return_value=([[99, 98]], [2])) - - # 模拟 pack_outputs 需要的属性 - self.processor.processor.image_token_id = 100 - self.processor.processor.video_token_id = 101 - - def test_process_request_dict_basic(self): - """测试基本请求处理功能""" - request = { - "request_id": "test_request", - "prompt": "test prompt", - "multimodal_data": {"image": ["image1"]}, - "metadata": {"generated_token_ids": []}, - } - request = Request.from_dict(request) - - result = self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(result.prompt_token_ids, [1, 2, 3]) - self.assertEqual(result.prompt_token_ids_len, 3) - self.assertTrue(hasattr(result, "multimodal_inputs")) - - def test_process_request_dict_with_messages(self): - """测试 messages 格式的请求处理""" - request = { - "request_id": "test_0", - "messages": [ - { - "role": "user", - "content": [{"type": "text", "text": "Hello"}, {"type": "image_url", "url": "image1"}], - } - ], - "metadata": {"generated_token_ids": []}, - } - request = Request.from_dict(request) - - result = self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(result.prompt_token_ids, [1, 2, 3]) - self.assertTrue(hasattr(result, "multimodal_inputs")) - - def test_process_request_dict_with_max_len(self): - """测试最大长度限制功能""" - request = { - "request_id": "test_0", - "prompt": "test prompt", - "multimodal_data": {"image": ["image1"]}, - "metadata": {"generated_token_ids": []}, - } - request = Request.from_dict(request) - - # 模拟 processor 返回长序列 - self.processor.processor.text2ids.return_value = { - "input_ids": list(range(100)), - "token_type_ids": [0] * 100, - "position_ids": [np.array([list(range(100))], dtype=np.int64)], - "images": ["image_feature"], - "grid_thw": ["grid_feature"], - "image_type_ids": [0], - "cur_position": 100, - } - - max_model_len = 50 - result = self.processor.process_request_dict(request, max_model_len) - # 验证是否截断到 max_model_len - 1 - self.assertEqual(len(result.prompt_token_ids), max_model_len - 1) - self.assertEqual(result.prompt_token_ids, list(range(49))) - # 验证原始输入长度确实超过了限制 - self.assertGreater(len(self.processor.processor.text2ids.return_value["input_ids"]), max_model_len) - - def test_parse_processor_kwargs(self): - """测试处理器参数解析""" - valid_kwargs = {"video_max_frames": 10, "video_min_frames": 1} - result = self.processor._parse_processor_kwargs(valid_kwargs) - self.assertEqual(result, valid_kwargs) - - # 测试无效参数 - invalid_kwargs = {"video_max_frames": "invalid"} - with patch( - "fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor.data_processor_logger" - ) as mock_logger: - result = self.processor._parse_processor_kwargs(invalid_kwargs) - self.assertEqual(result, {}) - # 确认警告已被记录 - mock_logger.warning.assert_called() - - def test_parse_limits(self): - """测试输入限制解析""" - custom_limits = {"image": 2, "video": 3} - result = self.processor._parse_limits(custom_limits) - self.assertEqual(result["image"], 2) - self.assertEqual(result["video"], 3) - self.assertEqual(result["audio"], 1) # 默认值 - - def test_check_mm_limits(self): - """测试多模态输入限制检查 (dict path)""" - # 测试不超限 - item = {"image": ["image1"], "video": ["video1"]} - self.processor._check_mm_limits(item) - - # 测试超限 - item_exceeded = {"image": ["image1", "image2"], "video": ["video1"]} - with self.assertRaises(ValueError): - self.processor._check_mm_limits(item_exceeded) - - def test_parse_processor_kwargs_invalid_type(self): - """测试 _parse_processor_kwargs 传入非字典类型""" - invalid_input = ["video_max_frames", 10] - with patch( - "fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor.data_processor_logger" - ) as mock_logger: - result = self.processor._parse_processor_kwargs(invalid_input) - self.assertEqual(result, {}) # 触发 - mock_logger.warning.assert_called() - - def test_parse_limits_invalid_type(self): - """测试 _parse_limits 传入非字典类型""" - invalid_input = ["image", 2] - with patch( - "fastdeploy.input.v1.paddleocr_vl_processor.paddleocr_vl_processor.data_processor_logger" - ) as mock_logger: - result = self.processor._parse_limits(invalid_input) - # 应返回默认值 - self.assertEqual(result, {"image": 1, "video": 1, "audio": 1}) - mock_logger.warning.assert_called() - - def test_check_mm_limits_messages_path(self): - """测试 _check_mm_limits (messages path)""" - messages = [ - {"role": "user", "content": [{"type": "text", "text": "Hello"}, {"type": "image_url", "url": "image1"}]} - ] - self.processor._check_mm_limits(messages) # 不应抛出异常 - - def test_check_mm_limits_messages_exceeded(self): - """测试 _check_mm_limits (messages path) 超限""" - messages = [ - { - "role": "user", - "content": [ - {"type": "text", "text": "Hello"}, - {"type": "image_url", "url": "image1"}, - {"type": "image_url", "url": "image2"}, # 超过限制 1 - ], - } - ] - with self.assertRaises(ValueError): - self.processor._check_mm_limits(messages) - - def test_process_request_dict_no_prompt_or_messages(self): - """测试当请求既没有 prompt 也没有 messages 时抛出异常""" - request = {"request_id": "test_0", "metadata": {"generated_token_ids": []}} - request = Request.from_dict(request) - with self.assertRaises(ValueError): - self.processor.process_request_dict(request, max_model_len=512) - - def test_process_request_dict_with_continuation(self): - """测试续写逻辑 (metadata 包含 generated_token_ids)""" - request = { - "request_id": "test_0", - "prompt": "test prompt", - "multimodal_data": {"image": ["image1"]}, - "metadata": {"generated_token_ids": [10, 11, 12]}, # 已生成的 token - } - request = Request.from_dict(request) - setattr(request, "metadata", {"generated_token_ids": [10, 11, 12]}) - - result = self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(result.prompt_token_ids, [1, 2, 3, 10, 11, 12]) - self.assertEqual(result.prompt_token_ids_len, 6) - - def test_process_request_dict_with_stop_sequences(self): - """测试 stop_sequences 处理""" - request = { - "request_id": "test_0", - "prompt": "test prompt", - "stop": ["stop1", "stop2"], - "metadata": {"generated_token_ids": []}, - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, max_model_len=512) - - # 验证 update_stop_seq 被调用 - self.processor.update_stop_seq.assert_called_with(["stop1", "stop2"]) - # 验证结果被设置到 request 中 - self.assertEqual(result.sampling_params.stop_token_ids, [[99, 98]]) - self.assertEqual(result.sampling_params.stop_seqs_len, [2]) - - def test_process_request_dictefault_max_tokens(self): - """测试默认 max_tokens 计算""" - request = { - "request_id": "test_0", - "prompt": "test prompt", - "metadata": {"generated_token_ids": []}, - } # 长度为 3 - request = Request.from_dict(request) - max_model_len = 10 - result = self.processor.process_request_dict(request, max_model_len) - - self.assertEqual(result.sampling_params.max_tokens, 7) - - def test_process_request_dict_top_p_clamping(self): - """测试 top_p 值被修正 (clamping)""" - request = { - "request_id": "test_0", - "prompt": "test prompt", - "top_p": 0.0, # 低于 _SAMPLING_EPS - "metadata": {"generated_token_ids": []}, - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(result.sampling_params.top_p, self.processor._SAMPLING_EPS) - - def test_append_generated_tokens(self): - """直接测试 append_generated_tokens 辅助函数""" - # : position_ids 必须是 [2D array] - multimodal_inputs = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)], - "cur_position": 3, - } - generated_token_ids = [10, 11] - - # 调用 append_generated_tokens (它是 PaddleOCRVLProcessor 的方法) - PaddleOCRVLProcessor.append_generated_tokens(self.processor, multimodal_inputs, generated_token_ids) - - self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 10, 11]) - self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0]) - # : 检查 position_ids 是否为 [np.array(...), np.array(...)] - self.assertEqual(len(multimodal_inputs["position_ids"]), 2) - self.assertTrue(np.array_equal(multimodal_inputs["position_ids"][0], np.array([[0, 1, 2]], dtype=np.int64))) - self.assertTrue(np.array_equal(multimodal_inputs["position_ids"][1], np.array([[3, 4]], dtype=np.int64))) - self.assertEqual(multimodal_inputs["cur_position"], 5) - - def test_pack_outputs_real_no_images(self): - """测试真实的 pack_outputs 方法 (无图像)""" - outputs = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - # : position_ids 必须是 [2D array] - "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)], - "images": [], # 空列表 - "grid_thw": [], - "image_type_ids": [], - "cur_position": 3, - } - - # 调用真实的类方法,而不是 setUp 中 mock 的实例方法 - result = PaddleOCRVLProcessor.pack_outputs(self.processor, outputs) - - self.assertIsNone(result["images"]) - self.assertIsNone(result["grid_thw"]) - self.assertIsNone(result["image_type_ids"]) - self.assertTrue(np.array_equal(result["input_ids"], np.array([1, 2, 3], dtype=np.int64))) - # 验证 position_ids 被 concatenate 和 transpose - # input: [array([[0, 1, 2]])] -> concat: array([[0, 1, 2]]) (shape 1,3) -> transpose: array([[0], [1], [2]]) (shape 3,1) - self.assertTrue(np.array_equal(result["position_ids"], np.array([[0], [1], [2]], dtype=np.int64))) - self.assertEqual(result["image_patch_id"], 100) - self.assertEqual(result["video_patch_id"], 101) - - def test_pack_outputs_real_with_images(self): - """测试真实的 pack_outputs 方法 (有图像)""" - image_feature = np.array([[0.1, 0.2]]) - grid_feature = np.array([[1, 2, 3]]) - - outputs = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - # : position_ids 必须是 [2D array] - "position_ids": [np.array([[0, 1, 2]], dtype=np.int64)], - "images": [image_feature], - "grid_thw": [grid_feature], - "image_type_ids": [0], - "cur_position": 3, - } - - result = PaddleOCRVLProcessor.pack_outputs(self.processor, outputs) - - self.assertTrue(np.array_equal(result["images"], image_feature)) - self.assertTrue(np.array_equal(result["grid_thw"], grid_feature)) - self.assertTrue(np.array_equal(result["image_type_ids"], np.array([0]))) - self.assertTrue(np.array_equal(result["position_ids"], np.array([[0], [1], [2]], dtype=np.int64))) - - def test_think_status(self): - """测试 思考机制""" - request = { - "prompt": "hello", - "request_id": "test_1", - "prompt_token_ids": [1, 2, 3], - } - request = Request.from_dict(request) - self.processor.reasoning_parser = MagicMock() - self.processor.reasoning_parser.get_model_status.return_value = "think_start" - self.processor.model_status_dict = {} - self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(request.enable_thinking, True) - - request = { - "prompt": "hello", - "request_id": "test_2", - "prompt_token_ids": [1, 2, 3], - } - request = Request.from_dict(request) - self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(request.enable_thinking, True) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/v1/test_process_video.py b/tests/input/v1/test_process_video.py deleted file mode 100644 index b8777cb0a76..00000000000 --- a/tests/input/v1/test_process_video.py +++ /dev/null @@ -1,386 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import io -import math -import os -import tempfile -import unittest -from unittest.mock import patch - -import numpy as np -from PIL import Image as PILImage - -import fastdeploy.input.v1.ernie4_5_vl_processor.process_video as process_video_module -from fastdeploy.input.v1.ernie4_5_vl_processor.process_video import ( - get_frame_indices, - read_frames_decord, - read_video_decord, -) - - -class _MockFrame: - """Lightweight frame wrapper that mimics the real frame object.""" - - def __init__(self, arr): - self._arr = arr - - def asnumpy(self): - """Return the underlying numpy array.""" - return self._arr - - -class MockVideoReaderWrapper: - """ - Simple mock implementation of a video reader: - - - __len__ returns the total number of frames - - __getitem__ returns a _MockFrame(arr) - - get_avg_fps() returns fps - - Specific indices can be configured to raise errors in __getitem__ - """ - - def __init__( - self, - src, - num_threads=1, - vlen=12, - fps=6, - fail_indices=None, - h=4, - w=5, - c=3, - ): - self.src = src - self._vlen = vlen - self._fps = fps - self._fail = set(fail_indices or []) - self._h, self._w, self._c = h, w, c - - def __len__(self): - return self._vlen - - def get_avg_fps(self): - return self._fps - - def __getitem__(self, idx): - if idx < 0 or idx >= self._vlen: - raise IndexError("index out of range") - if idx in self._fail: - raise ValueError(f"forced fail at {idx}") - # Create a frame whose pixel value encodes the index (for easy debugging) - arr = np.zeros((self._h, self._w, self._c), dtype=np.uint8) - arr[:] = idx % 255 - return _MockFrame(arr) - - -class TestReadVideoDecord(unittest.TestCase): - def test_read_video_decord_with_wrapper(self): - """Test passing an existing VideoReaderWrapper instance directly.""" - # Patch VideoReaderWrapper in the target module so isinstance checks use our mock class - with patch.object(process_video_module, "VideoReaderWrapper", MockVideoReaderWrapper): - mock_reader = MockVideoReaderWrapper("dummy", vlen=10, fps=5) - reader, meta, path = read_video_decord(mock_reader, save_to_disk=False) - - self.assertIs(reader, mock_reader) - self.assertEqual(meta["fps"], 5) - self.assertEqual(meta["num_of_frame"], 10) - self.assertTrue(math.isclose(meta["duration"], 10 / 5, rel_tol=1e-6)) - # The original reader object should be returned unchanged - self.assertIs(path, mock_reader) - - def test_read_video_decord_with_bytes(self): - """Test that bytes input is wrapped into BytesIO and passed to VideoReaderWrapper.""" - with patch.object(process_video_module, "VideoReaderWrapper", MockVideoReaderWrapper): - data = b"\x00\x01\x02\x03" - reader, meta, path = read_video_decord(data, save_to_disk=False) - - self.assertIsInstance(reader, MockVideoReaderWrapper) - self.assertEqual(meta["fps"], 6) - self.assertEqual(meta["num_of_frame"], 12) - self.assertTrue(math.isclose(meta["duration"], 12 / 6, rel_tol=1e-6)) - self.assertIsInstance(path, io.BytesIO) - - -class TestGetFrameIndices(unittest.TestCase): - def test_by_target_frames_middle(self): - """Test target_frames mode with 'middle' sampling strategy.""" - vlen = 12 - out = get_frame_indices( - vlen=vlen, - target_frames=4, - target_fps=-1, - frames_sample="middle", - input_fps=-1, - ) - # 12 frames split into 4 segments -> midpoints [1, 4, 7, 10] - self.assertEqual(out, [1, 4, 7, 10]) - - def test_by_target_frames_leading(self): - """Test target_frames mode with 'leading' sampling strategy.""" - vlen = 10 - out = get_frame_indices( - vlen=vlen, - target_frames=5, - target_fps=-1, - frames_sample="leading", - input_fps=-1, - ) - # 10 frames split into 5 segments -> segment starts [0, 2, 4, 6, 8] - self.assertEqual(out, [0, 2, 4, 6, 8]) - - def test_by_target_frames_rand(self): - """Test target_frames mode with 'rand' sampling strategy.""" - vlen = 10 - out = get_frame_indices( - vlen=vlen, - target_frames=4, - target_fps=-1, - frames_sample="rand", - input_fps=-1, - ) - self.assertEqual(len(out), 4) - self.assertTrue(all(0 <= i < vlen for i in out)) - - def test_by_target_frames_fix_start(self): - """Test target_frames mode with a fixed start offset.""" - vlen = 10 - out = get_frame_indices( - vlen=vlen, - target_frames=5, - target_fps=-1, - frames_sample="middle", # overridden by fix_start - fix_start=1, - input_fps=-1, - ) - # Segment starts [0, 2, 4, 6, 8] -> +1 => [1, 3, 5, 7, 9] - self.assertEqual(out, [1, 3, 5, 7, 9]) - - def test_target_frames_greater_than_vlen(self): - """Test that target_frames > vlen falls back to using vlen samples.""" - vlen = 5 - out = get_frame_indices( - vlen=vlen, - target_frames=10, - target_fps=-1, - frames_sample="middle", - input_fps=-1, - ) - self.assertEqual(len(out), vlen) - self.assertTrue(all(0 <= i < vlen for i in out)) - - def test_by_target_fps_middle(self): - """Test target_fps mode with 'middle' sampling strategy.""" - vlen, in_fps = 12, 6 - out = get_frame_indices( - vlen=vlen, - target_frames=-1, - target_fps=2, - frames_sample="middle", - input_fps=in_fps, - ) - # Roughly 4 frames expected - self.assertTrue(3 <= len(out) <= 5) - self.assertTrue(all(0 <= i < vlen for i in out)) - - def test_by_target_fps_leading(self): - """Test target_fps mode with 'leading' sampling strategy.""" - vlen, in_fps = 12, 6 - out = get_frame_indices( - vlen=vlen, - target_frames=-1, - target_fps=2, - frames_sample="leading", - input_fps=in_fps, - ) - self.assertTrue(3 <= len(out) <= 5) - self.assertTrue(all(0 <= i < vlen for i in out)) - - def test_by_target_fps_rand(self): - """Test target_fps mode with 'rand' sampling strategy.""" - vlen, in_fps = 12, 6 - out = get_frame_indices( - vlen=vlen, - target_frames=-1, - target_fps=2, - frames_sample="rand", - input_fps=in_fps, - ) - self.assertTrue(3 <= len(out) <= 5) - self.assertTrue(all(0 <= i < vlen for i in out)) - - def test_invalid_both_negative(self): - """Test that both target_frames and target_fps being negative raises ValueError.""" - with self.assertRaises(ValueError): - get_frame_indices( - vlen=10, - target_frames=-1, - target_fps=-1, - frames_sample="middle", - ) - - def test_invalid_both_specified(self): - """Test that specifying both target_frames and target_fps raises AssertionError.""" - with self.assertRaises(AssertionError): - get_frame_indices( - vlen=10, - target_frames=4, - target_fps=2, - frames_sample="middle", - input_fps=6, - ) - - def test_invalid_target_fps_missing_input(self): - """Test that target_fps > 0 with invalid input_fps raises AssertionError.""" - with self.assertRaises(AssertionError): - get_frame_indices( - vlen=10, - target_frames=-1, - target_fps=2, - frames_sample="middle", - input_fps=-1, - ) - - -class TestReadFramesDecord(unittest.TestCase): - def test_basic_read_no_save(self): - """Test normal frame reading without saving to disk.""" - reader = MockVideoReaderWrapper("dummy", vlen=8, fps=4) - meta = {"fps": 4, "duration": 8 / 4, "num_of_frame": 8} - - ret, idxs, ts = read_frames_decord( - video_path="dummy", - video_reader=reader, - video_meta=meta, - target_frames=4, - frames_sample="middle", - save_to_disk=False, - ) - - # Should return 4 PIL.Image instances - self.assertEqual(len(ret), 4) - for img in ret: - self.assertIsInstance(img, PILImage.Image) - - self.assertEqual(idxs, [0, 2, 4, 6]) - dur = meta["duration"] - n = meta["num_of_frame"] - for i, t in zip(idxs, ts): - self.assertTrue(math.isclose(t, i * dur / n, rel_tol=1e-6)) - - def test_read_and_save_to_disk(self): - """Test reading frames and saving them as PNG files on disk.""" - reader = MockVideoReaderWrapper("dummy", vlen=4, fps=2) - meta = {"fps": 2, "duration": 4 / 2, "num_of_frame": 4} - - with ( - tempfile.TemporaryDirectory() as tmpdir, - patch.object( - process_video_module, - "get_filename", - return_value="det_id", - ), - ): - ret, idxs, ts = read_frames_decord( - video_path="dummy", - video_reader=reader, - video_meta=meta, - target_frames=2, - frames_sample="leading", - save_to_disk=True, - cache_dir=tmpdir, - ) - - self.assertEqual(len(ret), 2) - for i, pth in enumerate(ret): - self.assertIsInstance(pth, str) - self.assertTrue(os.path.exists(pth)) - self.assertEqual(os.path.basename(pth), f"{i}.png") - - def test_fallback_previous_success(self): - """Test that a failed frame read falls back to a previous valid frame when possible.""" - reader = MockVideoReaderWrapper("dummy", vlen=10, fps=5, fail_indices={3}) - meta = {"fps": 5, "duration": 10 / 5, "num_of_frame": 10} - idxs = [1, 2, 3, 6] - - ret, new_idxs, ts = read_frames_decord( - video_path="dummy", - video_reader=reader, - video_meta=meta, - frame_indices=idxs.copy(), - save_to_disk=False, - tol=5, - ) - - # Index 3 fails and should be replaced by 2 or 4 (previous/next search) - self.assertIn(new_idxs[2], (2, 4)) - self.assertEqual(len(ret), 4) - - def test_fallback_next_when_prev_fails(self): - """Test that when current and previous frames fail, a later frame is used as fallback.""" - reader = MockVideoReaderWrapper("dummy", vlen=10, fps=5, fail_indices={2, 3}) - meta = {"fps": 5, "duration": 10 / 5, "num_of_frame": 10} - idxs = [1, 2, 3, 6] - - ret, new_idxs, ts = read_frames_decord( - video_path="dummy", - video_reader=reader, - video_meta=meta, - frame_indices=idxs.copy(), - save_to_disk=False, - tol=5, - ) - - # Frame 3 should eventually be replaced by 4 - self.assertEqual(new_idxs[2], 4) - self.assertEqual(len(ret), 4) - - def test_len_assert_when_no_fallback(self): - """Test that assertion is triggered when no valid fallback frame can be found.""" - - class FailAllAroundReader(MockVideoReaderWrapper): - """Reader that fails on index 1 and has too small length to find fallback.""" - - def __init__(self, *a, **kw): - super().__init__(*a, **kw) - self._vlen = 2 - self._fps = 2 - self._fail = {1} - - def __getitem__(self, idx): - if idx in self._fail: - raise ValueError("fail hard") - return super().__getitem__(idx) - - reader = FailAllAroundReader("dummy") - meta = {"fps": 2, "duration": 2 / 2, "num_of_frame": 2} - - # Request 2 frames: index 0 succeeds, index 1 always fails, - # and tol=0 disallows searching neighbors -> stack and length assertion should fail - with self.assertRaises(AssertionError): - read_frames_decord( - video_path="dummy", - video_reader=reader, - video_meta=meta, - target_frames=2, - frames_sample="leading", - save_to_disk=False, - tol=0, - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/v1/test_qwen3_vl_processor.py b/tests/input/v1/test_qwen3_vl_processor.py deleted file mode 100644 index c858f1e51db..00000000000 --- a/tests/input/v1/test_qwen3_vl_processor.py +++ /dev/null @@ -1,1172 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import copy -import unittest -from unittest.mock import MagicMock, patch - -import numpy as np -from PIL import Image - -from fastdeploy.engine.request import Request -from fastdeploy.input.v1.qwen3_vl_processor import Qwen3VLProcessor -from fastdeploy.input.v1.qwen3_vl_processor.process import sample_frames - - -def mock_pil_image(height, width): - """ - Generate mock random RGB image - - Args: - height: Image height in pixels - width: Image width in pixels - - Returns: - PIL.Image object with random RGB data - """ - rgb_image = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) - return Image.fromarray(rgb_image) - - -def mock_read_frames(height: int, width: int, nums_frame: int, fps: int): - """ - Generate mock video frames with metadata for testing purposes - - Creates synthetic video data by generating random RGB frames and constructing - corresponding metadata to simulate real video processing. - - Args: - height (int): Height of video frames in pixels - width (int): Width of video frames in pixels - nums_frame (int): Number of frames to generate - fps (int): Frames per second for the mock video - - Returns: - tuple: A tuple containing: - frames (numpy.ndarray): Array of shape (nums_frame, height, width, 3) - containing randomly generated RGB frames - meta (dict): Dictionary with video metadata: - - fps (int): Frames per second (same as input) - - duration (float): Calculated duration in seconds (nums_frame/fps) - - num_of_frame (int): Number of frames (same as nums_frame input) - """ - frames = [] - for _ in range(nums_frame): - frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) - frames.append(frame) - frames = np.stack(frames, axis=0) - - meta = { - "fps": fps, - "duration": nums_frame / fps, - "num_of_frame": nums_frame, - } - return frames, meta - - -class TestQwen3VLProcessor(unittest.TestCase): - """ - Unit tests for Qwen Vision-Language Processor functionality - """ - - def setUp(self): - """ - Initialize test case with: - - Mock configuration - - Patched message parsing and video processing methods - - QwenVLProcessor instance with test parameters - """ - config = MagicMock() - config.vision_config.tokens_per_second = 2 - - self.patcher_parse_image = patch( - "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_image", return_value=mock_pil_image(480, 640) - ) - self.patcher_parse_image.start() - - self.patcher_parse_video = patch( - "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_video", return_value=b"123" - ) - self.patcher_parse_video.start() - - self.patcher_read_frames = patch( - "fastdeploy.input.v1.qwen3_vl_processor.process.DataProcessor._load_and_process_video", - return_value=mock_read_frames(480, 640, 5, 2), - ) - self.patcher_read_frames.start() - - mm_processor_kwargs = {"video_max_frames": 10, "video_min_frames": 1} - limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} - - self.model_name_or_path = "/ModelData/Qwen3-VL-4B-Instruct" - self.processor = Qwen3VLProcessor( - config=config, - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=limit_mm_per_prompt, - mm_processor_kwargs=mm_processor_kwargs, - reasoning_parser_obj=None, - tool_parser_obj=None, - ) - - def tearDown(self) -> None: - """Clean up test case by stopping all mock patches""" - self.patcher_read_frames.stop() - self.patcher_parse_image.stop() - self.patcher_parse_video.stop() - - def test_process_request_dict(self): - """ - Test processing of dictionary-format request with multimodal input - - Validates: - 1. Token ID lengths match position_ids and token_type_ids shapes - 2. Image processing produces expected output dimensions - 3. Video processing produces expected output dimensions - 4. Correct counts for images (1) and videos (1) - """ - num_completion_token_ids = 10 - request = { - "request_id": "12345", - "completion_token_ids": [1] * num_completion_token_ids, - "stop": ["stop", "eof"], - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - request = Request.from_dict(request) - - result = self.processor.process_request_dict(request, 1024 * 100) - - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0]) - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0]) - self.assertEqual( - result.multimodal_inputs["images"].shape[0], - sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])), - ) - self.assertEqual( - result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum() - ) - - def test_prompt(self): - """ - Test processing of prompt with image and video placeholders - - Validates: - 1. Token ID lengths match position_ids and token_type_ids shapes - 2. Image processing produces expected output dimensions - 3. Video processing produces expected output dimensions - 4. Correct counts for images (1) and videos (1) - """ - IMAGE_PLACEHOLDER = "<|image_pad|>" - VIDEO_PLACEHOLDER = "<|video_pad|>" - prompt = { - "request_id": "12345", - "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.", - "multimodal_data": { - "image": [mock_pil_image(10, 2100)], - "video": [{"video": b"123", "fps": 5}], - }, - } - - request = Request.from_dict(prompt) - result = self.processor.process_request_dict(request, 1024 * 100) - - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0]) - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0]) - self.assertEqual( - result.multimodal_inputs["images"].shape[0], - sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])), - ) - self.assertEqual( - result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum() - ) - - def test_message_and_prompt(self): - """ - Test consistency between message-based and prompt-based processing - - Validates that processing a request through: - 1. The message format (with image/video URLs) - 2. The prompt format (with direct image/video data) - produces identical tokenization and multimodal input results. - - Checks: - 1. Prompt token IDs match between both processing methods - 2. Grid dimensions (THW) match between both methods - 3. Position IDs match between both methods - """ - # Create test request in message format - request = { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024 * 100) - - # Create equivalent request in prompt format - prompt = { - "request_id": "12345", - "prompt": request.prompt_tokens, - "multimodal_data": { - "image": [mock_pil_image(480, 640)], - "video": [{"video": b"123"}], - }, - } - request2 = Request.from_dict(prompt) - result2 = self.processor.process_request_dict(request2, 1024 * 100) - - # Verify both processing methods produce identical results - self.assertEqual(result.prompt_token_ids, result2.prompt_token_ids) - self.assertTrue(np.equal(result.multimodal_inputs["grid_thw"], result2.multimodal_inputs["grid_thw"]).all()) - self.assertTrue( - np.equal(result.multimodal_inputs["position_ids"], result2.multimodal_inputs["position_ids"]).all() - ) - - def test_apply_chat_template(self): - """ - Test the consistency between: - 1. Directly applying chat template using HuggingFace tokenizer - 2. Applying chat template through the processor's request processing - - This test verifies that: - - The processor correctly handles multimodal messages (image, video, text) - - The prompt_tokens field matches the output from direct tokenizer application - - The chat template application preserves the message structure and content - - Test Steps: - 1. Create sample multimodal messages with image, video and text content - 2. Apply chat template directly using the tokenizer - 3. Process the same messages through the processor - 4. Compare the outputs to ensure consistency - """ - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path) - - # Sample multimodal messages containing image, video and text - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video", "video": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ] - - # Apply chat template directly using the tokenizer - prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - - # Create equivalent request dictionary - request = { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - request = Request.from_dict(request) - - # Process request through the processor - self.processor.process_request_dict(request, 1024 * 100) - prompt2 = request.prompt_tokens - - # Verify both methods produce identical prompt strings - self.assertEqual(prompt, prompt2) - - def test_add_processed_image(self): - """ - Test DataProcessor._add_processed_image via Qwen3VLProcessor - """ - merge_size = self.processor.processor.image_processor.merge_size - - # shape[0] must be divisible by merge_size^2 - num_tokens = 4 - img = np.zeros( - (num_tokens * merge_size * merge_size, 3, 3), - dtype=np.float32, - ) - meta = { - "thw": (1, 8, 8), - } - uuid = "test-image-uuid" - - img_cache = (img, meta) - - outputs = { - "mm_positions": [], - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "cur_position": 5, - "images": [], - "mm_hashes": [], - "grid_thw": [], - "image_type_ids": [], - "fps": [], - } - - # ----------------------- - # mock vision position computation - # ----------------------- - dp = self.processor.processor - dp._compute_vision_positions = MagicMock(return_value=np.array([[10, 11, 12]], dtype=np.int64)) - - dp._add_processed_image(img_cache, outputs, uuid) - - # ---- input_ids / token_type_ids ---- - self.assertEqual(len(outputs["input_ids"]), num_tokens) - self.assertEqual( - outputs["input_ids"], - [dp.image_token_id] * num_tokens, - ) - - # ---- mm_positions ---- - self.assertEqual(len(outputs["mm_positions"]), 1) - mm_pos = outputs["mm_positions"][0] - self.assertEqual(mm_pos.length, num_tokens) - - # ---- vision positions ---- - dp._compute_vision_positions.assert_called_once_with(5, 1, 8, 8, 0) - np.testing.assert_array_equal( - outputs["position_ids"][0], - np.array([[10, 11, 12]], dtype=np.int64), - ) - self.assertEqual(outputs["cur_position"], 13) - - # ---- image payload ---- - self.assertEqual(len(outputs["images"]), 1) - np.testing.assert_array_equal(outputs["images"][0], img) - - self.assertEqual(outputs["mm_hashes"], [uuid]) - np.testing.assert_array_equal( - outputs["grid_thw"][0], - np.array([[1, 8, 8]]), - ) - self.assertEqual(outputs["image_type_ids"], [0]) - self.assertEqual(outputs["fps"], [0]) - - def test_multimodal_token_len_validation(self): - """Test token_len validation for raw and processed multimodal paths""" - dp = self.processor.processor - merge_size = dp.image_processor.merge_size - - def build_outputs(image=False, video=False): - outputs = { - "mm_positions": [], - "input_ids": [], - "token_type_ids": [], - "position_ids": [], - "cur_position": 0, - "images": [], - "mm_hashes": [], - "grid_thw": [], - "image_type_ids": [], - "fps": [], - } - if image: - outputs["num_input_image_tokens"] = 0 - if video: - outputs["num_input_video_tokens"] = 0 - return outputs - - processed_image = ( - np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32), - {"thw": (1, 8, 8)}, - ) - processed_video = ( - np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32), - {"thw": (2, 8, 8), "fps": 5}, - ) - - with self.subTest("add_image"): - with patch.object( - dp.image_processor, - "preprocess", - return_value={ - "grid_thw": np.array([1, merge_size * 2, merge_size * 2]), - "pixel_values": np.zeros((1, 3, 3), dtype=np.float32), - }, - ): - with self.assertRaisesRegex(ValueError, "image tokens num not match the size"): - dp._add_image(mock_pil_image(32, 32), build_outputs(image=True), None, token_len=3) - - with self.subTest("add_processed_image"): - with self.assertRaisesRegex(ValueError, "image tokens num not match the size"): - dp._add_processed_image(processed_image, build_outputs(), "uuid", token_len=2) - - with self.subTest("add_video"): - with patch.object( - dp.image_processor, - "preprocess", - return_value={ - "grid_thw": np.array([1, merge_size * 2, merge_size * 2]), - "pixel_values": np.zeros((1, 3, 3), dtype=np.float32), - }, - ): - with self.assertRaisesRegex(ValueError, "video tokens num not match the size"): - dp._add_video( - np.zeros((2, 4, 4, 3), dtype=np.uint8), - {"fps": 4}, - build_outputs(video=True), - None, - token_len=3, - ) - - with self.subTest("add_processed_video"): - with self.assertRaisesRegex(ValueError, "video tokens num not match the size"): - dp._add_processed_video(processed_video, build_outputs(), "uuid", token_len=2) - - def test_prompt_token_ids2outputs_error_branches(self): - """Test prompt_token_ids2outputs error branches with minimal fixtures""" - dp = self.processor.processor - request = Request.from_dict( - { - "request_id": "12345", - "prompt_token_ids": [dp.image_token_id], - "messages": [{"role": "user", "content": [{"type": "image_url", "uuid": "missing-image"}]}], - } - ) - parsed_messages = [{"role": "user", "content": {"type": "image", "data": None, "uuid": "missing-image"}}] - - with self.subTest("missing_without_cache"): - with patch( - "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages - ): - with self.assertRaisesRegex(ValueError, "Missing items cannot be retrieved without processor cache."): - dp.prompt_token_ids2outputs(request) - - with self.subTest("missing_cache_item_not_found"): - old_enable_processor_cache = dp.enable_processor_cache - dp.enable_processor_cache = True - fake_context = MagicMock() - fake_context.socket.return_value = MagicMock() - try: - with patch( - "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages - ): - with patch( - "fastdeploy.input.v1.qwen3_vl_processor.process.zmq.Context", return_value=fake_context - ): - with patch.object(dp, "get_processor_cache", return_value=[None]): - with self.assertRaisesRegex(ValueError, "Missing item 0 not found in processor cache"): - dp.prompt_token_ids2outputs(request) - finally: - dp.enable_processor_cache = old_enable_processor_cache - - with self.subTest("unexpected_multimodal_type"): - - class FlakyTypeItem: - def __init__(self): - self.calls = 0 - - def get(self, key, default=None): - if key == "type": - self.calls += 1 - return "image" if self.calls == 1 else "audio" - if key == "data": - return "bad-data" - if key == "uuid": - return "bad-uuid" - return default - - parsed_messages = [{"role": "user", "content": FlakyTypeItem()}] - with patch( - "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages - ): - with self.assertRaisesRegex(ValueError, "Unsupported multimodal type: audio"): - dp.prompt_token_ids2outputs(request) - - def test_prompt_token_ids2outputs_cache_update_paths(self): - """Test prompt_token_ids2outputs cache update for missing, 1D and 2D grid_thw paths""" - dp = self.processor.processor - merge_size = dp.image_processor.merge_size - old_enable_processor_cache = dp.enable_processor_cache - dp.enable_processor_cache = True - - missing_image = ( - np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32), - {"thw": (1, 8, 8)}, - ) - processed_video = ( - np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32), - {"thw": (2, 8, 8), "fps": 6}, - ) - parsed_messages = [ - { - "role": "user", - "content": [ - {"type": "image", "data": None, "uuid": "missing-image"}, - {"type": "video", "data": {"video": "raw-video", "fps": 4}, "uuid": "raw-video"}, - {"type": "video", "data": processed_video, "uuid": "processed-video"}, - ], - } - ] - request = Request.from_dict( - { - "request_id": "12345", - "prompt_token_ids": [dp.image_token_id, 99, dp.image_token_id, 98, dp.image_token_id], - "messages": [{"role": "user", "content": [{"type": "text", "text": "unused"}]}], - } - ) - fake_socket = MagicMock() - fake_context = MagicMock() - fake_context.socket.return_value = fake_socket - - try: - with patch( - "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages - ): - with patch("fastdeploy.input.v1.qwen3_vl_processor.process.zmq.Context", return_value=fake_context): - with patch.object(dp, "_compute_vision_positions", return_value=np.array([[0]], dtype=np.int64)): - with patch.object( - dp.image_processor, - "preprocess", - return_value={ - "grid_thw": np.array([1, merge_size, merge_size]), - "pixel_values": np.zeros((1, 3, 3), dtype=np.float32), - }, - ): - with patch.object( - dp, "_load_and_process_video", return_value=mock_read_frames(4, 4, 2, 4) - ): - with patch.object( - dp, "get_processor_cache", return_value=[missing_image] - ) as cache_get: - with patch.object(dp, "update_processor_cache") as cache_update: - outputs = dp.prompt_token_ids2outputs(request) - - cache_get.assert_called_once_with(fake_socket, ["missing-image"]) - cache_update.assert_called_once() - _, cached_hashes, cached_items = cache_update.call_args.args - self.assertEqual(cached_hashes, ["raw-video", "processed-video"]) - self.assertEqual(cached_items[0][1]["thw"], (1, merge_size, merge_size)) - self.assertEqual(cached_items[1][1]["thw"], (2, 8, 8)) - self.assertEqual(outputs["mm_hashes"], ["missing-image", "raw-video", "processed-video"]) - self.assertEqual(outputs["input_ids"][-1], dp.image_token_id) - finally: - dp.enable_processor_cache = old_enable_processor_cache - - def test_request2ids_cache_update_paths(self): - """Test request2ids cache update for missing, 1D and 2D grid_thw paths""" - dp = self.processor.processor - merge_size = dp.image_processor.merge_size - old_enable_processor_cache = dp.enable_processor_cache - dp.enable_processor_cache = True - - missing_image = ( - np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32), - {"thw": (1, 8, 8)}, - ) - processed_image = ( - np.zeros((merge_size * merge_size, 3, 3), dtype=np.float32), - {"thw": (1, 8, 8)}, - ) - parsed_messages = [ - { - "role": "user", - "content": [ - {"type": "image", "data": None, "uuid": "missing-image"}, - {"type": "image", "data": processed_image, "uuid": "processed-image"}, - {"type": "video", "data": {"video": "raw-video", "fps": 4}, "uuid": "raw-video"}, - ], - } - ] - request = Request.from_dict( - { - "request_id": "12345", - "messages": [{"role": "user", "content": [{"type": "text", "text": "unused"}]}], - "add_generation_prompt": True, - } - ) - fake_socket = MagicMock() - fake_context = MagicMock() - fake_context.socket.return_value = fake_socket - - try: - with patch( - "fastdeploy.input.v1.qwen3_vl_processor.process.parse_chat_messages", return_value=parsed_messages - ): - with patch("fastdeploy.input.v1.qwen3_vl_processor.process.zmq.Context", return_value=fake_context): - with patch.object(dp, "_compute_vision_positions", return_value=np.array([[0]], dtype=np.int64)): - with patch.object( - dp.image_processor, - "preprocess", - return_value={ - "grid_thw": np.array([1, merge_size, merge_size]), - "pixel_values": np.zeros((1, 3, 3), dtype=np.float32), - }, - ): - with patch.object( - dp, "_load_and_process_video", return_value=mock_read_frames(4, 4, 2, 4) - ): - with patch.object( - dp, "get_processor_cache", return_value=[missing_image] - ) as cache_get: - with patch.object(dp, "update_processor_cache") as cache_update: - with patch.object( - self.processor.tokenizer, - "apply_chat_template", - return_value="<|image_pad|>a<|image_pad|>b<|video_pad|>", - ): - outputs = dp.request2ids(request) - - cache_get.assert_called_once_with(fake_socket, ["missing-image"]) - cache_update.assert_called_once() - _, cached_hashes, cached_items = cache_update.call_args.args - self.assertEqual(cached_hashes, ["processed-image", "raw-video"]) - self.assertEqual(cached_items[0][1]["thw"], (1, 8, 8)) - self.assertEqual(cached_items[1][1]["thw"], (1, merge_size, merge_size)) - self.assertEqual(outputs["mm_hashes"], ["missing-image", "processed-image", "raw-video"]) - finally: - dp.enable_processor_cache = old_enable_processor_cache - - def test_parse_processor_kwargs_valid(self): - """Test _parse_processor_kwargs with valid input""" - valid_kwargs = {"video_max_frames": 10, "video_min_frames": 1} - result = self.processor._parse_processor_kwargs(valid_kwargs) - self.assertEqual(result, valid_kwargs) - - def test_parse_processor_kwargs_empty(self): - """Test _parse_processor_kwargs with empty input""" - result = self.processor._parse_processor_kwargs(None) - self.assertEqual(result, {}) - - def test_parse_processor_kwargs_invalid_type(self): - """Test _parse_processor_kwargs with invalid type""" - result = self.processor._parse_processor_kwargs("invalid") - self.assertEqual(result, {}) - - def test_parse_processor_kwargs_invalid_value_type(self): - """Test _parse_processor_kwargs with invalid value type""" - invalid_kwargs = {"video_max_frames": "10"} # Should be int - result = self.processor._parse_processor_kwargs(invalid_kwargs) - self.assertEqual(result, {}) - - def test_parse_processor_kwargs_mixed_valid_invalid(self): - """Test _parse_processor_kwargs with mixed valid and invalid types""" - mixed_kwargs = {"video_max_frames": 10, "video_min_frames": "invalid"} - result = self.processor._parse_processor_kwargs(mixed_kwargs) - self.assertEqual(result, {}) - - def test_parse_limits_valid(self): - """Test _parse_limits with valid limits""" - limits = {"image": 2, "video": 3} - result = self.processor._parse_limits(limits) - expected = {"image": 2, "video": 3, "audio": 1} - self.assertEqual(result, expected) - - def test_parse_limits_empty(self): - """Test _parse_limits with empty input""" - result = self.processor._parse_limits(None) - expected = {"image": 1, "video": 1, "audio": 1} - self.assertEqual(result, expected) - - def test_parse_limits_invalid_type(self): - """Test _parse_limits with invalid type""" - result = self.processor._parse_limits("invalid") - expected = {"image": 1, "video": 1, "audio": 1} - self.assertEqual(result, expected) - - def test_parse_limits_partial(self): - """Test _parse_limits with partial limits""" - limits = {"image": 5} - result = self.processor._parse_limits(limits) - expected = {"image": 5, "video": 1, "audio": 1} - self.assertEqual(result, expected) - - def test_check_mm_limits_dict_valid(self): - """Test _check_mm_limits with valid dict input""" - mm_data = {"image": [mock_pil_image(10, 10)], "video": [{"video": b"123"}]} - # Should not raise exception - self.processor._check_mm_limits(mm_data) - - def test_check_mm_limits_dict_exceed_limit(self): - """Test _check_mm_limits when dict input exceeds limit""" - mm_data = {"image": [mock_pil_image(10, 10), mock_pil_image(10, 10)]} - with self.assertRaises(ValueError) as context: - self.processor._check_mm_limits(mm_data) - self.assertIn("Too many image items", str(context.exception)) - - def test_check_mm_limits_messages_valid(self): - """Test _check_mm_limits with valid messages input""" - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "text", "text": "Describe this image."}, - ], - } - ] - # Should not raise exception - self.processor._check_mm_limits(messages) - - def test_check_mm_limits_messages_exceed_limit(self): - """Test _check_mm_limits when messages input exceeds limit""" - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo1.jpeg"}}, - {"type": "image_url", "image_url": {"url": "file://demo2.jpeg"}}, - ], - } - ] - with self.assertRaises(ValueError) as context: - self.processor._check_mm_limits(messages) - self.assertIn("Too many image items", str(context.exception)) - - def test_check_mm_limits_video_exceed(self): - """Test _check_mm_limits when video exceeds limit""" - mm_data = {"video": [{"video": b"123"}, {"video": b"456"}]} - with self.assertRaises(ValueError) as context: - self.processor._check_mm_limits(mm_data) - self.assertIn("Too many video items", str(context.exception)) - - def test_process_request_dict_with_prompt(self): - """Test process_request_dict with prompt format""" - request = { - "request_id": "12345", - "prompt": "Test prompt", - "multimodal_data": {"image": [mock_pil_image(10, 10)]}, - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - self.assertGreater(len(result.prompt_token_ids), 0) - self.assertGreater(len(result.multimodal_inputs), 0) - - def test_process_request_dict_with_messages(self): - """Test process_request_dict with messages format""" - request = { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [{"type": "text", "text": "Hello"}], - } - ], - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - self.assertGreater(len(result.prompt_token_ids), 0) - self.assertGreater(len(result.multimodal_inputs), 0) - - def test_process_request_dict_with_prompt_token_ids_only(self): - """Test process_request_dict with prompt_token_ids only""" - request = Request.from_dict( - { - "request_id": "12345", - "prompt_token_ids": [1, 2, 3], - } - ) - result = self.processor.process_request_dict(request, 1024) - - self.assertEqual(result.prompt_token_ids, [1, 2, 3]) - self.assertEqual(result.prompt_token_ids_len, 3) - self.assertIsNone(result.multimodal_inputs["images"]) - self.assertEqual(result.multimodal_inputs["token_type_ids"].tolist(), [0, 0, 0]) - - def test_process_request_dict_with_prompt_token_ids_and_messages(self): - """Test process_request_dict with prompt_token_ids and multimodal messages""" - source_request = Request.from_dict( - { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - ) - source_result = self.processor.process_request_dict(source_request, 1024 * 100) - - token_request = Request.from_dict( - { - "request_id": "12345", - "prompt_token_ids": list(source_result.prompt_token_ids), - "messages": copy.deepcopy(source_request.messages), - } - ) - token_result = self.processor.process_request_dict(token_request, 1024 * 100) - - self.assertEqual(token_result.prompt_token_ids, source_result.prompt_token_ids) - self.assertTrue( - np.equal(token_result.multimodal_inputs["grid_thw"], source_result.multimodal_inputs["grid_thw"]).all() - ) - self.assertTrue( - np.equal( - token_result.multimodal_inputs["position_ids"], - source_result.multimodal_inputs["position_ids"], - ).all() - ) - self.assertTrue( - np.equal( - token_result.multimodal_inputs["image_type_ids"], - source_result.multimodal_inputs["image_type_ids"], - ).all() - ) - - def test_process_request_dict_prompt_token_ids_more_multimodal_segments_than_messages(self): - """Test prompt_token_ids path when token-side multimodal segments exceed messages""" - source_request = Request.from_dict( - { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - ) - source_result = self.processor.process_request_dict(source_request, 1024 * 100) - - token_request = Request.from_dict( - { - "request_id": "12345", - "prompt_token_ids": list(source_result.prompt_token_ids), - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - ) - - with self.assertRaisesRegex(ValueError, "more multimodal placeholder"): - self.processor.process_request_dict(token_request, 1024 * 100) - - def test_process_request_dict_prompt_token_ids_unused_multimodal_messages(self): - """Test prompt_token_ids path when messages have unused multimodal items""" - source_request = Request.from_dict( - { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "text", "text": "Describe image."}, - ], - } - ], - } - ) - source_result = self.processor.process_request_dict(source_request, 1024 * 100) - - token_request = Request.from_dict( - { - "request_id": "12345", - "prompt_token_ids": list(source_result.prompt_token_ids), - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image."}, - ], - } - ], - } - ) - - with self.assertRaisesRegex(ValueError, "number of multimodal items does not match"): - self.processor.process_request_dict(token_request, 1024 * 100) - - def test_process_request_dict_invalid_format(self): - """Test process_request_dict with invalid format""" - request = {"request_id": "12345"} - request = Request.from_dict(request) - with self.assertRaises(ValueError) as context: - self.processor.process_request_dict(request, 1024) - self.assertIn("must contain 'prompt', or 'messages'", str(context.exception)) - - def test_process_request_dict_with_bad_words(self): - """Test process_request_dict with bad_words""" - request = { - "request_id": "12345", - "prompt": "Test prompt", - "bad_words": ["bad", "word"], - "bad_words_token_ids": [100, 200], - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - # Verify bad_words_token_ids is set - self.assertIsNotNone(result.sampling_params.bad_words_token_ids) - - def test_process_request_dict_invalid_chat_template_kwargs(self): - """Test process_request_dict with invalid chat_template_kwargs""" - request = { - "request_id": "12345", - "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}], - } - request = Request.from_dict(request) - request.chat_template_kwargs = "invalid" - with self.assertRaises(ValueError) as context: - self.processor.process_request_dict(request, 1024) - self.assertIn("must be a dict", str(context.exception)) - - def test_process_request_dict_with_completion_token_ids(self): - """Test process_request_dict with completion_token_ids""" - request = {"request_id": "12345", "prompt": "Test"} - request = Request.from_dict(request) - request.completion_token_ids = [1, 2, 3] - result = self.processor.process_request_dict(request, 1024) - # Verify completion tokens are appended - self.assertGreater(len(result.prompt_token_ids), 3) - - def test_process_request_dict_prompt_truncation(self): - """Test process_request_dict with prompt truncation""" - # Create a long prompt that exceeds max_model_len - long_prompt = "Test " * 1000 - request = { - "request_id": "12345", - "prompt": long_prompt, - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 100) - # Verify prompt is truncated - self.assertLessEqual(len(result.prompt_token_ids), 99) - - def test_process_request_dict_default_max_tokens(self): - """Test process_request_dict sets default max_tokens""" - request = { - "request_id": "12345", - "prompt": "Test", - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - self.assertGreater(result.sampling_params.max_tokens, 0) - - def test_process_request_dict_enable_thinking_false(self): - """Test process_request_dict sets enable_thinking to False""" - request = { - "request_id": "12345", - "prompt": "Test", - "enable_thinking": True, - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - self.assertFalse(result.enable_thinking) - - def test_append_completion_tokens(self): - """Test append_completion_tokens method""" - multimodal_inputs = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], - "cur_position": 3, - } - completion_token_ids = [4, 5] - self.processor.append_completion_tokens(multimodal_inputs, completion_token_ids) - - self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 4, 5]) - self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0]) - self.assertEqual(multimodal_inputs["cur_position"], 5) - - def test_pack_outputs_with_images(self): - """Test pack_outputs with image data""" - outputs = { - "images": [np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])], - "grid_thw": [np.array([2, 2, 1]), np.array([2, 2, 1])], - "image_type_ids": [0, 1], - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], - } - result = self.processor.pack_outputs(outputs) - - self.assertIsNotNone(result["images"]) - self.assertIsNotNone(result["grid_thw"]) - self.assertIsNotNone(result["image_type_ids"]) - self.assertEqual(result["images"].shape[0], 4) - self.assertEqual(result["grid_thw"].shape[0], 2) - - def test_pack_outputs_without_images(self): - """Test pack_outputs without image data""" - outputs = { - "images": [], - "grid_thw": [], - "image_type_ids": [], - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], - } - result = self.processor.pack_outputs(outputs) - - # Test that image-related fields are None when no images - self.assertIsNone(result["images"]) - self.assertIsNone(result["grid_thw"]) - self.assertIsNone(result["image_type_ids"]) - - # Test data types - self.assertEqual(result["input_ids"].dtype, np.int64) - self.assertEqual(result["token_type_ids"].dtype, np.int64) - self.assertEqual(result["position_ids"].dtype, np.int64) - - # Test patch IDs are set - self.assertIn("image_patch_id", result) - self.assertIn("video_patch_id", result) - self.assertIn("mm_num_token_func", result) - - -class TestSampleFrames(unittest.TestCase): - """ - Unit tests for sample_frames function - """ - - def setUp(self): - self.metadata = { - "num_of_frame": 100, - "fps": 25, - } - - def test_fps_and_num_frames_mutually_exclusive(self): - with self.assertRaises(ValueError): - sample_frames( - frame_factor=4, - min_frames=8, - max_frames=32, - metadata=self.metadata, - fps=2, - num_frames=16, - ) - - def test_num_frames_round_to_factor(self): - indices = sample_frames( - frame_factor=4, - min_frames=8, - max_frames=64, - metadata=self.metadata, - num_frames=18, # round(18 / 4) * 4 = 16 - ) - - self.assertEqual(len(indices), 16) - self.assertEqual(indices[0], 0) - self.assertLess(indices[-1], self.metadata["num_of_frame"]) - - def test_fps_sampling_basic(self): - # total = 100 frames, fps=25, target fps=5 → 20 frames - indices = sample_frames( - frame_factor=4, - min_frames=8, - max_frames=64, - metadata=self.metadata, - fps=5, - ) - - self.assertEqual(len(indices), 20) - self.assertEqual(indices.dtype, np.int32) - self.assertEqual(indices[0], 0) - - def test_fps_respects_min_frames(self): - indices = sample_frames( - frame_factor=4, - min_frames=24, - max_frames=64, - metadata=self.metadata, - fps=1, # very small fps - ) - - self.assertEqual(len(indices), 24) - - def test_num_frames_exceeds_total_raises(self): - with self.assertRaises(ValueError): - sample_frames( - frame_factor=4, - min_frames=8, - max_frames=200, - metadata=self.metadata, - num_frames=200, - ) - - def test_force_multiple_of_4_hack(self): - indices = sample_frames( - frame_factor=2, - min_frames=2, - max_frames=100, - metadata=self.metadata, - num_frames=10, # 10 % 4 != 0 → hack → 8 - ) - - self.assertEqual(len(indices), 8) - self.assertEqual(len(indices) % 4, 0) - - def test_keep_all_frames_when_num_frames_zero(self): - indices = sample_frames( - frame_factor=4, - min_frames=0, - max_frames=100, - metadata=self.metadata, - num_frames=0, - ) - - self.assertEqual(len(indices), self.metadata["num_of_frame"]) - np.testing.assert_array_equal(indices, np.arange(0, 100, dtype=np.int32)) - - def test_indices_evenly_spaced(self): - indices = sample_frames( - frame_factor=4, - min_frames=8, - max_frames=32, - metadata=self.metadata, - num_frames=16, - ) - - diffs = np.diff(indices) - self.assertTrue(np.all(diffs > 0)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/v1/test_qwen_vl_processor.py b/tests/input/v1/test_qwen_vl_processor.py deleted file mode 100644 index 5de7df1da39..00000000000 --- a/tests/input/v1/test_qwen_vl_processor.py +++ /dev/null @@ -1,776 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import unittest -from unittest.mock import MagicMock, patch - -import numpy as np -from PIL import Image - -from fastdeploy.engine.request import Request -from fastdeploy.input.v1.qwen_vl_processor import QwenVLProcessor -from fastdeploy.input.v1.qwen_vl_processor.process_video import sample_frames - - -def mock_pil_image(height, width): - """ - Generate mock random RGB image - - Args: - height: Image height in pixels - width: Image width in pixels - - Returns: - PIL.Image object with random RGB data - """ - rgb_image = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) - return Image.fromarray(rgb_image) - - -def mock_read_frames(height: int, width: int, nums_frame: int, fps: int): - """ - Generate mock video frames with metadata for testing purposes - - Creates synthetic video data by generating random RGB frames and constructing - corresponding metadata to simulate real video processing. - - Args: - height (int): Height of video frames in pixels - width (int): Width of video frames in pixels - nums_frame (int): Number of frames to generate - fps (int): Frames per second for the mock video - - Returns: - tuple: A tuple containing: - frames (numpy.ndarray): Array of shape (nums_frame, height, width, 3) - containing randomly generated RGB frames - meta (dict): Dictionary with video metadata: - - fps (int): Frames per second (same as input) - - duration (float): Calculated duration in seconds (nums_frame/fps) - - num_of_frame (int): Number of frames (same as nums_frame input) - """ - frames = [] - for _ in range(nums_frame): - frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) - frames.append(frame) - frames = np.stack(frames, axis=0) - - meta = { - "fps": fps, - "duration": nums_frame / fps, - "num_of_frame": nums_frame, - } - return frames, meta - - -class TestQwenVLProcessor(unittest.TestCase): - """ - Unit tests for Qwen Vision-Language Processor functionality - """ - - def setUp(self): - """ - Initialize test case with: - - Mock configuration - - Patched message parsing and video processing methods - - QwenVLProcessor instance with test parameters - """ - config = MagicMock() - config.vision_config.tokens_per_second = 2 - - self.patcher_parse_image = patch( - "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_image", return_value=mock_pil_image(480, 640) - ) - self.patcher_parse_image.start() - - self.patcher_parse_video = patch( - "fastdeploy.entrypoints.chat_utils.MultimodalPartParser.parse_video", return_value=b"123" - ) - self.patcher_parse_video.start() - - self.patcher_read_frames = patch( - "fastdeploy.input.v1.qwen_vl_processor.process.DataProcessor._load_and_process_video", - return_value=mock_read_frames(480, 640, 5, 2), - ) - self.patcher_read_frames.start() - - mm_processor_kwargs = { - "video_max_frames": 10, - "video_min_frames": 1, - } - limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1} - - self.model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct" - self.processor = QwenVLProcessor( - config=config, - model_name_or_path=self.model_name_or_path, - limit_mm_per_prompt=limit_mm_per_prompt, - mm_processor_kwargs=mm_processor_kwargs, - reasoning_parser_obj=None, - tool_parser_obj=None, - ) - - def tearDown(self) -> None: - """Clean up test case by stopping all mock patches""" - self.patcher_read_frames.stop() - self.patcher_parse_image.stop() - self.patcher_parse_video.stop() - - def test_process_request(self): - """ - Test processing of Request object with multimodal input - - Validates: - 1. Token ID lengths match position_ids and token_type_ids shapes - 2. Image processing produces expected output dimensions - 3. Video processing produces expected output dimensions - 4. Correct counts for images (1) and videos (1) - """ - message = { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - - request = Request.from_dict(message) - result = self.processor.process_request_dict(request, 1024 * 100) - - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0]) - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0]) - self.assertEqual( - result.multimodal_inputs["images"].shape[0], - sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])), - ) - self.assertEqual( - result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum() - ) - - def test_process_request_dict(self): - """ - Test processing of dictionary-format request with multimodal input - - Validates: - 1. Token ID lengths match position_ids and token_type_ids shapes - 2. Image processing produces expected output dimensions - 3. Video processing produces expected output dimensions - 4. Correct counts for images (1) and videos (1) - """ - num_completion_token_ids = 10 - request = { - "request_id": "12345", - "completion_token_ids": [1] * num_completion_token_ids, - "stop": ["stop", "eof"], - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - request = Request.from_dict(request) - - result = self.processor.process_request_dict(request, 1024 * 100) - - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0]) - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0]) - self.assertEqual( - result.multimodal_inputs["images"].shape[0], - sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])), - ) - self.assertEqual( - result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum() - ) - - def test_process_request_dict_enable_thinking(self): - num_completion_token_ids = 10 - request = { - "request_id": "12345", - "completion_token_ids": [1] * num_completion_token_ids, - "stop": ["stop", "eof"], - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": "Hello"}, - ], - } - ], - "chat_template_kwargs": {"enable_thinking": True}, - } - request = Request.from_dict(request) - - result = self.processor.process_request_dict(request, 100) - self.assertEqual(result.enable_thinking, False) - - def test_prompt(self): - """ - Test processing of prompt with image and video placeholders - - Validates: - 1. Token ID lengths match position_ids and token_type_ids shapes - 2. Image processing produces expected output dimensions - 3. Video processing produces expected output dimensions - 4. Correct counts for images (1) and videos (1) - """ - IMAGE_PLACEHOLDER = "<|image_pad|>" - VIDEO_PLACEHOLDER = "<|video_pad|>" - prompt = { - "request_id": "12345", - "prompt": f"{IMAGE_PLACEHOLDER}{VIDEO_PLACEHOLDER}Describe image and video.", - "multimodal_data": { - "image": [mock_pil_image(10, 2100)], - "video": [{"video": b"123", "fps": 5}], - }, - } - - request = Request.from_dict(prompt) - result = self.processor.process_request_dict(request, 1024 * 100) - - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0]) - self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0]) - self.assertEqual( - result.multimodal_inputs["images"].shape[0], - sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])), - ) - self.assertEqual( - result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum() - ) - - def test_message_and_prompt(self): - """ - Test consistency between message-based and prompt-based processing - - Validates that processing a request through: - 1. The message format (with image/video URLs) - 2. The prompt format (with direct image/video data) - produces identical tokenization and multimodal input results. - - Checks: - 1. Prompt token IDs match between both processing methods - 2. Grid dimensions (THW) match between both methods - 3. Position IDs match between both methods - """ - # Create test request in message format - request = { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024 * 100) - - # Create equivalent request in prompt format - prompt = { - "request_id": "12345", - "prompt": request.prompt_tokens, - "multimodal_data": { - "image": [mock_pil_image(480, 640)], - "video": [{"video": b"123"}], - }, - } - request2 = Request.from_dict(prompt) - result2 = self.processor.process_request_dict(request2, 1024 * 100) - - # Verify both processing methods produce identical results - self.assertEqual(result.prompt_token_ids, result2.prompt_token_ids) - self.assertTrue(np.equal(result.multimodal_inputs["grid_thw"], result2.multimodal_inputs["grid_thw"]).all()) - self.assertTrue( - np.equal(result.multimodal_inputs["position_ids"], result2.multimodal_inputs["position_ids"]).all() - ) - - def test_apply_chat_template(self): - """ - Test the consistency between: - 1. Directly applying chat template using HuggingFace tokenizer - 2. Applying chat template through the processor's request processing - - This test verifies that: - - The processor correctly handles multimodal messages (image, video, text) - - The prompt_tokens field matches the output from direct tokenizer application - - The chat template application preserves the message structure and content - - Test Steps: - 1. Create sample multimodal messages with image, video and text content - 2. Apply chat template directly using the tokenizer - 3. Process the same messages through the processor - 4. Compare the outputs to ensure consistency - """ - from transformers import AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path) - - # Sample multimodal messages containing image, video and text - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video", "video": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ] - - # Apply chat template directly using the tokenizer - prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - - # Create equivalent request dictionary - request = { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}}, - {"type": "text", "text": "Describe image and video."}, - ], - } - ], - } - request = Request.from_dict(request) - - # Process request through the processor - self.processor.process_request_dict(request, 1024 * 100) - prompt2 = request.prompt_tokens - - # Verify both methods produce identical prompt strings - self.assertEqual(prompt, prompt2) - - def test_think_status(self): - """测试 思考机制""" - request = { - "prompt": "hello", - "request_id": "test_1", - "prompt_token_ids": [1, 2, 3], - "temperature": 0.7, - "top_p": 0.9, - } - request = Request.from_dict(request) - self.processor.reasoning_parser = MagicMock() - self.processor.reasoning_parser.get_model_status.return_value = "think_start" - self.processor.model_status_dict = {} - self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(request.enable_thinking, True) - - request = { - "prompt": "hello", - "request_id": "test", - "prompt_token_ids": [1, 2, 3], - "temperature": 0.7, - "top_p": 0.9, - } - request = Request.from_dict(request) - self.processor.process_request_dict(request, max_model_len=512) - self.assertEqual(request.enable_thinking, True) - - def test_parse_processor_kwargs_valid(self): - """Test _parse_processor_kwargs with valid input""" - valid_kwargs = {"video_max_frames": 10, "video_min_frames": 1} - result = self.processor._parse_processor_kwargs(valid_kwargs) - self.assertEqual(result, valid_kwargs) - - def test_parse_processor_kwargs_empty(self): - """Test _parse_processor_kwargs with empty input""" - result = self.processor._parse_processor_kwargs(None) - self.assertEqual(result, {}) - - def test_parse_processor_kwargs_invalid_type(self): - """Test _parse_processor_kwargs with invalid type""" - result = self.processor._parse_processor_kwargs("invalid") - self.assertEqual(result, {}) - - def test_parse_processor_kwargs_invalid_value_type(self): - """Test _parse_processor_kwargs with invalid value type""" - invalid_kwargs = {"video_max_frames": "10"} # Should be int - result = self.processor._parse_processor_kwargs(invalid_kwargs) - self.assertEqual(result, {}) - - def test_parse_processor_kwargs_mixed_valid_invalid(self): - """Test _parse_processor_kwargs with mixed valid and invalid types""" - mixed_kwargs = {"video_max_frames": 10, "video_min_frames": "invalid"} - result = self.processor._parse_processor_kwargs(mixed_kwargs) - self.assertEqual(result, {}) - - def test_parse_limits_valid(self): - """Test _parse_limits with valid limits""" - limits = {"image": 2, "video": 3} - result = self.processor._parse_limits(limits) - expected = {"image": 2, "video": 3, "audio": 1} - self.assertEqual(result, expected) - - def test_parse_limits_empty(self): - """Test _parse_limits with empty input""" - result = self.processor._parse_limits(None) - expected = {"image": 1, "video": 1, "audio": 1} - self.assertEqual(result, expected) - - def test_parse_limits_invalid_type(self): - """Test _parse_limits with invalid type""" - result = self.processor._parse_limits("invalid") - expected = {"image": 1, "video": 1, "audio": 1} - self.assertEqual(result, expected) - - def test_parse_limits_partial(self): - """Test _parse_limits with partial limits""" - limits = {"image": 5} - result = self.processor._parse_limits(limits) - expected = {"image": 5, "video": 1, "audio": 1} - self.assertEqual(result, expected) - - def test_check_mm_limits_dict_valid(self): - """Test _check_mm_limits with valid dict input""" - mm_data = {"image": [mock_pil_image(10, 10)], "video": [{"video": b"123"}]} - # Should not raise exception - self.processor._check_mm_limits(mm_data) - - def test_check_mm_limits_dict_exceed_limit(self): - """Test _check_mm_limits when dict input exceeds limit""" - mm_data = {"image": [mock_pil_image(10, 10), mock_pil_image(10, 10)]} - with self.assertRaises(ValueError) as context: - self.processor._check_mm_limits(mm_data) - self.assertIn("Too many image items", str(context.exception)) - - def test_check_mm_limits_messages_valid(self): - """Test _check_mm_limits with valid messages input""" - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}}, - {"type": "text", "text": "Describe this image."}, - ], - } - ] - # Should not raise exception - self.processor._check_mm_limits(messages) - - def test_check_mm_limits_messages_exceed_limit(self): - """Test _check_mm_limits when messages input exceeds limit""" - messages = [ - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": "file://demo1.jpeg"}}, - {"type": "image_url", "image_url": {"url": "file://demo2.jpeg"}}, - ], - } - ] - with self.assertRaises(ValueError) as context: - self.processor._check_mm_limits(messages) - self.assertIn("Too many image items", str(context.exception)) - - def test_check_mm_limits_video_exceed(self): - """Test _check_mm_limits when video exceeds limit""" - mm_data = {"video": [{"video": b"123"}, {"video": b"456"}]} - with self.assertRaises(ValueError) as context: - self.processor._check_mm_limits(mm_data) - self.assertIn("Too many video items", str(context.exception)) - - def test_process_request_dict_with_prompt(self): - """Test process_request_dict with prompt format""" - request = { - "request_id": "12345", - "prompt": "Test prompt", - "multimodal_data": {"image": [mock_pil_image(10, 10)]}, - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - self.assertGreater(len(result.prompt_token_ids), 0) - self.assertGreater(len(result.multimodal_inputs), 0) - - def test_process_request_dict_with_messages(self): - """Test process_request_dict with messages format""" - request = { - "request_id": "12345", - "messages": [ - { - "role": "user", - "content": [{"type": "text", "text": "Hello"}], - } - ], - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - self.assertGreater(len(result.prompt_token_ids), 0) - self.assertGreater(len(result.multimodal_inputs), 0) - - def test_process_request_dict_invalid_format(self): - """Test process_request_dict with invalid format""" - request = {"request_id": "12345"} - request = Request.from_dict(request) - with self.assertRaises(ValueError) as context: - self.processor.process_request_dict(request, 1024) - self.assertIn("must contain 'prompt', or 'messages'", str(context.exception)) - - def test_process_request_dict_with_bad_words(self): - """Test process_request_dict with bad_words""" - request = { - "request_id": "12345", - "prompt": "Test prompt", - "bad_words": ["bad", "word"], - "bad_words_token_ids": [100, 200], - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - # Verify bad_words_token_ids is set - self.assertIsNotNone(result.sampling_params.bad_words_token_ids) - - def test_process_request_dict_invalid_chat_template_kwargs(self): - """Test process_request_dict with invalid chat_template_kwargs""" - request = { - "request_id": "12345", - "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}], - } - request = Request.from_dict(request) - request.chat_template_kwargs = "invalid" - with self.assertRaises(ValueError) as context: - self.processor.process_request_dict(request, 1024) - self.assertIn("must be a dict", str(context.exception)) - - def test_process_request_dict_with_completion_token_ids(self): - """Test process_request_dict with completion_token_ids""" - request = {"request_id": "12345", "prompt": "Test"} - request = Request.from_dict(request) - request.completion_token_ids = [1, 2, 3] - result = self.processor.process_request_dict(request, 1024) - # Verify completion tokens are appended - self.assertGreater(len(result.prompt_token_ids), 3) - - def test_process_request_dict_prompt_truncation(self): - """Test process_request_dict with prompt truncation""" - # Create a long prompt that exceeds max_model_len - long_prompt = "Test " * 1000 - request = { - "request_id": "12345", - "prompt": long_prompt, - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 100) - # Verify prompt is truncated - self.assertLessEqual(len(result.prompt_token_ids), 99) - - def test_process_request_dict_default_max_tokens(self): - """Test process_request_dict sets default max_tokens""" - request = { - "request_id": "12345", - "prompt": "Test", - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - self.assertGreater(result.sampling_params.max_tokens, 0) - - def test_process_request_dict_enable_thinking_false(self): - """Test process_request_dict sets enable_thinking to False""" - request = { - "request_id": "12345", - "prompt": "Test", - "enable_thinking": True, - } - request = Request.from_dict(request) - result = self.processor.process_request_dict(request, 1024) - self.assertFalse(result.enable_thinking) - - def test_append_completion_tokens(self): - """Test append_completion_tokens method""" - multimodal_inputs = { - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], - "cur_position": 3, - } - completion_token_ids = [4, 5] - self.processor.append_completion_tokens(multimodal_inputs, completion_token_ids) - - self.assertEqual(multimodal_inputs["input_ids"], [1, 2, 3, 4, 5]) - self.assertEqual(multimodal_inputs["token_type_ids"], [0, 0, 0, 0, 0]) - self.assertEqual(multimodal_inputs["cur_position"], 5) - - def test_pack_outputs_with_images(self): - """Test pack_outputs with image data""" - outputs = { - "images": [np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])], - "grid_thw": [np.array([2, 2, 1]), np.array([2, 2, 1])], - "image_type_ids": [0, 1], - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], - } - result = self.processor.pack_outputs(outputs) - - self.assertIsNotNone(result["images"]) - self.assertIsNotNone(result["grid_thw"]) - self.assertIsNotNone(result["image_type_ids"]) - self.assertEqual(result["images"].shape[0], 4) - self.assertEqual(result["grid_thw"].shape[0], 2) - - def test_pack_outputs_without_images(self): - """Test pack_outputs without image data""" - outputs = { - "images": [], - "grid_thw": [], - "image_type_ids": [], - "input_ids": [1, 2, 3], - "token_type_ids": [0, 0, 0], - "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])], - } - result = self.processor.pack_outputs(outputs) - - # Test that image-related fields are None when no images - self.assertIsNone(result["images"]) - self.assertIsNone(result["grid_thw"]) - self.assertIsNone(result["image_type_ids"]) - - # Test data types - self.assertEqual(result["input_ids"].dtype, np.int64) - self.assertEqual(result["token_type_ids"].dtype, np.int64) - self.assertEqual(result["position_ids"].dtype, np.int64) - - # Test patch IDs are set - self.assertIn("image_patch_id", result) - self.assertIn("video_patch_id", result) - self.assertIn("mm_num_token_func", result) - - -class TestSampleFrames(unittest.TestCase): - """ - Unit tests for sample_frames function - """ - - def setUp(self): - self.metadata = { - "num_of_frame": 100, - "fps": 25, - } - - def test_fps_and_num_frames_mutually_exclusive(self): - with self.assertRaises(ValueError): - sample_frames( - frame_factor=4, - min_frames=8, - max_frames=32, - metadata=self.metadata, - fps=2, - num_frames=16, - ) - - def test_num_frames_round_to_factor(self): - indices = sample_frames( - frame_factor=4, - min_frames=8, - max_frames=64, - metadata=self.metadata, - num_frames=18, # round(18 / 4) * 4 = 16 - ) - - self.assertEqual(len(indices), 16) - self.assertEqual(indices[0], 0) - self.assertLess(indices[-1], self.metadata["num_of_frame"]) - - def test_fps_sampling_basic(self): - # total = 100 frames, fps=25, target fps=5 → 20 frames - indices = sample_frames( - frame_factor=4, - min_frames=8, - max_frames=64, - metadata=self.metadata, - fps=5, - ) - - self.assertEqual(len(indices), 20) - self.assertEqual(indices.dtype, np.int32) - self.assertEqual(indices[0], 0) - - def test_fps_respects_min_frames(self): - indices = sample_frames( - frame_factor=4, - min_frames=24, - max_frames=64, - metadata=self.metadata, - fps=1, # very small fps - ) - - self.assertEqual(len(indices), 24) - - def test_num_frames_exceeds_total_raises(self): - with self.assertRaises(ValueError): - sample_frames( - frame_factor=4, - min_frames=8, - max_frames=200, - metadata=self.metadata, - num_frames=200, - ) - - def test_force_multiple_of_4_hack(self): - indices = sample_frames( - frame_factor=2, - min_frames=2, - max_frames=100, - metadata=self.metadata, - num_frames=10, # 10 % 4 != 0 → hack → 8 - ) - - self.assertEqual(len(indices), 8) - self.assertEqual(len(indices) % 4, 0) - - def test_keep_all_frames_when_num_frames_zero(self): - indices = sample_frames( - frame_factor=4, - min_frames=0, - max_frames=100, - metadata=self.metadata, - num_frames=0, - ) - - self.assertEqual(len(indices), self.metadata["num_of_frame"]) - np.testing.assert_array_equal(indices, np.arange(0, 100, dtype=np.int32)) - - def test_indices_evenly_spaced(self): - indices = sample_frames( - frame_factor=4, - min_frames=8, - max_frames=32, - metadata=self.metadata, - num_frames=16, - ) - - diffs = np.diff(indices) - self.assertTrue(np.all(diffs > 0)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/v1/test_text_processor.py b/tests/input/v1/test_text_processor.py deleted file mode 100644 index 147d843e85d..00000000000 --- a/tests/input/v1/test_text_processor.py +++ /dev/null @@ -1,586 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import importlib -import importlib.util -import sys -import types -import unittest -from pathlib import Path -from types import SimpleNamespace -from unittest import mock - -import numpy as np - -from fastdeploy.engine.request import Request, RequestOutput -from fastdeploy.engine.sampling_params import SamplingParams - - -class DummyTokenizer: - bos_token = "" - cls_token = "" - sep_token = "" - eos_token = "" - mask_token = "" - chat_template = "dummy" - - def __init__(self): - self.pad_token_id = 1 - self.eos_token_id = 2 - self.eos_token = 2 - self.vocab_size = 256 - self.bos_token_id = self._convert_token_to_id(self.bos_token) - self.cls_token_id = self._convert_token_to_id(self.cls_token) - self.sep_token_id = self._convert_token_to_id(self.sep_token) - self.mask_token_id = self._convert_token_to_id(self.mask_token) - - def _convert_token_to_id(self, token): - return len(str(token)) - - def __call__(self, text, **kwargs): - if isinstance(text, list): - values = [self._value(item) for item in text] - else: - values = [self._value(text)] - max_length = kwargs.get("max_length") - if max_length is not None: - values = values[:max_length] - return {"input_ids": np.array([values], dtype=np.int64)} - - def _value(self, item): - if isinstance(item, str): - return len(item) - return int(item) - - def tokenize(self, text): - if isinstance(text, str): - return [text] - return [str(text)] - - def convert_tokens_to_ids(self, tokens): - return [self._value(token) for token in tokens] - - def decode(self, token_ids, **kwargs): - return " ".join(str(t) for t in token_ids) - - def decode_token(self, token_ids, prefix_offset, read_offset): - start = read_offset - delta_tokens = token_ids[start:] - delta = "".join(str(t) for t in delta_tokens) - prefix_offset += len(token_ids) - read_offset += len(delta_tokens) - return delta, prefix_offset, read_offset - - def batch_decode(self, batch, **kwargs): - return [self.decode(seq) for seq in batch] - - def apply_chat_template(self, request, **kwargs): - if isinstance(request, dict): - system = request.get("system") - messages = request.get("messages", []) - else: - system = getattr(request, "system", None) - messages = getattr(request, "messages", []) - parts = [system] if system else [] - parts.extend(msg.get("content", "") for msg in messages) - return " ".join(part for part in parts if part) - - -class DummyLlamaTokenizer(DummyTokenizer): - pass - - -class DummyAutoTokenizer: - @classmethod - def from_pretrained(cls, *args, **kwargs): - return DummyTokenizer() - - -class DummyHFTokenizer: - @classmethod - def from_pretrained(cls, *args, **kwargs): - return DummyTokenizer() - - -def _create_dummy_modules(): - """Create all dummy modules needed for testing fastdeploy.input.text_processor.""" - repo_root = Path(__file__).resolve().parents[2] - - dummy_logger = SimpleNamespace( - info=lambda *args, **kwargs: None, - warning=lambda *args, **kwargs: None, - debug=lambda *args, **kwargs: None, - ) - - utils_module = types.ModuleType("fastdeploy.utils") - utils_module.data_processor_logger = dummy_logger - - envs_module = types.ModuleType("fastdeploy.envs") - envs_module.FD_USE_HF_TOKENIZER = False - - generation_module = types.ModuleType("paddleformers.generation") - - class DummyGenerationConfig: - def __init__(self): - self.top_p = 0.8 - self.temperature = 0.9 - self.repetition_penalty = 1.1 - self.frequency_penalty = 0.2 - self.presence_penalty = 0.1 - - @classmethod - def from_pretrained(cls, *args, **kwargs): - return cls() - - generation_module.GenerationConfig = DummyGenerationConfig - - transformers_module = types.ModuleType("paddleformers.transformers") - transformers_module.AutoTokenizer = DummyAutoTokenizer - transformers_module.LlamaTokenizer = DummyLlamaTokenizer - transformers_module.Llama3Tokenizer = DummyLlamaTokenizer - - hf_transformers_module = types.ModuleType("transformers") - hf_transformers_module.AutoTokenizer = DummyHFTokenizer - - llm_utils_module = types.ModuleType("paddleformers.cli.utils.llm_utils") - llm_utils_module.get_eos_token_id = lambda tokenizer, config: [tokenizer.eos_token_id] - - fastdeploy_module = types.ModuleType("fastdeploy") - fastdeploy_module.__path__ = [str(repo_root / "fastdeploy")] - fastdeploy_module.utils = utils_module - fastdeploy_module.envs = envs_module - - return { - "fastdeploy": fastdeploy_module, - "fastdeploy.utils": utils_module, - "fastdeploy.envs": envs_module, - "paddleformers.generation": generation_module, - "paddleformers.transformers": transformers_module, - "transformers": hf_transformers_module, - "paddleformers.cli.utils.llm_utils": llm_utils_module, - } - - -def _import_text_processor(use_hf_tokenizer=False): - modules = _create_dummy_modules() - - modules["fastdeploy.envs"].FD_USE_HF_TOKENIZER = use_hf_tokenizer - - previous_modules = {} - for name, module in modules.items(): - previous_modules[name] = sys.modules.get(name) - sys.modules[name] = module - - try: - text_processor_module = importlib.import_module("fastdeploy.input.v1.text_processor") - importlib.reload(text_processor_module) - except Exception: - for name, original in previous_modules.items(): - if original is None: - sys.modules.pop(name, None) - else: - sys.modules[name] = original - raise - - def cleanup(): - sys.modules.pop("fastdeploy.input.text_processor", None) - for name, original in previous_modules.items(): - if original is None: - sys.modules.pop(name, None) - else: - sys.modules[name] = original - - return text_processor_module, cleanup - - -class DummyRequest: - def __init__(self, **kwargs): - self.request_id = kwargs.get("request_id", "req") - self.prompt = kwargs.get("prompt") - self.prompt_token_ids = kwargs.get("prompt_token_ids") - self.messages = kwargs.get("messages") - self.eos_token_ids = kwargs.get("eos_token_ids") - self.chat_template = kwargs.get("chat_template") - self.enable_thinking = kwargs.get("enable_thinking") - self.history = kwargs.get("history") - self.tools = kwargs.get("tools") - self.system = kwargs.get("system") - self.sampling_params = SimpleNamespace( - top_p=kwargs.get("top_p"), - temperature=kwargs.get("temperature"), - repetition_penalty=kwargs.get("repetition_penalty"), - frequency_penalty=kwargs.get("frequency_penalty"), - presence_penalty=kwargs.get("presence_penalty"), - stop=kwargs.get("stop"), - stop_token_ids=kwargs.get("stop_token_ids"), - stop_seqs_len=kwargs.get("stop_seqs_len"), - bad_words=kwargs.get("bad_words"), - bad_words_token_ids=kwargs.get("bad_words_token_ids"), - max_tokens=kwargs.get("max_tokens"), - ) - - def get(self, key, default=None): - if hasattr(self, key) and getattr(self, key) is not None: - return getattr(self, key) - return getattr(self.sampling_params, key, default) - - def set(self, key, value): - if hasattr(self.sampling_params, key): - setattr(self.sampling_params, key, value) - else: - setattr(self, key, value) - - def to_dict(self): - return { - "request_id": self.request_id, - "messages": self.messages, - "prompt": self.prompt, - "system": self.system, - "history": self.history, - "tools": self.tools, - "chat_template": self.chat_template, - "enable_thinking": self.enable_thinking, - } - - def __getitem__(self, key): - return self.get(key) - - def __setitem__(self, key, value): - self.set(key, value) - - -class DataProcessorTestCase(unittest.TestCase): - @staticmethod - def create_dummy_reasoning(tokenizer, reasoning_content="think"): - class DummyReasoning: - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - def extract_reasoning_content(self, full_text, response_dict, model_status): - return reasoning_content, f"{full_text}!" - - return DummyReasoning(tokenizer) - - @staticmethod - def create_dummy_tool_parser(tokenizer, content="tool-text"): - class DummyToolParser: - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - def extract_tool_calls(self, full_text, response_dict): - return SimpleNamespace(tools_called=True, tool_calls=["tool"], content=content) - - return DummyToolParser - - def setUp(self): - module, cleanup = _import_text_processor() - self.text_processor_module = module - self.addCleanup(cleanup) - self.processor = self.text_processor_module.DataProcessor("stub-model") - - def test_base_data_processor_contract(self): - text_processor_module = self.text_processor_module - - class MinimalProcessor(text_processor_module.BaseDataProcessor): - def __init__(self): - self.generation_config = SimpleNamespace( - top_p=0.5, - temperature=0.6, - repetition_penalty=1.1, - frequency_penalty=0.2, - presence_penalty=0.3, - ) - super().__init__() - - def _load_tokenizer(self): - return DummyTokenizer() - - def process_request_dict(self, request, **kwargs): - return super().process_request_dict(request, **kwargs) - - def process_response_dict(self, response_obj): - return super().process_response_dict(response_obj) - - processor = MinimalProcessor() - request = Request(request_id="test_0", sampling_params=SamplingParams()) - defaults = processor._apply_default_parameters(request) - self.assertAlmostEqual(defaults.sampling_params.top_p, 0.5) - with self.assertRaises(NotImplementedError): - processor.process_request_dict({}, max_model_len=None) - with self.assertRaises(NotImplementedError): - processor.process_response_dict({}) - with self.assertRaises(NotImplementedError): - processor.text2ids("text") - with self.assertRaises(NotImplementedError): - processor.messages2ids([]) - with self.assertRaises(NotImplementedError): - processor.ids2tokens([1], "task") - - def test_process_request_dict_prompt_defaults(self): - request = {"request_id": "test_0", "prompt": "hi", "temperature": 0, "top_p": 0, "stop": ["stop"]} - request = Request.from_dict(request) - processed = self.processor.process_request_dict(request, max_model_len=5) - - self.assertEqual(processed.prompt_token_ids, [2]) - self.assertEqual(processed.sampling_params.stop_token_ids, [[4]]) - self.assertEqual(processed.sampling_params.stop_seqs_len, [1]) - self.assertEqual(processed.sampling_params.temperature, 1) - self.assertAlmostEqual(processed.sampling_params.top_p, 1e-5) - self.assertEqual(processed.sampling_params.max_tokens, 4) - - def test_process_request_dict_messages_template(self): - request = { - "request_id": "chat", - "messages": [{"role": "user", "content": "hello"}], - "chat_template_kwargs": {"system": "system prompt"}, - } - request = Request.from_dict(request) - request.chat_template_kwargs = {"system": "system prompt"} - processed = self.processor.process_request_dict(request, max_model_len=6) - - self.assertEqual(processed.prompt_token_ids, [len("system prompt hello")]) - self.assertEqual(processed.system, "system prompt") - self.assertTrue(processed.enable_thinking) - self.assertEqual(processed.prompt_tokens, "system prompt hello") - - def test_process_request_dictect_handles_sequences(self): - request = DummyRequest( - prompt=[1, 2, 3, 4, 5, 6], - stop=["stop"], - bad_words=["zz"], - temperature=0, - top_p=0, - ) - processed = self.processor.process_request_dict(request, max_model_len=5) - - self.assertEqual(processed.prompt_token_ids, [1, 2, 3, 4]) - self.assertEqual(processed.sampling_params.max_tokens, 1) - self.assertEqual(processed.sampling_params.stop_token_ids, [[4]]) - self.assertEqual(set(processed.sampling_params.bad_words_token_ids), {2, 3}) - self.assertEqual(processed.sampling_params.temperature, 1) - self.assertAlmostEqual(processed.sampling_params.top_p, 1e-5) - - def test_process_request_requires_prompt_or_messages(self): - request = DummyRequest(prompt=None, messages=None, prompt_token_ids=None) - with self.assertRaisesRegex(ValueError, "Request must contain 'prompt_token_ids', 'prompt', or 'messages'"): - self.processor.process_request_dict(request, max_model_len=5) - - def test_process_request_dict_rejects_bad_kwargs(self): - request = { - "request_id": "test_0", - "messages": [{"role": "user", "content": "hi"}], - "chat_template_kwargs": "invalid", - } - request = Request.from_dict(request) - request.chat_template_kwargs = "invalid" - request.sampling_params = SamplingParams() - with self.assertRaisesRegex(ValueError, "chat_template_kwargs must be a dict"): - self.processor.process_request_dict(request) - - def test_ids2tokens_and_clear_request_status(self): - delta, _, _ = self.processor.ids2tokens([3], "task-1") - self.assertEqual(delta, "3") - delta, _, _ = self.processor.ids2tokens([4], "task-1") - self.assertEqual(delta, "4") - - combined = self.processor.clear_request_status("task-1") - self.assertEqual(combined, "34") - self.assertNotIn("task-1", self.processor.decode_status) - - def test_clear_request_status_hf_branch(self): - module, cleanup = _import_text_processor(use_hf_tokenizer=True) - self.addCleanup(cleanup) - processor = module.DataProcessor("stub-model") - processor.decode_status = {"task": [[], [], "transcript"]} - - self.assertEqual(processor.clear_request_status("task"), "transcript") - self.assertNotIn("task", processor.decode_status) - - def test_data_processor_init_handles_missing_generation_config(self): - with mock.patch.object( - self.text_processor_module.GenerationConfig, - "from_pretrained", - side_effect=OSError("missing"), - ): - processor = self.text_processor_module.DataProcessor("stub-model") - self.assertIsNone(processor.generation_config) - - def test_process_response_with_reasoning_and_tools(self): - processor = self.processor - processor.model_status_dict = {"resp": "normal"} - - processor.reasoning_parser = self.create_dummy_reasoning(processor.tokenizer) - processor.tool_parser_obj = self.create_dummy_tool_parser(processor.tokenizer, content="tool-only") - - response = SimpleNamespace( - request_id="resp", outputs=SimpleNamespace(token_ids=[1, processor.tokenizer.eos_token_id]), finished=True - ) - - processed = processor.process_response_obj_normal(response) - self.assertEqual(processed.outputs.text, "tool-only") - self.assertEqual(processed.outputs.reasoning_content, "think") - self.assertEqual(processed.outputs.tool_calls, ["tool"]) - - def test_process_response_streaming_clears_state(self): - processor = self.processor - req_id = "stream" - processor.decode_status[req_id] = [0, 0, [], ""] - response = {"finished": True, "request_id": req_id, "outputs": {"token_ids": [7]}} - response = RequestOutput.from_dict(response) - - result = processor.process_response_obj_streaming(response, enable_thinking=False) - self.assertEqual(result.outputs.text, "7") - self.assertNotIn(req_id, processor.decode_status) - - def test_process_response_obj_normal_with_reasoning(self): - processor = self.processor - processor.model_status_dict = {"normal": "normal"} - processor.reasoning_parser = self.create_dummy_reasoning(processor.tokenizer, reasoning_content="because") - processor.tool_parser_obj = self.create_dummy_tool_parser(processor.tokenizer, content="tool-text") - - response = { - "finished": True, - "request_id": "normal", - "outputs": {"token_ids": [7, processor.tokenizer.eos_token_id]}, - } - response = RequestOutput.from_dict(response) - - result = processor.process_response_obj_normal(response, enable_thinking=True) - self.assertEqual(result.outputs.completion_tokens, "7") - self.assertEqual(result.outputs.text, "tool-text") - self.assertEqual(result.outputs.reasoning_content, "because") - self.assertEqual(result.outputs.reasoning_token_num, 1) - - def test_process_response_dict_dispatch(self): - processor = self.processor - calls = {} - - def fake_stream(response_obj, **kwargs): - calls["stream"] = kwargs - return "stream" - - def fake_normal(response_obj, **kwargs): - calls["normal"] = kwargs - return "normal" - - original_stream = processor.process_response_obj_streaming - original_normal = processor.process_response_obj_normal - processor.process_response_obj_streaming = fake_stream - processor.process_response_obj_normal = fake_normal - self.addCleanup(lambda: setattr(processor, "process_response_obj_streaming", original_stream)) - self.addCleanup(lambda: setattr(processor, "process_response_obj_normal", original_normal)) - - response = {"outputs": {}, "finished": False, "request_id": "req"} - self.assertEqual(processor.process_response_dict(response, stream=True, enable_thinking=True), "stream") - self.assertTrue(calls["stream"]["enable_thinking"]) - self.assertEqual( - processor.process_response_dict(response, stream=False, enable_thinking=True), - "normal", - ) - self.assertTrue(calls["normal"]["enable_thinking"]) - - def test_update_stop_seq_excludes_eos(self): - stop_seqs, stop_len = self.processor.update_stop_seq(["stop", self.processor.tokenizer.eos_token_id]) - self.assertEqual(stop_seqs, [[4]]) - self.assertEqual(stop_len, [1]) - - def test_pad_batch_data_left_padding(self): - padded, lengths = self.processor.pad_batch_data( - [[1], [2, 3]], - pad_id=-1, - return_seq_len=True, - return_array=False, - pad_style="left", - ) - self.assertEqual(padded, [[-1, 1], [2, 3]]) - self.assertEqual(lengths, [1, 2]) - - def test_pad_batch_data_empty_returns_array(self): - padded, lengths = self.processor.pad_batch_data([], return_seq_len=True) - self.assertEqual(padded.shape, (1, 0)) - self.assertEqual(lengths.shape, (0,)) - - def test_get_pad_id_prefers_eos_when_missing(self): - processor = self.text_processor_module.DataProcessor("stub-model") - llama_tokenizer = DummyLlamaTokenizer() - llama_tokenizer.pad_token_id = None - llama_tokenizer.eos_token = 99 - processor.tokenizer = llama_tokenizer - - self.assertEqual(processor.get_pad_id(), 99) - - def test_load_tokenizer_hf_branch(self): - module, cleanup = _import_text_processor(use_hf_tokenizer=True) - self.addCleanup(cleanup) - processor = module.DataProcessor("stub-model") - self.assertIsInstance(processor.tokenizer, DummyTokenizer) - - def test_text2ids_hf_branch(self): - module, cleanup = _import_text_processor(use_hf_tokenizer=True) - self.addCleanup(cleanup) - processor = module.DataProcessor("stub-model") - ids = processor.text2ids("hi", max_model_len=5) - self.assertEqual(ids.tolist(), [2, 0, 0, 0, 0][: len(ids)]) - - def test_process_logprob_response(self): - self.assertEqual(self.processor.process_logprob_response([1, 2]), "1 2") - - def test_process_request_dict_uses_existing_ids(self): - request = {"request_id": "test_0", "prompt_token_ids": [1, 2, 3], "max_tokens": 5} - request = Request.from_dict(request) - processed = self.processor.process_request_dict(request, max_model_len=6) - self.assertEqual(processed.prompt_token_ids, [1, 2, 3]) - self.assertEqual(processed.sampling_params.max_tokens, 3) - - def test_process_request_dict_requires_chat_template(self): - original_template = self.processor.tokenizer.chat_template - self.processor.tokenizer.chat_template = None - self.addCleanup(lambda: setattr(self.processor.tokenizer, "chat_template", original_template)) - with self.assertRaisesRegex(ValueError, "chat_template"): - request = {"request_id": "test_0", "messages": [{"role": "user", "content": "hi"}]} - request = Request.from_dict(request) - self.processor.process_request_dict(request) - - def test_update_bad_words_with_warnings(self): - processor = self.processor - - def custom_tokenize(text): - base = text.strip() - if base == "combo": - return ["co", "mbo"] - if base == "oversize": - return [base] - return [base] - - def custom_convert(tokens): - if tokens == ["co", "mbo"]: - return [1, 2] - if tokens == ["oversize"]: - return [processor.tokenizer.vocab_size + 1] - return [len(tokens[0])] - - original_tokenize = processor.tokenizer.tokenize - original_convert = processor.tokenizer.convert_tokens_to_ids - processor.tokenizer.tokenize = custom_tokenize - processor.tokenizer.convert_tokens_to_ids = custom_convert - self.addCleanup(lambda: setattr(processor.tokenizer, "tokenize", original_tokenize)) - self.addCleanup(lambda: setattr(processor.tokenizer, "convert_tokens_to_ids", original_convert)) - - self.assertEqual(processor.update_bad_words(["combo", "oversize"], []), []) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/input/v1/test_tokenizer_client.py b/tests/input/v1/test_tokenizer_client.py deleted file mode 100644 index 06804ebade3..00000000000 --- a/tests/input/v1/test_tokenizer_client.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import httpx -import pytest -import respx - -from fastdeploy.input.tokenzier_client import ( - AsyncTokenizerClient, - ImageEncodeRequest, - VideoEncodeRequest, -) - - -@pytest.mark.asyncio -@respx.mock -async def test_encode_image_success(): - base_url = "http://testserver" - client = AsyncTokenizerClient(base_url=base_url) - - # Mock 创建任务接口 - respx.post(f"{base_url}/image/encode").mock( - return_value=httpx.Response(200, json={"code": 0, "task_tag": "task123"}) - ) - # Mock 轮询接口,返回完成状态 - mock_get_ret = { - "state": "Finished", - "result": {"feature_url": "bos://host:port/key", "feature_shape": [80, 45, 1563]}, - } - respx.get(f"{base_url}/encode/get").mock(return_value=httpx.Response(200, json=mock_get_ret)) - - request = ImageEncodeRequest( - version="v1", req_id="req_img_001", is_gen=False, resolution=512, image_url="http://example.com/image.jpg" - ) - - result = await client.encode_image(request) - assert result["feature_url"] == "bos://host:port/key" - assert result["feature_shape"] == [80, 45, 1563] - - -@pytest.mark.asyncio -@respx.mock -async def test_encode_video_failure(): - base_url = "http://testserver" - client = AsyncTokenizerClient(base_url=base_url, max_wait=1) - - respx.post(f"{base_url}/video/encode").mock( - return_value=httpx.Response(200, json={"code": 0, "task_tag": "task_vid_001"}) - ) - # 模拟轮询接口失败状态 - respx.get(f"{base_url}/encode/get").mock( - return_value=httpx.Response(200, json={"state": "Error", "message": "Encode failed"}) - ) - - request = VideoEncodeRequest( - version="v1", - req_id="req_vid_001", - is_gen=True, - resolution=720, - video_url="http://example.com/video.mp4", - start_ts=0.0, - end_ts=10.0, - frames=30, - vit_merge=True, - ) - - with pytest.raises(RuntimeError, match="Encode failed"): - await client.encode_video(request) - - -@pytest.mark.asyncio -@respx.mock -async def test_encode_timeout(): - base_url = "http://testserver" - client = AsyncTokenizerClient(base_url=base_url, max_wait=1, poll_interval=0.1) - - respx.post(f"{base_url}/image/encode").mock( - return_value=httpx.Response(200, json={"code": 0, "task_tag": "task_timeout"}) - ) - # 模拟轮询接口一直返回等待状态,导致超时 - respx.get(f"{base_url}/encode/get").mock(return_value=httpx.Response(200, json={"status": "processing"})) - - request = ImageEncodeRequest( - version="v1", req_id="req_img_timeout", is_gen=False, resolution=256, image_url="http://example.com/image.jpg" - ) - - with pytest.raises(TimeoutError): - await client.encode_image(request) diff --git a/tests/inter_communicator/test_zmq_server.py b/tests/inter_communicator/test_zmq_server.py index 629551b1707..57c9a0c479a 100644 --- a/tests/inter_communicator/test_zmq_server.py +++ b/tests/inter_communicator/test_zmq_server.py @@ -9,7 +9,6 @@ import types import unittest from collections import defaultdict -from multiprocessing.reduction import ForkingPickler from unittest import mock import msgpack @@ -264,19 +263,6 @@ def send(self, msg, flags=0, **kwargs): with self.assertRaises(RuntimeError): server.send_pyobj({"boom": True}) - def test_pack_aggregated_data_respects_env_flag(self): - server = _DummyServer() - responses = [_DummyResponse(1), _DummyResponse(2, finished=True)] - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - packed = server.pack_aggregated_data(responses) - unpacked = ForkingPickler.loads(packed) - self.assertEqual(unpacked[0]["tensor_sum"], 3) - - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True): - packed = server.pack_aggregated_data(responses) - unpacked = ForkingPickler.loads(packed) - self.assertIsInstance(unpacked[0], _DummyResponse) - def test_receive_json_once_paths(self): fake_socket = _FakeSocket() fake_socket.closed = True @@ -360,8 +346,7 @@ def test_send_response_per_query_cache_and_flush(self): self.assertIn(req_id, server.cached_results) server.req_dict[req_id] = b"client" - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - server._send_response_per_query(req_id, [_DummyResponse(4, finished=True)]) + server._send_response_per_query(req_id, [_DummyResponse(4, finished=True)]) self.assertNotIn(req_id, server.req_dict) self.assertEqual(fake_socket.sent[-1][0], "send_multipart") @@ -370,17 +355,7 @@ def test_send_response_per_query_aggregate(self): server = _DummyServer(socket=fake_socket) server.req_dict["req-agg"] = b"client" server.aggregate_send = True - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - server._send_response_per_query("req-agg", [_DummyResponse(5, finished=True)]) - self.assertEqual(fake_socket.sent[-1][0], "send_multipart") - - def test_send_response_per_query_v1_processor(self): - fake_socket = _FakeSocket() - server = _DummyServer(socket=fake_socket) - server.req_dict["req-v1"] = b"client" - server.aggregate_send = False - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True): - server._send_response_per_query("req-v1", [_DummyResponse(6, finished=True)]) + server._send_response_per_query("req-agg", [_DummyResponse(5, finished=True)]) self.assertEqual(fake_socket.sent[-1][0], "send_multipart") def test_send_response_per_query_send_failure(self): @@ -391,8 +366,7 @@ def send_multipart(self, parts, copy=True): server = _DummyServer(socket=_ErrorSocket()) server.req_dict["req-error"] = b"client" server.aggregate_send = False - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - server._send_response_per_query("req-error", [_DummyResponse(7, finished=True)]) + server._send_response_per_query("req-error", [_DummyResponse(7, finished=True)]) self.assertEqual(server.req_dict, {}) def test_send_response_per_query_raises_without_socket(self): @@ -436,22 +410,11 @@ def test_send_batch_response_success(self): fake_socket = _FakeSocket() server = _DummyServer(socket=fake_socket) server.address = "test-address" - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - batch_data = [[_DummyResponse(1, finished=True)]] - server._send_batch_response(batch_data) + batch_data = [[_DummyResponse(1, finished=True)]] + server._send_batch_response(batch_data) self.assertEqual(len(fake_socket.sent), 1) self.assertEqual(fake_socket.sent[0][0], "send") - def test_send_batch_response_v1_processor(self): - """Test _send_batch_response with ENABLE_V1_DATA_PROCESSOR=True""" - fake_socket = _FakeSocket() - server = _DummyServer(socket=fake_socket) - server.address = "test-address" - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", True): - batch_data = [[_DummyResponse(1, finished=True)]] - server._send_batch_response(batch_data) - self.assertEqual(len(fake_socket.sent), 1) - def test_send_batch_response_raises_without_socket(self): """Test _send_batch_response logs error and returns when socket is None""" server = _DummyServer(socket=None) @@ -470,9 +433,8 @@ def send(self, msg, flags=0, **kwargs): server = _DummyServer(socket=_ErrorSocket()) server.address = "test-address" batch_data = [[_DummyResponse(1)]] - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - # Should not raise, error is caught and logged - server._send_batch_response(batch_data) + # Should not raise, error is caught and logged + server._send_batch_response(batch_data) def test_recv_result_handle_paths(self): fake_socket = _FakeSocket() @@ -640,10 +602,9 @@ def test_send_batch_response_with_worker_pid_none_uses_default_socket(self): server = _DummyServer(socket=fake_socket) server.address = "test-address" - with mock.patch.object(envs, "ENABLE_V1_DATA_PROCESSOR", False): - batch_data = [[_DummyResponse(1, finished=True)]] - # worker_pid=None -> goes to the else branch that calls _ensure_socket / uses self.socket - server._send_batch_response(batch_data, worker_pid=None) + batch_data = [[_DummyResponse(1, finished=True)]] + # worker_pid=None -> goes to the else branch that calls _ensure_socket / uses self.socket + server._send_batch_response(batch_data, worker_pid=None) # The default socket should have been used to send the data self.assertEqual(len(fake_socket.sent), 1) diff --git a/tests/model_executor/test_thinking_budget.py b/tests/model_executor/test_thinking_budget.py index 8ba9319ff7d..d9dd8f4b9b9 100644 --- a/tests/model_executor/test_thinking_budget.py +++ b/tests/model_executor/test_thinking_budget.py @@ -27,13 +27,6 @@ Ernie4_5_VLProcessor as ErnieVLDataProcessor, ) from fastdeploy.input.text_processor import DataProcessor as TextDataProcessor -from fastdeploy.input.v1.ernie4_5_processor import ( - Ernie4_5Processor as V1ErnieTextDataProcessor, -) -from fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor import ( - Ernie4_5_VLProcessor as V1ErnieVLDataProcessor, -) -from fastdeploy.input.v1.text_processor import DataProcessor as V1TextDataProcessor from fastdeploy.model_executor.logits_processor import ThinkingBudgetLogitsProcessor from fastdeploy.scheduler import SchedulerConfig @@ -711,31 +704,6 @@ def parallel_config(self): ips = None -class DummyRequestV1(SimpleNamespace): - def get(self, key, default=None): - if hasattr(self, key): - value = getattr(self, key) - if value is not None: - return value - if hasattr(self, "sampling_params") and hasattr(self.sampling_params, key): - value = getattr(self.sampling_params, key) - if value is not None: - return value - return default - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, value): - setattr(self, key, value) - - def set(self, key, value): - if hasattr(self, "sampling_params") and hasattr(self.sampling_params, key): - setattr(self.sampling_params, key, value) - else: - setattr(self, key, value) - - class TestThinkingBudgetSupplemental(unittest.TestCase): def test_update_thinking_prompt_state_from_text_processor(self): processor = TextDataProcessor.__new__(TextDataProcessor) @@ -750,43 +718,6 @@ def test_update_thinking_prompt_state_from_text_processor(self): self.assertEqual(updated["think_prompt_tokens_after_start"], 0) self.assertEqual(updated["think_prompt_last_token_id"], 3) - def test_v1_process_request_missing_logits_processors_args(self): - processor = V1TextDataProcessor.__new__(V1TextDataProcessor) - processor.generation_config = SimpleNamespace( - top_p=0.7, - temperature=1.0, - repetition_penalty=1.0, - frequency_penalty=0.0, - presence_penalty=0.0, - ) - processor.eos_token_ids = [1] - processor.update_stop_seq = lambda *args, **kwargs: None - processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids - processor.encode_with_cache = lambda *args, **kwargs: [1] - processor._update_thinking_prompt_state = lambda prompt_token_ids, args: args - processor.reasoning_parser = None - request = DummyRequestV1( - request_id="req", - eos_token_ids=None, - prompt_token_ids=[1], - prompt=None, - messages=None, - max_tokens=1, - chat_template_kwargs=None, - sampling_params=SimpleNamespace( - bad_words=None, - bad_words_token_ids=None, - max_tokens=1, - temperature=1.0, - top_p=0.9, - repetition_penalty=1.0, - frequency_penalty=0.0, - presence_penalty=0.0, - ), - ) - with patch("fastdeploy.input.v1.text_processor.process_stop_token_ids", lambda *args, **kwargs: None): - processor.process_request(request, max_model_len=8) - def test_engine_line_break_id_from_dict(self): tokenizer = DummyTokenizerForTextProcessor() data_processor = SimpleNamespace(tokenizer=tokenizer, eos_token_id_len=1, pad_token_id=0) @@ -835,27 +766,6 @@ def _text2ids(text, max_model_len=None, add_special_tokens=False): self.assertEqual(processor.encode_with_cache("iter"), [21, 22]) self.assertNotIn(("np", False), processor._tokenize_cache) - def test_v1_encode_with_cache_branches(self): - processor = V1TextDataProcessor.__new__(V1TextDataProcessor) - processor._tokenize_cache = OrderedDict() - processor._tokenize_cache_capacity = 1 - call_counter = {"np": 0, "iter": 0} - - def _text2ids(text, max_model_len=None, add_special_tokens=False): - if text == "np": - call_counter["np"] += 1 - return np.array([31, 32], dtype=np.int64) - call_counter["iter"] += 1 - return (v for v in [41, 42]) - - processor.text2ids = _text2ids - - self.assertEqual(processor.encode_with_cache("np"), [31, 32]) - self.assertEqual(processor.encode_with_cache("np"), [31, 32]) - self.assertEqual(call_counter["np"], 1) - self.assertEqual(processor.encode_with_cache("iter"), [41, 42]) - self.assertNotIn(("np", False), processor._tokenize_cache) - def test_text_encode_with_cache_lazy_init(self): processor = TextDataProcessor.__new__(TextDataProcessor) call_counter = {"count": 0} @@ -872,22 +782,6 @@ def _text2ids(text, max_model_len=None, add_special_tokens=False): self.assertEqual(processor.encode_with_cache("lazy"), [51, 52]) self.assertEqual(call_counter["count"], 1) - def test_v1_encode_with_cache_lazy_init(self): - processor = V1TextDataProcessor.__new__(V1TextDataProcessor) - call_counter = {"count": 0} - - def _text2ids(text, max_model_len=None, add_special_tokens=False): - call_counter["count"] += 1 - return np.array([61, 62], dtype=np.int64) - - processor.text2ids = _text2ids - - self.assertFalse(hasattr(processor, "_tokenize_cache")) - self.assertEqual(processor.encode_with_cache("lazy"), [61, 62]) - self.assertTrue(hasattr(processor, "_tokenize_cache")) - self.assertEqual(processor.encode_with_cache("lazy"), [61, 62]) - self.assertEqual(call_counter["count"], 1) - def test_ernie_encode_literal_text_with_cache(self): processor = ErnieTextDataProcessor.__new__(ErnieTextDataProcessor) processor.tokenizer = SimpleNamespace( @@ -898,16 +792,6 @@ def test_ernie_encode_literal_text_with_cache(self): self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [71, 72]) self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [71, 72]) - def test_v1_ernie_encode_literal_text_with_cache(self): - processor = V1ErnieTextDataProcessor.__new__(V1ErnieTextDataProcessor) - processor.tokenizer = SimpleNamespace( - tokenize=lambda text: ["token_c", "token_d"], - convert_tokens_to_ids=lambda tokens: [81, 82], - ) - - self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [81, 82]) - self.assertEqual(processor._encode_literal_text_with_cache("fallback"), [81, 82]) - def test_text_update_thinking_prompt_state_branches(self): processor = TextDataProcessor.__new__(TextDataProcessor) processor._think_token_ids = None @@ -949,29 +833,6 @@ def test_text_update_thinking_prompt_state_branches(self): # 命中 _get_think_token_ids 的缓存分支 self.assertEqual(processor._get_think_token_ids(), (THINKING_START_TOKEN_ID, THINKING_END_TOKEN_ID)) - def test_v1_update_thinking_prompt_state_branches(self): - processor = V1TextDataProcessor.__new__(V1TextDataProcessor) - processor._think_token_ids = None - processor.tokenizer = DummyTokenizerForTextProcessor() - - self.assertEqual(processor._update_thinking_prompt_state([1], "not-dict"), "not-dict") - self.assertEqual( - processor._update_thinking_prompt_state([1], {"thinking_budget": -1}), {"thinking_budget": -1} - ) - self.assertEqual(processor._update_thinking_prompt_state(None, {"thinking_budget": 1}), {"thinking_budget": 1}) - - with_start_no_end = processor._update_thinking_prompt_state( - np.array([1, THINKING_START_TOKEN_ID, 2, 3], dtype=np.int64), - {"thinking_budget": 4}, - ) - self.assertTrue(with_start_no_end["think_prompt_started"]) - self.assertFalse(with_start_no_end["think_prompt_ended"]) - self.assertEqual(with_start_no_end["think_prompt_tokens_after_start"], 0) - self.assertEqual(with_start_no_end["think_prompt_last_token_id"], 3) - - # 命中 _get_think_token_ids 的缓存分支 - self.assertEqual(processor._get_think_token_ids(), (THINKING_START_TOKEN_ID, THINKING_END_TOKEN_ID)) - def test_text_process_request_dict_think_stop_sentence(self): processor = TextDataProcessor.__new__(TextDataProcessor) processor._apply_default_parameters = lambda request: request @@ -1003,74 +864,6 @@ def test_text_process_request_dict_think_stop_sentence(self): ) self.assertNotIn("think_stop_sentence", processed["logits_processors_args"]) - def test_v1_process_request_think_stop_sentence(self): - processor = V1TextDataProcessor.__new__(V1TextDataProcessor) - processor._apply_default_parameters = lambda request: request - processor.eos_token_ids = [1] - processor.update_stop_seq = lambda *args, **kwargs: None - processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids - processor._encode_literal_text_with_cache = lambda text: [301, 302] - processor._update_thinking_prompt_state = lambda prompt_token_ids, args: args - processor.reasoning_parser = None - - request = DummyRequestV1( - request_id="req_v1", - eos_token_ids=[1], - prompt_token_ids=[10], - prompt=None, - messages=None, - logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"}, - bad_words=None, - bad_words_token_ids=None, - max_tokens=1, - temperature=1.0, - top_p=0.9, - ) - with patch("fastdeploy.input.v1.text_processor.process_stop_token_ids", lambda *args, **kwargs: None): - processed = processor.process_request(request, max_model_len=16) - self.assertEqual( - processed.logits_processors_args.get("think_stop_sentence_token_ids"), - [301, 302], - ) - self.assertNotIn("think_stop_sentence", processed.logits_processors_args) - - def test_v1_process_request_dict_think_stop_sentence(self): - processor = V1TextDataProcessor.__new__(V1TextDataProcessor) - processor._apply_default_parameters = lambda request: request - processor.eos_token_ids = [1] - processor.update_stop_seq = lambda *args, **kwargs: None - processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids - processor._encode_literal_text_with_cache = lambda text: [401, 402] - processor._update_thinking_prompt_state = lambda prompt_token_ids, args: args - processor.reasoning_parser = None - - request = DummyRequestV1( - request_id="req_v1_dict", - eos_token_ids=[1], - prompt_token_ids=[11], - prompt=None, - messages=None, - chat_template_kwargs=None, - sampling_params=SimpleNamespace( - bad_words=None, - bad_words_token_ids=None, - max_tokens=1, - temperature=1.0, - top_p=0.9, - repetition_penalty=1.0, - frequency_penalty=0.0, - presence_penalty=0.0, - logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"}, - ), - ) - with patch("fastdeploy.input.v1.text_processor.process_stop_token_ids", lambda *args, **kwargs: None): - processed = processor.process_request_dict(request, max_model_len=16) - self.assertEqual( - processed.sampling_params.logits_processors_args.get("think_stop_sentence_token_ids"), - [401, 402], - ) - self.assertNotIn("think_stop_sentence", processed.sampling_params.logits_processors_args) - def test_ernie_process_request_dict_prepares_thinking_budget_args(self): processor = ErnieTextDataProcessor.__new__(ErnieTextDataProcessor) processor._apply_default_parameters = lambda request: request @@ -1104,46 +897,6 @@ def test_ernie_process_request_dict_prepares_thinking_budget_args(self): self.assertFalse(processed["logits_processors_args"]["think_prompt_ended"]) self.assertEqual(processed["logits_processors_args"]["think_prompt_tokens_after_start"], 0) - def test_v1_ernie_process_request_dict_prepares_thinking_budget_args(self): - processor = V1ErnieTextDataProcessor.__new__(V1ErnieTextDataProcessor) - processor._apply_default_parameters = lambda request: request - processor.eos_token_ids = [1] - processor.update_stop_seq = lambda *args, **kwargs: None - processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids - processor._encode_literal_text_with_cache = lambda text: [601, 602] - processor.tokenizer = DummyTokenizerForTextProcessor() - processor.reasoning_parser = None - - request = DummyRequestV1( - request_id="req_v1_ernie_text", - eos_token_ids=[1], - prompt_token_ids=[1, THINKING_START_TOKEN_ID, 2], - prompt=None, - messages=None, - chat_template_kwargs=None, - enable_thinking=True, - sampling_params=SimpleNamespace( - bad_words=None, - bad_words_token_ids=None, - max_tokens=1, - temperature=1.0, - top_p=0.9, - repetition_penalty=1.0, - frequency_penalty=0.0, - presence_penalty=0.0, - response_max_tokens=None, - n=1, - logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"}, - ), - ) - with patch("fastdeploy.input.v1.ernie4_5_processor.process_stop_token_ids", lambda *args, **kwargs: None): - processed = processor.process_request_dict(request, max_model_len=16) - - self.assertEqual(processed.sampling_params.logits_processors_args["think_stop_sentence_token_ids"], [601, 602]) - self.assertTrue(processed.sampling_params.logits_processors_args["think_prompt_started"]) - self.assertFalse(processed.sampling_params.logits_processors_args["think_prompt_ended"]) - self.assertEqual(processed.sampling_params.logits_processors_args["think_prompt_tokens_after_start"], 0) - def test_ernie_vl_process_request_dict_prepares_thinking_budget_args(self): processor = ErnieVLDataProcessor.__new__(ErnieVLDataProcessor) processor._apply_default_parameters = lambda request: request @@ -1182,58 +935,6 @@ def test_ernie_vl_process_request_dict_prepares_thinking_budget_args(self): self.assertFalse(processed["logits_processors_args"]["think_prompt_ended"]) self.assertEqual(processed["logits_processors_args"]["think_prompt_tokens_after_start"], 0) - def test_v1_ernie_vl_process_request_dict_prepares_thinking_budget_args(self): - processor = V1ErnieVLDataProcessor.__new__(V1ErnieVLDataProcessor) - processor._apply_default_parameters = lambda request: request - processor.eos_token_ids = [1] - processor.update_stop_seq = lambda *args, **kwargs: None - processor.update_bad_words = lambda bad_words, bad_words_token_ids: bad_words_token_ids - processor._encode_literal_text_with_cache = lambda text: [801, 802] - processor.tokenizer = DummyTokenizerForTextProcessor() - processor.reasoning_parser = None - processor._check_mm_limits = lambda *args, **kwargs: None - processor.append_completion_tokens = lambda *args, **kwargs: None - processor.pack_outputs = lambda outs: outs - processor.ernie4_5_processor = SimpleNamespace( - request2ids=lambda request: {"input_ids": np.array([1, THINKING_START_TOKEN_ID, 2], dtype=np.int64)} - ) - - request = DummyRequestV1( - request_id="req_v1_ernie_vl", - eos_token_ids=[1], - prompt_token_ids=None, - prompt=None, - messages=[{"role": "user", "content": "hi"}], - chat_template_kwargs=None, - enable_thinking=True, - completion_token_ids=None, - multimodal_data=None, - sampling_params=SimpleNamespace( - bad_words=None, - bad_words_token_ids=None, - max_tokens=1, - temperature=1.0, - top_p=0.9, - repetition_penalty=1.0, - frequency_penalty=0.0, - presence_penalty=0.0, - response_max_tokens=None, - reasoning_max_tokens=None, - n=1, - logits_processors_args={"thinking_budget": 20, "think_stop_sentence": "done"}, - ), - ) - with patch( - "fastdeploy.input.v1.ernie4_5_vl_processor.ernie4_5_vl_processor.process_stop_token_ids", - lambda *args, **kwargs: None, - ): - processed = processor.process_request_dict(request, max_model_len=16) - - self.assertEqual(processed.sampling_params.logits_processors_args["think_stop_sentence_token_ids"], [801, 802]) - self.assertTrue(processed.sampling_params.logits_processors_args["think_prompt_started"]) - self.assertFalse(processed.sampling_params.logits_processors_args["think_prompt_ended"]) - self.assertEqual(processed.sampling_params.logits_processors_args["think_prompt_tokens_after_start"], 0) - if __name__ == "__main__": unittest.main()