From 54f04d49db833d74b5396d5876029a86de0c84c1 Mon Sep 17 00:00:00 2001 From: CyanScholar Date: Tue, 24 Mar 2026 07:12:17 +0000 Subject: [PATCH 1/6] use share mem --- fastdeploy/config.py | 2 + fastdeploy/engine/args_utils.py | 12 + fastdeploy/engine/offload_manager.py | 1254 +++++++++++++++++ fastdeploy/engine/request.py | 1 + .../engine/sched/resource_manager_v1.py | 58 +- fastdeploy/envs.py | 6 + fastdeploy/utils.py | 1 + fastdeploy/worker/worker_process.py | 6 + 8 files changed, 1337 insertions(+), 3 deletions(-) create mode 100644 fastdeploy/engine/offload_manager.py diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 6d31e5ca616..91400d5af1f 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1805,6 +1805,7 @@ def __init__( tool_parser: str = None, test_mode=False, routing_replay_config: Optional[RoutingReplayConfig] = None, + enable_decode_offload: bool = False, ): self.model_config: ModelConfig = model_config # type: ignore self.cache_config: CacheConfig = cache_config # type: ignore @@ -1821,6 +1822,7 @@ def __init__( self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config self.router_config: RouterConfig = router_config self.routing_replay_config = routing_replay_config + self.enable_decode_offload = enable_decode_offload # Initialize cuda graph capture list max_capture_shape = self.scheduler_config.max_num_seqs diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 889b11cbdc2..c71815d9e77 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -551,6 +551,11 @@ class EngineArgs: Flag to enable prefill_use_worst_num_tokens. Default is False (disabled). """ + enable_decode_offload: bool = False + """ + Flag to enable decode offload. Default is False (disabled). + """ + def __post_init__(self): """ Post-initialization processing to set default tokenizer if not provided. @@ -1071,6 +1076,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.ep_prefill_use_worst_num_tokens, help="Enable prefill use worst num tokens for EP.", ) + parallel_group.add_argument( + "--enable-decode-offload", + action="store_true", + default=EngineArgs.enable_decode_offload, + help="Enable decode offload.", + ) # Load group load_group = parser.add_argument_group("Load Configuration") @@ -1514,4 +1525,5 @@ def create_engine_config(self) -> FDConfig: plas_attention_config=plas_attention_config, early_stop_config=early_stop_cfg, routing_replay_config=routing_replay_config, + enable_decode_offload=self.enable_decode_offload, ) diff --git a/fastdeploy/engine/offload_manager.py b/fastdeploy/engine/offload_manager.py new file mode 100644 index 00000000000..418c115c49f --- /dev/null +++ b/fastdeploy/engine/offload_manager.py @@ -0,0 +1,1254 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import os +import threading +import uuid +from typing import Dict, List, Optional, Tuple + +import paddle + +from fastdeploy import envs +from fastdeploy.engine.request import Request, RequestStatus +from fastdeploy.utils import offload_logger + +# 导入 share_external_data 用于从共享内存获取 KV cache +try: + from fastdeploy.cache_manager.ops import share_external_data_ +except ImportError: + share_external_data_ = None + + +class OffloadManager: + """ + KV Cache Offload管理器 + + 职责: + 1. 管理被offload请求的KV Cache + 2. 提供Decode阶段批量offload和resume接口 + 3. 维护offloaded请求队列 + 4. 支持多级卸载策略 (L1 GPU -> L2 CPU -> L3 SSD) + """ + + # 存储层级常量 + STORAGE_LEVEL_CPU = "L2" + STORAGE_LEVEL_SSD = "L3" + + def __init__(self, config=None, cache_manager=None, model_runner=None): + """ + 初始化OffloadManager + + Args: + config: FastDeploy配置对象 + cache_manager: PrefixCacheManager实例 + model_runner: ModelRunner实例 (用于访问 KV cache tensors) + """ + self.config = config + self.cache_manager = cache_manager + self.model_runner = model_runner # 用于访问 KV cache + + # offload开关 + self.enable_offload = getattr(config, "enable_decode_offload", False) if config else False + + # TODO:offload策略参数,需要兼容性处理 + self.min_steps = 20 + # cpu block大小为8KB + self.cpu_offloading_chunk_size = getattr(envs, "FD_CPU_OFFLOAD_CHUNK_SIZE", 8192) + # 默认cpu memory限制为50GB + self.cpu_memory_limit = getattr(envs, "FD_CPU_MEMORY_LIMIT", 50 * 1024 * 1024 * 1024) + # ssd存储路径 + self.storage_path = getattr(envs, "FD_OFFLOAD_STORAGE_PATH", "/tmp/fd_offload") + + # 保存offloaded请求的相关cache信息 + self._offloaded_requests: Dict[str, dict] = {} + self._lock = threading.Lock() + + # 缓存配置信息(延迟初始化) + self._cache_config = None + self._key_cache_shape = None + self._value_cache_shape = None + self._num_layers = None + self._tensor_parallel_size = None + self._local_rank = 0 + self._device_id = 0 + self._cache_dtype = None + + # 确保存储目录存在 + if self.enable_offload and not os.path.exists(self.storage_path): + os.makedirs(self.storage_path, exist_ok=True) + + offload_logger.info( + f"[DEBUG: offload] OffloadManager initialized: enable_offload={self.enable_offload}, " + f"min_steps={self.min_steps}, storage_path={self.storage_path}" + ) + + def _init_cache_info(self): + """初始化cache配置信息(延迟初始化)""" + if self._cache_config is not None: + return + + if self.cache_manager is None: + return + + self._cache_config = self.cache_manager.cache_config + self._num_layers = self.config.model_config.num_hidden_layers + self._tensor_parallel_size = getattr(self.config.parallel_config, "tensor_parallel_size", 1) + self._local_rank = getattr(self.config.parallel_config, "tensor_parallel_rank", 0) + + # 从 device_ids 获取实际的设备号(考虑 CUDA_VISIBLE_DEVICES) + device_ids_str = getattr(self.config.parallel_config, "device_ids", "0") + device_ids = device_ids_str.split(",") + local_dp_id = getattr(self.config.parallel_config, "local_data_parallel_id", 0) + # device_id 是当前 rank 对应的实际 GPU 设备号 + rank_in_node = self._local_rank % len(device_ids) + self._device_id = int(device_ids[rank_in_node]) if rank_in_node < len(device_ids) else 0 + + self._cache_dtype = self._cache_config.cache_dtype + + offload_logger.info( + f"[DEBUG: offload] _init_cache_info: local_rank={self._local_rank}, " + f"device_ids={device_ids}, rank_in_node={rank_in_node}, device_id={self._device_id}" + ) + + # 计算KV cache shape - 使用实际的GPU block数量 + # 从cache_config获取总的GPU block数 + total_gpu_blocks = getattr(self._cache_config, "total_block_num", None) + if total_gpu_blocks is None: + # 尝试从cache_manager获取 + total_gpu_blocks = getattr(self.cache_manager, 
"num_gpu_blocks", None) + if total_gpu_blocks is None: + # 最后尝试从gpu_free_block_list推断 + total_gpu_blocks = len(getattr(self.cache_manager, "gpu_free_block_list", [])) + + if not total_gpu_blocks: + offload_logger.error("[DEBUG: offload] Failed to get total_gpu_blocks, using default 100") + total_gpu_blocks = 100 # 默认值,防止错误 + + key_shape, val_shape = self._compute_kv_cache_shape(total_gpu_blocks) + self._key_cache_shape = key_shape + self._value_cache_shape = val_shape + + offload_logger.info( + f"[DEBUG: offload] Cache info initialized: num_layers={self._num_layers}, " + f"total_gpu_blocks={total_gpu_blocks}, key_shape={self._key_cache_shape}, " + f"value_shape={self._value_cache_shape}" + ) + + def _get_cache_tensor_name(self, layer_id: int, is_key: bool) -> str: + """获取共享内存中cache tensor的名称""" + prefix = "key_caches" if is_key else "value_caches" + return f"{prefix}_{layer_id}_rank{self._local_rank}.device{self._device_id}" + + def _get_cache_scale_tensor_name(self, layer_id: int, is_key: bool) -> str: + """获取共享内存中cache scale tensor的名称(用于量化)""" + prefix = "key_cache_scales" if is_key else "value_cache_scales" + return f"{prefix}_{layer_id}_rank{self._local_rank}.device{self._device_id}" + + def _get_gpu_cache_tensor(self, layer_id: int, is_key: bool) -> Optional[paddle.Tensor]: + """ + 从共享内存获取GPU cache tensor的引用 + + Args: + layer_id: 层ID + is_key: 是否为key cache + + Returns: + paddle.Tensor: GPU cache tensor引用,失败返回None + """ + if share_external_data_ is None: + offload_logger.error("[DEBUG: offload] share_external_data_ is not available") + return None + + try: + tensor_name = self._get_cache_tensor_name(layer_id, is_key) + cache_shape = self._key_cache_shape if is_key else self._value_cache_shape + + if cache_shape is None or len(cache_shape) == 0: + offload_logger.error(f"[DEBUG: offload] cache_shape is None or empty for layer {layer_id}") + return None + + # 创建空tensor并通过share_external_data_绑定到共享内存 + dtype = self._cache_dtype if self._cache_dtype else "bfloat16" + if dtype == "block_wise_fp8": + dtype = paddle.float8_e4m3fn + elif dtype == "int4_zp": + dtype = paddle.int8 + elif dtype == "bfloat16": + dtype = paddle.bfloat16 + else: + dtype = paddle.float16 + + empty_tensor = paddle.empty(shape=[], dtype=dtype) + try: + cache_tensor = share_external_data_(empty_tensor, tensor_name, cache_shape, True) + except Exception as e: + offload_logger.error(f"[DEBUG: offload] share_external_data_ FAILED for {tensor_name}: {e}") + raise + + return cache_tensor + + except Exception as e: + offload_logger.error(f"[DEBUG: offload] Failed to get GPU cache tensor for layer {layer_id}: {e}") + return None + + def _get_gpu_cache_scale_tensor(self, layer_id: int, is_key: bool) -> Optional[paddle.Tensor]: + """ + 从共享内存获取GPU cache scale tensor的引用(用于量化) + """ + if share_external_data_ is None: + return None + + try: + tensor_name = self._get_cache_scale_tensor_name(layer_id, is_key) + # scale shape: [num_blocks, num_heads, block_size] + scale_shape = [ + self._key_cache_shape[0], + self._key_cache_shape[1], + self._key_cache_shape[2], + ] + + empty_tensor = paddle.empty(shape=[], dtype=paddle.float32) + scale_tensor = share_external_data_(empty_tensor, tensor_name, scale_shape, True) + + return scale_tensor + + except Exception as e: + offload_logger.error(f"[DEBUG: offload] Failed to get GPU scale tensor for layer {layer_id}: {e}") + return None + + def _compute_kv_cache_shape(self, max_block_num): + """ + 计算 KV Cache 的 shape + + Args: + max_block_num: 最大 block 数量 + + Returns: + tuple: (key_cache_shape, 
val_cache_shape) + """ + try: + from fastdeploy.model_executor.layers.attention import get_attention_backend + + config = self.cache_manager.config + cache_config = self.cache_manager.cache_config + + attn_cls = get_attention_backend() + tp_size = getattr(config.parallel_config, "tensor_parallel_size", 1) + num_heads = config.model_config.num_attention_heads // tp_size + kv_num_heads = max( + 1, + int(config.model_config.num_key_value_heads) // tp_size, + ) + head_dim = config.model_config.head_dim + + kv_cache_quant_type = None + if ( + config.quant_config + and hasattr(config.quant_config, "kv_cache_quant_type") + and config.quant_config.kv_cache_quant_type is not None + ): + kv_cache_quant_type = config.quant_config.kv_cache_quant_type + + encoder_block_shape_q = 64 + decoder_block_shape_q = 16 + key_cache_shape, value_cache_shape = attn_cls( + config, + kv_num_heads=kv_num_heads, + num_heads=num_heads, + head_dim=head_dim, + encoder_block_shape_q=encoder_block_shape_q, + decoder_block_shape_q=decoder_block_shape_q, + ).get_kv_cache_shape(max_num_blocks=max_block_num, kv_cache_quant_type=kv_cache_quant_type) + + offload_logger.info( + f"[DEBUG: offload] Computed key_cache_shape: {key_cache_shape}, value_cache_shape: {value_cache_shape}" + ) + return key_cache_shape, value_cache_shape + + except Exception as e: + offload_logger.error(f"[DEBUG: offload] Failed to compute kv_cache_shape: {e}") + return None, None + + # ==================== 判断接口 ==================== + + def can_offload(self, request: Request) -> bool: + """ + 检查请求是否可以被offload + + 条件: + 1. offload功能已启用 + 2. 请求未被offload过 + 3. 请求有block_tables可被offload + 4. 请求处于decode阶段(num_computed_tokens >= need_prefill_tokens) + 5. need_prefill_tokens已初始化 + 6. CPU内存充足 + + 注意: decode阶段判定和存储空间检查由调用者或offload_req负责 + """ + # DEBUG: can_offload入口检查 + offload_logger.debug( + f"[DEBUG: can_offload] Checking request {request.request_id}, " + f"enable_offload={self.enable_offload}, is_offloaded={request.is_offloaded}" + ) + + if not self.enable_offload: + offload_logger.debug(f"[DEBUG: can_offload] {request.request_id}: offload disabled") + return False + + if request.is_offloaded: + offload_logger.debug(f"[DEBUG: can_offload] {request.request_id}: already offloaded") + return False + + # 检查是否有可被offload的blocks + if not request.block_tables: + offload_logger.debug(f"[DEBUG: can_offload] {request.request_id}: no block_tables") + return False + + # 新增:检查need_prefill_tokens是否已初始化 + if request.need_prefill_tokens is None: + offload_logger.warning( + f"[DEBUG: can_offload] {request.request_id}: need_prefill_tokens is None, cannot offload" + ) + return False + + # 新增:检查请求是否处于decode阶段 + if request.num_computed_tokens < request.need_prefill_tokens: + offload_logger.warning( + f"[DEBUG: can_offload] {request.request_id} is not in decode phase, " + f"num_computed_tokens={request.num_computed_tokens}, " + f"need_prefill_tokens={request.need_prefill_tokens}, cannot offload" + ) + return False + + # 注意:新方法 .to("cpu") 不需要预先分配的 CPU blocks + # 因此移除对 cpu_free_block_list 的检查 + + offload_logger.debug(f"[DEBUG: can_offload] {request.request_id}: can offload = True") + return True + + def can_resume(self, request: Request) -> bool: + """ + 检查请求是否可以被恢复 + + 条件: + 1. offload功能已启用 + 2. 请求存在offloaded信息 + 3. 存在有效的KV Cache副本(cpu_copy或ssd_copy) + 4. 
GPU内存充足 + """ + if not self.enable_offload: + return False + + if request.request_id not in self._offloaded_requests: + return False + + offloaded_info = self._offloaded_requests.get(request.request_id) + if offloaded_info is None: + return False + + # 检查是否存在有效的KV Cache副本 + storage_level = offloaded_info.get("storage_level") + if storage_level == self.STORAGE_LEVEL_CPU: + if offloaded_info.get("kv_cache_cpu") is None: + return False + elif storage_level == self.STORAGE_LEVEL_SSD: + storage_path = offloaded_info.get("storage_path") + if not storage_path or not os.path.exists(storage_path): + return False + else: + return False + + # 检查GPU内存是否充足 + num_blocks_needed = offloaded_info.get("num_blocks_needed", 0) + if self.cache_manager is None: + return False + + return self.cache_manager.can_allocate_gpu_blocks(num_blocks_needed) + + def offload_decode( + self, running_requests: List[Request], min_steps: int = 20 + ) -> Tuple[List[Request], List[Request]]: + """ + 批量offload decode请求,直到当前batch能运行min_steps个step为止 + + 注意: 调度策略由外层ResourceManager决定,此函数仅负责: + 1. 检测请求是否处于decode阶段 + 2. 执行offload操作 + 3. 保存KV Cache相关信息 + 4. 更新请求状态 + + Args: + running_requests: 待offload的请求列表(已按调度策略排序) + min_steps: 最小运行step数(默认20) + + Returns: + Tuple[offloaded_reqs, abort_reqs]: + - offloaded_reqs: 成功offload的请求 + - abort_reqs: offload失败的请求(需要调用者abort处理) + """ + # DEBUG: offload - 入口调试 + offload_logger.info( + f"[DEBUG: offload_decode] offload_decode called, enable_offload={self.enable_offload}, " + f"num_requests={len(running_requests)}" + ) + + if not self.enable_offload: + return [], [] + + offloaded_reqs = [] + abort_reqs = [] + remaining_count = len(running_requests) + + for req in running_requests: + # 非decode阶段的请求不应出现在这里,记录warning但不处理 + # 请求状态未修改,调用者可重新调度 + if req.num_computed_tokens < req.need_prefill_tokens: + offload_logger.warning( + f"[DEBUG: offload_decode] Request {req.request_id} is not in decode phase, " + f"num_computed_tokens={req.num_computed_tokens}, " + f"need_prefill_tokens={req.need_prefill_tokens}" + ) + continue + + # DEBUG: offload - can_offload 检查 + can_offload_result = self.can_offload(req) + offload_logger.info( + f"[DEBUG: offload_decode] can_offload({req.request_id})={can_offload_result}, " + f"is_offloaded={req.is_offloaded}, block_tables={len(req.block_tables) if req.block_tables else 0}" + ) + + if not can_offload_result: + continue + + # 执行offload + if self.offload_req(req): + offloaded_reqs.append(req) + remaining_count -= 1 + offload_logger.info(f"[DEBUG: offload_decode] Successfully offloaded request {req.request_id}") + else: + # offload失败,返回给调用者处理(需要abort) + abort_reqs.append(req) + offload_logger.warning(f"[DEBUG: offload_decode] Failed to offload request {req.request_id}") + + if self.cache_manager is not None and remaining_count > 0: + block_size = self.cache_manager.cache_config.block_size + blocks_needed_per_request = (min_steps + block_size - 1) // block_size + total_blocks_needed = remaining_count * blocks_needed_per_request + current_free_blocks = len(getattr(self.cache_manager, "gpu_free_block_list", [])) + if current_free_blocks >= total_blocks_needed: + offload_logger.info( + f"[DEBUG: offload_decode] Memory sufficient after offloading " + f"{len(offloaded_reqs)} requests, remaining={remaining_count}, " + f"free_blocks={current_free_blocks}, needed={total_blocks_needed}" + ) + break + + return offloaded_reqs, abort_reqs + + # ==================== 单请求多级Offload接口 ==================== + + def offload_req(self, request: Request) -> bool: + """ + 指定请求触发多级卸载(L1→L2→L3) + + 执行步骤: 
+ 1. 检查请求是否处于decode阶段 + 2. 尝试L2 offload(CPU内存) + 3. 如果L2内存不足,触发L3 offload(SSD存储) + 4. 释放对应GPU blocks + 5. 更新请求状态 + """ + if not self.enable_offload: + return False + + # 检查是否已经被offload + if request.is_offloaded: + offload_logger.warning(f"[DEBUG: offload_req] Request {request.request_id} already offloaded") + return False + + # 初始化cache信息 + self._init_cache_info() + + # 尝试L2 offload (CPU) + storage_level = self.STORAGE_LEVEL_CPU + kv_cache_cpu = None + + try: + kv_cache_cpu = self.get_cpu_copy(request) + if kv_cache_cpu is None: + # CPU offload失败,尝试SSD offload + # 注意: SSD offload同样需要先获取数据,这里直接返回失败 + # 如果未来需要SSD offload,需要实现直接GPU->SSD的传输 + offload_logger.error( + f"[DEBUG: offload_req] CPU offload failed for {request.request_id}, " f"no available fallback" + ) + return False + except Exception as e: + offload_logger.error(f"[DEBUG: offload_req] CPU offload failed: {e}") + return False + + # 如果需要L3,保存到SSD (当前kv_cache_cpu已包含数据) + storage_path = None + if storage_level == self.STORAGE_LEVEL_SSD: + try: + storage_path = self.save_to_storage(kv_cache_cpu) + if storage_path is None: + offload_logger.error(f"[DEBUG: offload_req] SSD offload failed for {request.request_id}") + # 清理已分配的CPU blocks + if kv_cache_cpu and self.cache_manager: + self.cache_manager.recycle_cpu_blocks(kv_cache_cpu.get("cpu_block_ids", [])) + return False + if kv_cache_cpu is not None: + del kv_cache_cpu + kv_cache_cpu = None + except Exception as e: + offload_logger.error(f"[DEBUG: offload_req] SSD offload failed: {e}") + return False + + # 保存offload信息 - 在释放GPU blocks之前保存 + with self._lock: + original_block_tables = list(request.block_tables) if request.block_tables else [] + + # 新增:确保need_prefill_tokens不为None,提供默认值 + need_prefill_tokens_value = request.need_prefill_tokens + if need_prefill_tokens_value is None: + # 如果need_prefill_tokens未初始化,使用prompt_token_ids_len作为默认值 + need_prefill_tokens_value = request.prompt_token_ids_len if request.prompt_token_ids_len else 0 + offload_logger.warning( + f"[DEBUG: offload_req] Request {request.request_id} need_prefill_tokens is None during offload, " + f"using default value: {need_prefill_tokens_value}" + ) + + self._offloaded_requests[request.request_id] = { + "kv_cache_cpu": kv_cache_cpu, + "storage_path": storage_path, + "storage_level": storage_level, + "num_tokens": request.num_total_tokens, + "num_blocks_needed": len(original_block_tables), + "output_token_ids": list(request.output_token_ids), + "num_computed_tokens": request.num_computed_tokens, + "need_prefill_tokens": need_prefill_tokens_value, + "prompt_token_ids": list(request.prompt_token_ids) if request.prompt_token_ids else None, + "prompt_token_ids_len": request.prompt_token_ids_len, + "sampling_params": request.sampling_params, + "block_tables": original_block_tables, + } + + # 释放GPU blocks + self.release_gpu_blocks(request) + + # 更新请求状态 + request.status = RequestStatus.PREEMPTED + request.is_offloaded = True + + offload_logger.info( + f"[DEBUG: offload_req] Request {request.request_id} offloaded to {storage_level}, " + f"num_tokens={request.num_total_tokens}, output_tokens={len(request.output_token_ids)}, " + f"blocks_needed={len(original_block_tables)}" + ) + + return True + + def offload_kv_cache(self, request: Request, target_level: str = "L2") -> bool: + """ + 调用多级memory offload工具函数,并释放相应GPU blocks + + 用于扩展性调用,可单独对指定请求进行KV Cache offload + + Args: + request: 需要offload的请求 + target_level: 目标存储层级("L2"=CPU, "L3"=SSD) + + Returns: + bool: offload是否成功 + """ + try: + if target_level == self.STORAGE_LEVEL_CPU: + 
kv_cache_cpu = self.get_cpu_copy(request)
+                return kv_cache_cpu is not None
+            elif target_level == self.STORAGE_LEVEL_SSD:
+                kv_cache_cpu = self.get_cpu_copy(request)
+                storage_path = self.save_to_storage(kv_cache_cpu)
+                if kv_cache_cpu is not None:
+                    del kv_cache_cpu
+                return storage_path is not None
+            else:
+                offload_logger.error(f"[DEBUG: offload_kv_cache] Invalid target_level: {target_level}")
+                return False
+        except Exception as e:
+            offload_logger.error(f"[DEBUG: offload_kv_cache] offload_kv_cache failed: {e}")
+            return False
+
+    def release_gpu_blocks(self, request: Request) -> None:
+        """Recycle the request's GPU blocks back to the cache manager and clear its block table."""
+        if self.cache_manager is None:
+            return
+
+        if request.block_tables:
+            blocks_to_release = list(request.block_tables)
+            offload_logger.info(
+                f"[DEBUG: release_gpu_blocks] Releasing {len(blocks_to_release)} blocks for request {request.request_id}"
+            )
+            self.cache_manager.recycle_gpu_blocks(blocks_to_release, request.request_id)
+            request.block_tables = []
+
+    # ==================== CPU memory offload APIs ====================
+
+    def get_cpu_copy(self, request: Request) -> Optional[dict]:
+        """
+        Take a CPU copy of the request's KV cache from the GPU.
+
+        Uses paddle.Tensor.to("cpu") to copy the KV cache from GPU to CPU memory.
+
+        Args:
+            request: the request being offloaded
+
+        Returns:
+            dict: the KV cache data on CPU, or None on failure
+            {
+                "key_caches": List[paddle.Tensor],  # key caches on CPU
+                "value_caches": List[paddle.Tensor],  # value caches on CPU
+                "key_scales": List[paddle.Tensor],  # optional, for quantization
+                "value_scales": List[paddle.Tensor],  # optional, for quantization
+                "block_ids": List[int],  # the corresponding block IDs
+                "num_blocks": int,
+            }
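+
+        Example: an illustrative sketch of the per-layer copy performed below,
+        where ``layer_cache`` is a hypothetical name for one layer's GPU cache
+        tensor of shape [num_blocks, ...] (not an attribute of this class):
+
+            blocks = [layer_cache[b] for b in block_ids]  # gather this request's blocks
+            layer = paddle.stack(blocks) if len(blocks) > 1 else blocks[0].unsqueeze(0)
+            layer_cpu = layer.to("cpu")  # device-to-host copy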
+        """
+        import time
+
+        start_time = time.time()
+
+        if not request.block_tables:
+            offload_logger.warning(f"[DEBUG: get_cpu_copy] {request.request_id}: no block_tables")
+            return None
+
+        self._init_cache_info()
+
+        if self._key_cache_shape is None:
+            offload_logger.error("[DEBUG: get_cpu_copy] key_cache_shape is not initialized")
+            return None
+
+        try:
+            block_ids = list(request.block_tables)
+            num_blocks = len(block_ids)
+
+            key_caches_cpu = []
+            value_caches_cpu = []
+            key_scales_cpu = []
+            value_scales_cpu = []
+
+            offload_logger.info(
+                f"[DEBUG: get_cpu_copy] Copying KV cache for request {request.request_id}, "
+                f"num_layers={self._num_layers}, num_blocks={num_blocks}, block_ids={block_ids}"
+            )
+
+            for layer_id in range(self._num_layers):
+                # Get the key cache on the GPU
+                key_cache_gpu = self._get_gpu_cache_tensor(layer_id, is_key=True)
+                if key_cache_gpu is None:
+                    offload_logger.error(f"[DEBUG: get_cpu_copy] Failed to get key cache for layer {layer_id}")
+                    return None
+
+                # Gather the blocks this request owns
+                key_cache_blocks = []
+                for block_id in block_ids:
+                    if block_id < key_cache_gpu.shape[0]:
+                        key_cache_blocks.append(key_cache_gpu[block_id])
+                    else:
+                        offload_logger.error(
+                            f"[DEBUG: get_cpu_copy] Block {block_id} out of range for key cache (shape={key_cache_gpu.shape})"
+                        )
+                        return None
+
+                # Stack and copy to the CPU
+                key_cache_layer = (
+                    paddle.stack(key_cache_blocks) if len(key_cache_blocks) > 1 else key_cache_blocks[0].unsqueeze(0)
+                )
+                key_cache_cpu = key_cache_layer.to("cpu")
+                key_caches_cpu.append(key_cache_cpu)
+
+                # Copy the value cache as well (if present)
+                if self._value_cache_shape and len(self._value_cache_shape) > 0:
+                    value_cache_gpu = self._get_gpu_cache_tensor(layer_id, is_key=False)
+                    if value_cache_gpu is not None:
+                        value_cache_blocks = []
+                        for block_id in block_ids:
+                            if block_id < value_cache_gpu.shape[0]:
+                                value_cache_blocks.append(value_cache_gpu[block_id])
+                            else:
+                                offload_logger.error(
+                                    f"[DEBUG: get_cpu_copy] Block {block_id} out of range for value cache"
+                                )
+                                return None
+
+                        value_cache_layer = (
+                            paddle.stack(value_cache_blocks)
+                            if len(value_cache_blocks) > 1
+                            else value_cache_blocks[0].unsqueeze(0)
+                        )
+                        value_cache_cpu = value_cache_layer.to("cpu")
+                        value_caches_cpu.append(value_cache_cpu)
+
+                # Copy the scale tensors (for quantized caches)
+                if self._cache_dtype == "block_wise_fp8":
+                    key_scale_gpu = self._get_gpu_cache_scale_tensor(layer_id, is_key=True)
+                    if key_scale_gpu is not None:
+                        key_scale_blocks = [key_scale_gpu[block_id] for block_id in block_ids]
+                        key_scale_layer = (
+                            paddle.stack(key_scale_blocks)
+                            if len(key_scale_blocks) > 1
+                            else key_scale_blocks[0].unsqueeze(0)
+                        )
+                        key_scales_cpu.append(key_scale_layer.to("cpu"))
+
+                    value_scale_gpu = self._get_gpu_cache_scale_tensor(layer_id, is_key=False)
+                    if value_scale_gpu is not None:
+                        value_scale_blocks = [value_scale_gpu[block_id] for block_id in block_ids]
+                        value_scale_layer = (
+                            paddle.stack(value_scale_blocks)
+                            if len(value_scale_blocks) > 1
+                            else value_scale_blocks[0].unsqueeze(0)
+                        )
+                        value_scales_cpu.append(value_scale_layer.to("cpu"))
+
+            result = {
+                "key_caches": key_caches_cpu,
+                "value_caches": value_caches_cpu,
+                "key_scales": key_scales_cpu if key_scales_cpu else None,
+                "value_scales": value_scales_cpu if value_scales_cpu else None,
+                "block_ids": block_ids,
+                "num_blocks": num_blocks,
+            }
+
+            elapsed_time = time.time() - start_time
+            offload_logger.info(
+                f"[DEBUG: get_cpu_copy] Successfully copied KV cache to CPU for request {request.request_id}, "
+                f"key_cache_shape={key_caches_cpu[0].shape if key_caches_cpu else None}, "
+                f"elapsed_time={elapsed_time:.4f}s"
+            )
+
+            return result
+
+        except Exception as e:
+            elapsed_time = time.time() - start_time
+            offload_logger.error(
+                f"[DEBUG: get_cpu_copy] Failed to copy KV cache to CPU: {e}, elapsed_time={elapsed_time:.4f}s"
+            )
+            return None
+
+    def load_cpu_copy(self, kv_cache_cpu: dict, request: Request) -> bool:
+        """
+        Load a CPU-resident KV cache back onto the GPU.
+
+        Uses paddle.Tensor.to(device) to copy the KV cache from CPU back into GPU shared memory.
+
+        Args:
+            kv_cache_cpu: the KV cache info on CPU (the dict returned by get_cpu_copy)
+            request: the request being resumed
+
+        Returns:
+            bool: whether loading succeeded
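+
+        Example: the caller-side pairing used by resume_decode below, assuming
+        ``mgr`` is this OffloadManager and ``kv`` came from get_cpu_copy (both
+        names are placeholders):
+
+            request.block_tables = mgr.cache_manager.allocate_gpu_blocks(kv["num_blocks"], request.request_id)
+            kv["block_ids"] = request.block_tables  # retarget to the fresh blocks
+            ok = mgr.load_cpu_copy(kv, request)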
+        """
+        import time  # `time` is not imported at module level, so import it locally, as in get_cpu_copy
+
+        if kv_cache_cpu is None:
+            offload_logger.error("[DEBUG: load_cpu_copy] kv_cache_cpu is None")
+            return False
+
+        if not request.block_tables:
+            offload_logger.error(f"[DEBUG: load_cpu_copy] {request.request_id}: no block_tables allocated")
+            return False
+
+        start_time = time.time()
+        try:
+            key_caches_cpu = kv_cache_cpu.get("key_caches")
+            value_caches_cpu = kv_cache_cpu.get("value_caches")
+            key_scales_cpu = kv_cache_cpu.get("key_scales")
+            value_scales_cpu = kv_cache_cpu.get("value_scales")
+
+            if not key_caches_cpu:
+                offload_logger.error("[DEBUG: load_cpu_copy] key_caches is empty")
+                return False
+
+            device = f"gpu:{self._device_id}"
+            new_block_ids = list(request.block_tables)
+
+            offload_logger.info(
+                f"[DEBUG: load_cpu_copy] Loading KV cache from CPU for request {request.request_id}, "
+                f"num_layers={len(key_caches_cpu)}, num_blocks={len(new_block_ids)}"
+            )
+
+            for layer_id, key_cache_cpu in enumerate(key_caches_cpu):
+                # Get the key cache on the GPU
+                key_cache_gpu = self._get_gpu_cache_tensor(layer_id, is_key=True)
+                if key_cache_gpu is None:
+                    offload_logger.error(f"[DEBUG: load_cpu_copy] Failed to get GPU key cache for layer {layer_id}")
+                    return False
+
+                # Copy to the GPU
+                key_cache_gpu_data = key_cache_cpu.to(device)
+
+                # Scatter into the newly allocated block IDs
+                for idx, block_id in enumerate(new_block_ids):
+                    if block_id < key_cache_gpu.shape[0] and idx < key_cache_gpu_data.shape[0]:
+                        key_cache_gpu[block_id] = key_cache_gpu_data[idx]
+                    else:
+                        offload_logger.error(
+                            f"[DEBUG: load_cpu_copy] Block ID out of range: block_id={block_id}, "
+                            f"gpu_shape={key_cache_gpu.shape}, idx={idx}"
+                        )
+                        return False
+
+                # Restore the value cache (if present)
+                if value_caches_cpu and layer_id < len(value_caches_cpu):
+                    value_cache_cpu = value_caches_cpu[layer_id]
+                    value_cache_gpu = self._get_gpu_cache_tensor(layer_id, is_key=False)
+                    if value_cache_gpu is not None:
+                        value_cache_gpu_data = value_cache_cpu.to(device)
+                        for idx, block_id in enumerate(new_block_ids):
+                            if block_id < value_cache_gpu.shape[0] and idx < value_cache_gpu_data.shape[0]:
+                                value_cache_gpu[block_id] = value_cache_gpu_data[idx]
+                            else:
+                                offload_logger.error(
+                                    f"[DEBUG: load_cpu_copy] Block ID out of range for value: block_id={block_id}"
+                                )
+                                return False
+
+                # Restore the scales (for quantized caches)
+                if key_scales_cpu and layer_id < len(key_scales_cpu):
+                    key_scale_cpu = key_scales_cpu[layer_id]
+                    key_scale_gpu = self._get_gpu_cache_scale_tensor(layer_id, is_key=True)
+                    if key_scale_gpu is not None:
+                        key_scale_gpu_data = key_scale_cpu.to(device)
+                        for idx, block_id in enumerate(new_block_ids):
+                            if block_id < key_scale_gpu.shape[0] and idx < key_scale_gpu_data.shape[0]:
+                                key_scale_gpu[block_id] = key_scale_gpu_data[idx]
+
+                if value_scales_cpu and layer_id < len(value_scales_cpu):
+                    value_scale_cpu = value_scales_cpu[layer_id]
+                    value_scale_gpu = self._get_gpu_cache_scale_tensor(layer_id, is_key=False)
+                    if value_scale_gpu is not None:
+                        value_scale_gpu_data = value_scale_cpu.to(device)
+                        for idx, block_id in enumerate(new_block_ids):
+                            if block_id < value_scale_gpu.shape[0] and idx < value_scale_gpu_data.shape[0]:
+                                value_scale_gpu[block_id] = value_scale_gpu_data[idx]
+
+            elapsed_time = time.time() - start_time
+            offload_logger.info(
+                f"[DEBUG: load_cpu_copy] Successfully loaded KV cache to GPU for request {request.request_id}, "
+                f"elapsed_time={elapsed_time:.4f}s"
+            )
+            return True
+
+        except Exception as e:
+            elapsed_time = time.time() - start_time
+            offload_logger.error(
+                f"[DEBUG: load_cpu_copy] Failed to load KV cache to GPU: {e}, elapsed_time={elapsed_time:.4f}s"
+            )
+            return False
+
+    # ==================== SSD storage offload APIs ====================
+
+    def save_to_storage(self, kv_cache_cpu) -> Optional[str]:
+        """
+        Save a CPU-resident KV cache to SSD storage.
+
+        Args:
+            kv_cache_cpu: the KV cache info on CPU
+
+        Returns:
+            str: the storage file path, or None on failure
+        """
+        try:
+            if kv_cache_cpu is None:
+                return None
+
+            storage_file = f"kv_cache_{uuid.uuid4().hex}.pdparams"
+            storage_path = os.path.join(self.storage_path, storage_file)
+
+            # Persist the KV cache dict to a file
+            paddle.save(kv_cache_cpu, storage_path)
+
+            offload_logger.info(f"[DEBUG: offload_save_to_storage] Saved KV cache to {storage_path}")
+            return storage_path
+
+        except Exception as e:
+            offload_logger.error(f"[DEBUG: offload_save_to_storage] save_to_storage failed: {e}")
+            return None
+
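+    # save_to_storage/load_from_storage form a plain paddle.save/paddle.load
+    # round trip. A minimal sketch of the L3 path, with ``mgr`` a hypothetical
+    # OffloadManager and ``kv`` a dict returned by get_cpu_copy:
+    #
+    #     path = mgr.save_to_storage(kv)      # spill the CPU snapshot to SSD
+    #     kv2 = mgr.load_from_storage(path)   # ...later, read it back
+    #     assert kv2["num_blocks"] == kv["num_blocks"]
+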
+    def load_from_storage(self, storage_path: str) -> Optional[dict]:
+        """
+        Load a KV cache from SSD storage into CPU memory.
+
+        Args:
+            storage_path: the storage file path
+
+        Returns:
+            dict: the KV cache info on CPU, or None on failure
+        """
+        try:
+            if not os.path.exists(storage_path):
+                offload_logger.error(f"[DEBUG: off_load_save_to_storage] Storage file not found: {storage_path}")
+                return None
+
+            kv_cache_cpu = paddle.load(storage_path)
+            offload_logger.info(f"[DEBUG: off_load_save_to_storage] Loaded KV cache from {storage_path}")
+            return kv_cache_cpu
+
+        except Exception as e:
+            offload_logger.error(f"[DEBUG: off_load_save_to_storage] load_from_storage failed: {e}")
+            return None
+
+    # ==================== Resume接口 ====================
+
+    def resume_decode(self, request: Request) -> Tuple[bool, Optional[int]]:
+        """
+        Resume an offloaded request back onto the GPU.
+
+        Steps:
+        1. Check that the GPU has enough free memory
+        2. Pick the restore path by storage level (L2/L3)
+        3. Verify the integrity of the cached data
+        4. Check that the request is in the decode phase (token_num > prefill_token_num)
+        5. Allocate GPU blocks and load the data
+        6. Restore the request state
+
+        Returns:
+            Tuple[bool, Optional[int]]:
+            - bool: whether the resume succeeded
+            - int: the number of tokens in the cache (returned even when the
+              resume fails, so the caller can recompute)
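+
+        Example: the call pattern used by the v1 resource manager later in this
+        patch (``offload_manager`` and ``running`` as in that scheduler code):
+
+            resume_success, _ = offload_manager.resume_decode(request)
+            if resume_success:
+                running.append(request)  # schedule it as a decode task again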
+        """
+        import time  # `time` is not imported at module level, so import it locally
+
+        start_time = time.time()
+        if not self.enable_offload:
+            return False, None
+
+        # Guard reads of _offloaded_requests with the lock
+        with self._lock:
+            if request.request_id not in self._offloaded_requests:
+                offload_logger.warning(f"[DEBUG: resume_decode] Request {request.request_id} is not offloaded")
+                return False, None
+
+            offloaded_info = self._offloaded_requests.get(request.request_id)
+            if offloaded_info is None:
+                return False, None
+
+            # Copy what we need so the lock is not held for long
+            storage_level = offloaded_info["storage_level"]
+            num_blocks_needed = offloaded_info["num_blocks_needed"]
+            saved_num_tokens = offloaded_info["num_tokens"]
+            saved_num_computed_tokens = offloaded_info["num_computed_tokens"]
+            saved_need_prefill_tokens = offloaded_info["need_prefill_tokens"]
+            storage_path = offloaded_info.get("storage_path")
+            # For the CPU level, keep a reference to kv_cache_cpu (used by the integrity check)
+            kv_cache_cpu_ref = offloaded_info.get("kv_cache_cpu") if storage_level == self.STORAGE_LEVEL_CPU else None
+            cache_valid_flag = offloaded_info.get("cache_valid", True)
+            # Copy output_token_ids and need_prefill_tokens for restoring the request state
+            output_token_ids = list(offloaded_info.get("output_token_ids", []))
+            need_prefill_tokens = offloaded_info.get("need_prefill_tokens")
+
+        # Check that the request is in the decode phase
+        if saved_num_computed_tokens <= saved_need_prefill_tokens:
+            offload_logger.warning(
+                f"[DEBUG: resume_decode] Request {request.request_id} is not in decode phase "
+                f"(num_computed_tokens={saved_num_computed_tokens}, "
+                f"need_prefill_tokens={saved_need_prefill_tokens}), "
+                f"should recompute instead of resume"
+            )
+            # Return the token count so the caller can decide whether to recompute
+            return False, saved_num_computed_tokens
+
+        if self.cache_manager is None:
+            return False, saved_num_computed_tokens
+
+        if not self.cache_manager.can_allocate_gpu_blocks(num_blocks_needed):
+            offload_logger.warning(
+                f"[DEBUG: resume_decode] Insufficient GPU memory for request {request.request_id}, "
+                f"need {num_blocks_needed} blocks"
+            )
+            return False, saved_num_computed_tokens
+
+        # If the cache was already marked invalid earlier, fail fast
+        if not cache_valid_flag:
+            offload_logger.warning(
+                f"[DEBUG: resume_decode] Cache for request {request.request_id} is marked as invalid"
+            )
+            return False, saved_num_computed_tokens
+
+        try:
+            kv_cache_cpu = None
+            cache_valid = False
+
+            # Restore according to the storage level
+            if storage_level == self.STORAGE_LEVEL_CPU:
+                kv_cache_cpu = kv_cache_cpu_ref
+                if kv_cache_cpu is None:
+                    offload_logger.error(f"[DEBUG: resume_decode] No CPU cache found for {request.request_id}")
+                else:
+                    # Build a temporary offloaded_info for verification
+                    temp_offloaded_info = {
+                        "num_blocks_needed": num_blocks_needed,
+                        "num_tokens": saved_num_tokens,
+                    }
+                    cache_valid = self._verify_cache_integrity(kv_cache_cpu, temp_offloaded_info)
+
+            elif storage_level == self.STORAGE_LEVEL_SSD:
+                if not storage_path or not os.path.exists(storage_path):
+                    offload_logger.error(f"[DEBUG: resume_decode] No SSD storage path for {request.request_id}")
+                else:
+                    kv_cache_cpu = self.load_from_storage(storage_path)
+                    if kv_cache_cpu is not None:
+                        # Build a temporary offloaded_info for verification
+                        temp_offloaded_info = {
+                            "num_blocks_needed": num_blocks_needed,
+                            "num_tokens": saved_num_tokens,
+                        }
+                        cache_valid = self._verify_cache_integrity(kv_cache_cpu, temp_offloaded_info)
+                    else:
+                        offload_logger.error(f"[DEBUG: resume_decode] Failed to load from storage: {storage_path}")
+
+            # Verify cache integrity
+            if not cache_valid:
+                offload_logger.error(
+                    f"[DEBUG: resume_decode] Cache integrity check failed for {request.request_id}, "
+                    f"saved_tokens={saved_num_tokens}, cache may be corrupted"
+                )
+                # Clean up the invalid cache resources
+                if kv_cache_cpu is not None and isinstance(kv_cache_cpu, dict):
+                    cpu_block_ids = kv_cache_cpu.get("cpu_block_ids", [])
+                    if cpu_block_ids and self.cache_manager:
+                        self.cache_manager.recycle_cpu_blocks(cpu_block_ids)
+
+                # Mark the entry invalid so later attempts do not reuse the bad CPU blocks
+                with self._lock:
+                    if request.request_id in self._offloaded_requests:
+                        offloaded_info = self._offloaded_requests[request.request_id]
+                        offloaded_info["kv_cache_cpu"] = None
+                        offloaded_info["cache_valid"] = False
+                        # Keep offloaded_info itself; the remaining metadata may still be used later
+
+                # Return the token count so the caller can recompute
+                return False, saved_num_computed_tokens
+
+            # Allocate GPU blocks
+            new_block_ids = self.cache_manager.allocate_gpu_blocks(num_blocks_needed, request.request_id)
+            request.block_tables = new_block_ids
+
+            # Retarget block_ids in kv_cache_cpu to the new allocation
+            if kv_cache_cpu is not None:
+                kv_cache_cpu["block_ids"] = new_block_ids
+
+            # Load the cache onto the GPU
+            if not self.load_cpu_copy(kv_cache_cpu, request):
+                offload_logger.error(f"[DEBUG: resume_decode] Failed to load CPU copy to GPU for {request.request_id}")
+                # Release the blocks we just allocated
+                self.cache_manager.recycle_gpu_blocks(new_block_ids, request.request_id)
+                request.block_tables = []
+                return False, saved_num_computed_tokens
+
+            # For the SSD level, free the temporary CPU copy
+            if storage_level == self.STORAGE_LEVEL_SSD:
+                del kv_cache_cpu
+
+            # Restore the request state
+            request.output_token_ids = output_token_ids
+            request.num_computed_tokens = saved_num_computed_tokens
+            request.need_prefill_tokens = need_prefill_tokens
+            request.status = RequestStatus.RUNNING
+            request.is_offloaded = False
+
+            # On a successful resume, clear the abort flag so no bogus RequestOutput is produced later.
+            # Note: the Request class has no `outputs` attribute; `outputs` belongs to RequestOutput.
+            # We only mark the request as successfully resumed so later handling does not emit an aborted RequestOutput.
+            offload_logger.info(
+                f"[DEBUG: resume_decode] Request {request.request_id} resumed successfully, "
+                f"output_tokens={len(output_token_ids)}, idx={request.idx}"
+            )
+
+            # Clean up the offloaded bookkeeping
+            with self._lock:
+                self._offloaded_requests.pop(request.request_id, None)
+
+            # Remove the SSD storage file
+            if storage_level == self.STORAGE_LEVEL_SSD and storage_path:
+                try:
+                    os.remove(storage_path)
+                except Exception as e:
+                    offload_logger.warning(f"[DEBUG: resume_decode] Failed to delete storage file: {e}")
+
+            elapsed_time = time.time() - start_time
+            offload_logger.info(
+                f"[DEBUG: resume_decode] Resumed request {request.request_id} from {storage_level}, "
+                f"output_tokens={len(request.output_token_ids)}, elapsed_time={elapsed_time:.4f}s"
+            )
+
+            # Opportunistically prefetch other SSD snapshots into CPU memory
+            self.prefetch_ssd_to_cpu()
+
+            return True, saved_num_computed_tokens
+
+        except Exception as e:
+            elapsed_time = time.time() - start_time
+            offload_logger.error(
+                f"[DEBUG: resume_decode] Failed to resume request {request.request_id}: {e}, elapsed_time={elapsed_time:.4f}s"
+            )
+            # Keep the offloaded state on failure so a later attempt can retry
+            return False, saved_num_computed_tokens
+
+    def _verify_cache_integrity(self, kv_cache_cpu: dict, offloaded_info: dict) -> bool:
+        """
+        Verify the integrity of the cached data.
+
+        Args:
+            kv_cache_cpu: the KV cache info on CPU
+            offloaded_info: the saved offload bookkeeping
+
+        Returns:
+            bool: whether the cache is valid
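+
+        The checks are shape-level only (no content checksums): block_ids and
+        num_blocks present and mutually consistent, the block count equal to
+        num_blocks_needed, key_caches non-empty, and num_tokens within
+        num_blocks * block_size.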
offload_logger.warning("[DEBUG: offload] Cache integrity check: missing block_ids or num_blocks") + return False + + # 检查block数量是否匹配 + expected_num_blocks = offloaded_info.get("num_blocks_needed", 0) + if len(cpu_block_ids) != num_blocks or num_blocks != expected_num_blocks: + offload_logger.warning( + f"[DEBUG: offload] Cache integrity check: block count mismatch, " + f"expected={expected_num_blocks}, actual={num_blocks}" + ) + return False + + # 检查key_caches是否存在 + key_caches = kv_cache_cpu.get("key_caches") + if not key_caches or len(key_caches) == 0: + offload_logger.warning("[DEBUG: offload] Cache integrity check: missing key_caches") + return False + + # 检查token数量是否一致 + saved_num_tokens = offloaded_info.get("num_tokens", 0) + block_size = self.cache_manager.cache_config.block_size if self.cache_manager else 64 + actual_max_tokens = num_blocks * block_size + if saved_num_tokens > actual_max_tokens: + offload_logger.warning( + f"[DEBUG: offload] Cache integrity check: token count exceeds capacity, " + f"saved_tokens={saved_num_tokens}, max_capacity={actual_max_tokens}" + ) + return False + + offload_logger.debug(f"Cache integrity check passed: num_blocks={num_blocks}, num_tokens={saved_num_tokens}") + return True + + # ==================== 辅助接口 ==================== + + def cleanup_offloaded_request(self, request_id: str) -> None: + """清理被offload请求的缓存(请求完成时调用)""" + with self._lock: + if request_id not in self._offloaded_requests: + return + + offloaded_info = self._offloaded_requests[request_id] + + # 清理CPU内存中的KV cache tensors + kv_cache_cpu = offloaded_info.get("kv_cache_cpu") + if kv_cache_cpu is not None: + # 显式删除CPU tensors释放内存 + for key in ["key_caches", "value_caches", "key_scales", "value_scales"]: + cache_list = kv_cache_cpu.get(key) + if cache_list: + for tensor in cache_list: + del tensor + kv_cache_cpu[key] = None + del offloaded_info["kv_cache_cpu"] + + # 清理SSD存储文件 + storage_path = offloaded_info.get("storage_path") + if storage_path and os.path.exists(storage_path): + try: + os.remove(storage_path) + offload_logger.info(f"[DEBUG: offload] Deleted storage file: {storage_path}") + except Exception as e: + offload_logger.warning(f"[DEBUG: offload] Failed to delete storage file: {e}") + + self._offloaded_requests.pop(request_id) + offload_logger.info(f"[DEBUG: offload] Cleaned up offloaded request: {request_id}") + + def get_offloaded_request_count(self) -> int: + """获取当前offloaded的请求数量""" + with self._lock: + return len(self._offloaded_requests) + + def get_offloaded_request_ids(self) -> List[str]: + """获取当前所有offloaded的请求ID列表""" + with self._lock: + return list(self._offloaded_requests.keys()) + + def prefetch_ssd_to_cpu(self) -> int: + """ + 后台预取:将 SSD 上的 KV Cache 预取到 CPU 内存 + + 当 CPU 内存有空闲时调用,减少 resume 时的延迟 + + Returns: + int: 成功预取的请求数量 + """ + if not self.enable_offload or self.cache_manager is None: + return 0 + + prefetched_count = 0 + + with self._lock: + # 找出存储在 SSD 上且还没有 CPU copy 的请求 + ssd_requests = [ + (req_id, info) + for req_id, info in self._offloaded_requests.items() + if info.get("storage_level") == self.STORAGE_LEVEL_SSD and info.get("kv_cache_cpu") is None + ] + + for req_id, info in ssd_requests: + storage_path = info.get("storage_path") + if not storage_path or not os.path.exists(storage_path): + continue + + # 检查 CPU 内存是否充足 + num_blocks = info.get("num_blocks_needed", 0) + if num_blocks > len(self.cache_manager.cpu_free_block_list): + break # CPU 内存不足,停止预取 + + try: + # 从 SSD 加载到 CPU + kv_cache_cpu = self.load_from_storage(storage_path) + if kv_cache_cpu is 
None: + continue + + # 更新 offloaded 信息 + with self._lock: + if req_id in self._offloaded_requests: + self._offloaded_requests[req_id]["kv_cache_cpu"] = kv_cache_cpu + self._offloaded_requests[req_id]["storage_level"] = self.STORAGE_LEVEL_CPU + + prefetched_count += 1 + offload_logger.info( + f"[DEBUG: offload_prefetch_ssd_to_cpu] Prefetched request {req_id} from SSD to CPU" + ) + + except Exception as e: + offload_logger.warning(f"[DEBUG: offload_prefetch_ssd_to_cpu] Failed to prefetch {req_id}: {e}") + + return prefetched_count diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index 1e2a53ed205..d17ddf6c7c8 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -185,6 +185,7 @@ def __init__( self.status = RequestStatus.WAITING self.task_type = RequestType.PREFILL self.has_been_preempted_before = False + self.is_offloaded = False self.idx = None self.need_prefill_tokens = self.prompt_token_ids_len self.audio_output_token_ids = [] diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 9205f1c05c5..cfb4440a815 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -49,7 +49,12 @@ from fastdeploy.spec_decode import SpecMethod from fastdeploy.trace.constants import LoggingEventName from fastdeploy.trace.trace_logger import print as trace_print -from fastdeploy.utils import download_from_bos, init_bos_client, llm_logger +from fastdeploy.utils import ( + download_from_bos, + init_bos_client, + llm_logger, + offload_logger, +) @dataclass @@ -220,6 +225,13 @@ def __init__(self, max_num_seqs, config, tensor_parallel_size, splitwise_role, l # Scheduler-side requests that have not been moved into resource manager waiting queue yet. 
self.scheduler_unhandled_request_num = 0 + # OffloadManager for decode instances + self.offload_manager = None + if config.scheduler_config.splitwise_role == "decode" and getattr(config, "enable_decode_offload", False): + from fastdeploy.engine.offload_manager import OffloadManager + + self.offload_manager = OffloadManager(config, self.cache_manager, None) + def allocated_slots(self, request: Request): return len(request.block_tables) * self.config.cache_config.block_size @@ -322,6 +334,27 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re if preempted_req.use_extend_tables: self.running.insert(0, preempted_req) continue + + # Try offload for decode instance requests in decode phase + is_decode_phase = ( + preempted_req.num_computed_tokens >= preempted_req.need_prefill_tokens + if preempted_req.need_prefill_tokens is not None + else False + ) + offloaded = False + if ( + self.config.scheduler_config.splitwise_role == "decode" + and is_decode_phase + and self.offload_manager is not None + and self.offload_manager.can_offload(preempted_req) + ): + if self.offload_manager.offload_req(preempted_req): + offloaded = True + offload_logger.info( + f"Request {preempted_req.request_id} offloaded before preempt, " + f"tokens={preempted_req.num_computed_tokens}" + ) + preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 if self.config.scheduler_config.splitwise_role == "decode": @@ -331,10 +364,12 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re del self.requests[preempted_req.request_id] if preempted_req.request_id in self.req_dict: del self.req_dict[preempted_req.request_id] - self._free_blocks(preempted_req) + if not offloaded: + self._free_blocks(preempted_req) llm_logger.info(f"Preemption is triggered! 
Preempted request id: {preempted_req.request_id}") else: - self._free_blocks(preempted_req) + if not offloaded: + self._free_blocks(preempted_req) preempted_req.num_cached_blocks = 0 self.to_be_rescheduled_request_id_set.add(preempted_req.request_id) trace_print( @@ -956,6 +991,23 @@ def _allocate_decode_and_extend(): self._free_blocks(request) break elif request.status == RequestStatus.PREEMPTED: + # Try to resume offloaded request first + if request.is_offloaded and self.offload_manager is not None: + resume_success, _ = self.offload_manager.resume_decode(request) + if resume_success: + offload_logger.info(f"Resumed offloaded request {request.request_id}") + self.waiting.popleft() + self.running.append(request) + scheduled_reqs.append(self._prepare_decode_task(request)) + continue + else: + offload_logger.debug( + f"Failed to resume offloaded request {request.request_id}, will retry" + ) + skip_requests.append(request) + self.waiting.popleft() + continue + request.need_prefill_tokens = ( request.num_total_tokens ) # Before preempted task rescheduled, preempted task has been sent to engine, no more tokens are output, here num_total_tokens should be static and correct diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 3ab5061e78e..9fb3c292a2e 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -169,6 +169,12 @@ def _validate_split_kv_size(value: int) -> int: "FD_FILL_BITMASK_BATCH": lambda: int(os.getenv("FD_FILL_BITMASK_BATCH", "4")), "FD_ENABLE_PDL": lambda: int(os.getenv("FD_ENABLE_PDL", "1")), "FD_ENABLE_ASYNC_LLM": lambda: int(os.getenv("FD_ENABLE_ASYNC_LLM", "0")), + # Enable decode KV cache offload for preempted requests + "FD_ENABLE_DECODE_OFFLOAD": lambda: bool(int(os.getenv("FD_ENABLE_DECODE_OFFLOAD", "0"))), + # CPU memory limit in bytes for offload + "FD_CPU_MEMORY_LIMIT": lambda: int(os.getenv("FD_CPU_MEMORY_LIMIT", str(50 * 1024 * 1024 * 1024))), + "FD_CPU_OFFLOAD_CHUNK_SIZE": lambda: int(os.getenv("FD_CPU_OFFLOAD_CHUNK_SIZE", "8192")), + "FD_OFFLOAD_STORAGE_PATH": lambda: os.getenv("FD_OFFLOAD_STORAGE_PATH", "/tmp/fastdeploy_offload"), "FD_GUIDANCE_DISABLE_ADDITIONAL": lambda: bool(int(os.getenv("FD_GUIDANCE_DISABLE_ADDITIONAL", "1"))), "FD_LLGUIDANCE_LOG_LEVEL": lambda: int(os.getenv("FD_LLGUIDANCE_LOG_LEVEL", "0")), # "Number of tokens in the group for Mixture of Experts (MoE) computation processing on HPU" diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py index f09082364f4..06a79251e82 100644 --- a/fastdeploy/utils.py +++ b/fastdeploy/utils.py @@ -1163,6 +1163,7 @@ def _bos_download(bos_client, link): router_logger = get_logger("router", "router.log") fmq_logger = get_logger("fmq", "fmq.log") obj_logger = get_logger("obj", "obj.log") # debug内存问题 +offload_logger = get_logger("offload", "offload_manager.log") # debug offload def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]: diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 940e37a9421..edd58631870 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -868,6 +868,11 @@ def parse_args(): action="store_true", help="enable chunked moe", ) + parser.add_argument( + "--enable_decode_offload", + action="store_true", + help="enable decode KV cache offload for preempted requests", + ) parser.add_argument( "--chunked_moe_size", type=int, @@ -1223,6 +1228,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: scheduler_config=scheduler_config, ips=args.ips, 
plas_attention_config=plas_attention_config, + enable_decode_offload=args.enable_decode_offload, structured_outputs_config=structured_outputs_config, eplb_config=eplb_config, routing_replay_config=routing_replay_config, From 8462090dd1e6b44a6ccd2d7405c1d24af706b839 Mon Sep 17 00:00:00 2001 From: CyanScholar Date: Tue, 24 Mar 2026 16:43:51 +0800 Subject: [PATCH 2/6] use cache_transfer_manager --- fastdeploy/cache_manager/cache_data.py | 3 + fastdeploy/cache_manager/cache_tasks.py | 17 ++ .../cache_manager/cache_transfer_manager.py | 135 ++++++++- .../cache_manager/prefix_cache_manager.py | 27 +- fastdeploy/engine/common_engine.py | 13 +- fastdeploy/engine/offload_manager.py | 268 ++++++++---------- .../engine/sched/resource_manager_v1.py | 14 +- fastdeploy/output/token_processor.py | 3 + 8 files changed, 309 insertions(+), 171 deletions(-) diff --git a/fastdeploy/cache_manager/cache_data.py b/fastdeploy/cache_manager/cache_data.py index 84e7d804c32..ea273d49446 100644 --- a/fastdeploy/cache_manager/cache_data.py +++ b/fastdeploy/cache_manager/cache_data.py @@ -32,6 +32,9 @@ class CacheStatus(Enum): CPU = 3 GPU2STORAGE = 4 STORAGE2GPU = 5 + DECODE_OFFLOAD = 6 + DECODE_RESUME = 7 + DECODE_CLEANUP = 8 class BlockNode: diff --git a/fastdeploy/cache_manager/cache_tasks.py b/fastdeploy/cache_manager/cache_tasks.py index fe15263827a..a34294e28a0 100644 --- a/fastdeploy/cache_manager/cache_tasks.py +++ b/fastdeploy/cache_manager/cache_tasks.py @@ -35,3 +35,20 @@ class ReadStorageTask(CacheTask): @dataclass(frozen=True, kw_only=True) class WriteStorageTask(CacheTask): timeout: float = 30.0 + + +@dataclass(frozen=True, kw_only=True) +class DecodeOffloadTask: + task_id: str + gpu_block_ids: List[int] + + +@dataclass(frozen=True, kw_only=True) +class DecodeResumeTask: + task_id: str + gpu_block_ids: List[int] + + +@dataclass(frozen=True, kw_only=True) +class DecodeCleanupTask: + task_id: str diff --git a/fastdeploy/cache_manager/cache_transfer_manager.py b/fastdeploy/cache_manager/cache_transfer_manager.py index 74386f909af..a5434f00c5e 100644 --- a/fastdeploy/cache_manager/cache_transfer_manager.py +++ b/fastdeploy/cache_manager/cache_transfer_manager.py @@ -31,7 +31,13 @@ from fastdeploy import envs from fastdeploy.cache_manager.cache_data import CacheStatus -from fastdeploy.cache_manager.cache_tasks import ReadStorageTask, WriteStorageTask +from fastdeploy.cache_manager.cache_tasks import ( + DecodeCleanupTask, + DecodeOffloadTask, + DecodeResumeTask, + ReadStorageTask, + WriteStorageTask, +) from fastdeploy.cache_manager.ops import ( cuda_host_alloc, cuda_host_free, @@ -199,9 +205,13 @@ def __init__(self, args): self.swap_to_gpu_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) self.read_storage_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) self.write_back_storage_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) + self.decode_offload_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) + self.decode_resume_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) + self.decode_cleanup_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) self.timeout_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=2) self.transfer_task_queue = queue.Queue() # 用来接收传输任务 self.tansfer_done_queue = queue.Queue() # 用来告知任务执行完毕 + self.decode_offload_snapshots = {} address = (args.pod_ip, args.cache_queue_port) self.cache_task_queue = EngineCacheQueue( @@ -1035,6 +1045,108 @@ def check_work_status(self, 
time_interval_threashold=envs.FD_CACHE_PROC_EXIT_TIM return True, "" + def _snapshot_blocks_to_cpu(self, gpu_cache_tensors, block_ids: List[int]): + snapshots = [] + for cache_tensor in gpu_cache_tensors: + blocks = [cache_tensor[block_id] for block_id in block_ids] + layer_tensor = paddle.stack(blocks) if len(blocks) > 1 else blocks[0].unsqueeze(0) + snapshots.append(layer_tensor.to("cpu")) + return snapshots + + def _restore_blocks_from_cpu(self, cpu_tensors, gpu_cache_tensors, block_ids: List[int]): + device = f"gpu:{self.device}" + for layer_id, cpu_tensor in enumerate(cpu_tensors): + gpu_tensor = gpu_cache_tensors[layer_id] + gpu_data = cpu_tensor.to(device) + for idx, block_id in enumerate(block_ids): + gpu_tensor[block_id] = gpu_data[idx] + + def decode_offload_task(self, task: DecodeOffloadTask): + ok = False + meta = { + "rank": self.rank, + "num_blocks": len(task.gpu_block_ids), + "storage_level": "L2", + } + try: + if not task.gpu_block_ids: + raise ValueError(f"decode offload task {task.task_id} has empty gpu_block_ids") + snapshot = { + "key_caches": self._snapshot_blocks_to_cpu(self.gpu_cache_k_tensors, task.gpu_block_ids), + "value_caches": ( + self._snapshot_blocks_to_cpu(self.gpu_cache_v_tensors, task.gpu_block_ids) + if self.gpu_cache_v_tensors + else [] + ), + "key_scales": ( + self._snapshot_blocks_to_cpu(self.gpu_cache_scales_k_tensors, task.gpu_block_ids) + if self.gpu_cache_scales_k_tensors + else [] + ), + "value_scales": ( + self._snapshot_blocks_to_cpu(self.gpu_cache_scales_v_tensors, task.gpu_block_ids) + if self.gpu_cache_scales_v_tensors + else [] + ), + } + self.decode_offload_snapshots[task.task_id] = snapshot + ok = True + except Exception as e: + meta["error"] = str(e) + logger.error( + f"decode_offload_task failed for {task.task_id}, error: {e}, traceback:\n{traceback.format_exc()}" + ) + finally: + result = (CacheStatus.DECODE_OFFLOAD, task.task_id, self.rank, ok, meta) + self.cache_task_queue.put_transfer_done_signal(result) + + def decode_resume_task(self, task: DecodeResumeTask): + ok = False + meta = { + "rank": self.rank, + "num_blocks": len(task.gpu_block_ids), + } + try: + if task.task_id not in self.decode_offload_snapshots: + raise KeyError(f"snapshot for {task.task_id} not found") + snapshot = self.decode_offload_snapshots[task.task_id] + self._restore_blocks_from_cpu(snapshot["key_caches"], self.gpu_cache_k_tensors, task.gpu_block_ids) + if self.gpu_cache_v_tensors and snapshot["value_caches"]: + self._restore_blocks_from_cpu(snapshot["value_caches"], self.gpu_cache_v_tensors, task.gpu_block_ids) + if self.gpu_cache_scales_k_tensors and snapshot["key_scales"]: + self._restore_blocks_from_cpu( + snapshot["key_scales"], self.gpu_cache_scales_k_tensors, task.gpu_block_ids + ) + if self.gpu_cache_scales_v_tensors and snapshot["value_scales"]: + self._restore_blocks_from_cpu( + snapshot["value_scales"], self.gpu_cache_scales_v_tensors, task.gpu_block_ids + ) + del self.decode_offload_snapshots[task.task_id] + ok = True + except Exception as e: + meta["error"] = str(e) + logger.error( + f"decode_resume_task failed for {task.task_id}, error: {e}, traceback:\n{traceback.format_exc()}" + ) + finally: + result = (CacheStatus.DECODE_RESUME, task.task_id, self.rank, ok, meta) + self.cache_task_queue.put_transfer_done_signal(result) + + def decode_cleanup_task(self, task: DecodeCleanupTask): + ok = False + meta = {"rank": self.rank} + try: + self.decode_offload_snapshots.pop(task.task_id, None) + ok = True + except Exception as e: + meta["error"] = str(e) + 
logger.error( + f"decode_cleanup_task failed for {task.task_id}, error: {e}, traceback:\n{traceback.format_exc()}" + ) + finally: + result = (CacheStatus.DECODE_CLEANUP, task.task_id, self.rank, ok, meta) + self.cache_task_queue.put_transfer_done_signal(result) + def submit_task(self, thread_pool: concurrent.futures.ThreadPoolExecutor, task_fn, *args): def inflight_task(fn, *args): @@ -1129,6 +1241,27 @@ def do_data_transfer(self): self.write_back_storage_task, write_storage_task, ) + elif event_type.value == CacheStatus.DECODE_OFFLOAD.value: + decode_offload_task = event_args[0] + self.submit_task( + self.decode_offload_thread_pool, + self.decode_offload_task, + decode_offload_task, + ) + elif event_type.value == CacheStatus.DECODE_RESUME.value: + decode_resume_task = event_args[0] + self.submit_task( + self.decode_resume_thread_pool, + self.decode_resume_task, + decode_resume_task, + ) + elif event_type.value == CacheStatus.DECODE_CLEANUP.value: + decode_cleanup_task = event_args[0] + self.submit_task( + self.decode_cleanup_thread_pool, + self.decode_cleanup_task, + decode_cleanup_task, + ) else: if self.n_ranks > 1: self.cache_task_queue.barrier2.wait() diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index b022f61c26e..0fcabc7f204 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -97,6 +97,7 @@ def __init__( self.task_write_back_event = {} self.task_prefetch_event = {} self.storage_prefetch_block_ids = {} + self.transfer_result_handlers = [] # gpu cache data structure self.gpu_lru_leaf_heap = [] @@ -293,7 +294,11 @@ def launch_cache_manager( else: storage_arg_str = " " - if self.cache_config.swap_space or self.cache_config.kvcache_storage_backend: + if ( + self.cache_config.swap_space + or self.cache_config.kvcache_storage_backend + or getattr(self.config, "enable_decode_offload", False) + ): for i in range(tensor_parallel_size): launch_cmd = ( "FLAGS_allocator_strategy=auto_growth " @@ -352,7 +357,11 @@ def launch_cache_manager( ) # Start additional threads - if cache_config.kvcache_storage_backend or self.num_cpu_blocks > 0: + if ( + cache_config.kvcache_storage_backend + or self.num_cpu_blocks > 0 + or getattr(self.config, "enable_decode_offload", False) + ): logger.info("Enable hierarchical cache.") threading.Thread(target=self.recv_data_transfer_result, daemon=True).start() if cache_config.enable_prefix_caching: @@ -1191,6 +1200,10 @@ def wait_prefetch_storage_task(self, req_id): del self.storage_prefetch_block_ids[req_id] return storage_block_ids + def register_transfer_result_handler(self, handler): + if handler not in self.transfer_result_handlers: + self.transfer_result_handlers.append(handler) + def free_nodes_directly(self, node): with self.request_release_lock: try: @@ -2058,6 +2071,16 @@ def recv_data_transfer_result(self): time.sleep(0.001) continue event_type = data[0] + handled = False + for handler in self.transfer_result_handlers: + try: + if handler(data): + handled = True + break + except Exception as e: + logger.warning(f"transfer result handler failed: {e}") + if handled: + continue if event_type.value == CacheStatus.STORAGE2GPU.value: logger.info(f"recv_data_transfer_result: {data}") diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 5a060a1cade..8948a330df0 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -1028,17 +1028,8 @@ def _fetch_request(): if 
self.cfg.scheduler_config.splitwise_role == "decode": for task in tasks: if task.task_type == RequestType.PREEMPTED: - msg = f"{task.request_id} decode not enough blocks, need to be rescheduled." - self.llm_logger.error(msg) - self.scheduler.put_results( - [ - RequestOutput( - request_id=task.request_id, - finished=True, - error_code=500, - error_msg=msg, - ) - ] + self.llm_logger.info( + f"{task.request_id} decode request is preempted and waiting for reschedule." ) self.resource_manager.get_real_bsz() for task in tasks: diff --git a/fastdeploy/engine/offload_manager.py b/fastdeploy/engine/offload_manager.py index 418c115c49f..865b72e4123 100644 --- a/fastdeploy/engine/offload_manager.py +++ b/fastdeploy/engine/offload_manager.py @@ -22,14 +22,17 @@ import paddle from fastdeploy import envs +from fastdeploy.cache_manager.cache_data import CacheStatus +from fastdeploy.cache_manager.cache_tasks import ( + DecodeCleanupTask, + DecodeOffloadTask, + DecodeResumeTask, +) from fastdeploy.engine.request import Request, RequestStatus from fastdeploy.utils import offload_logger -# 导入 share_external_data 用于从共享内存获取 KV cache -try: - from fastdeploy.cache_manager.ops import share_external_data_ -except ImportError: - share_external_data_ = None +# Legacy direct-tensor helpers are kept only for SSD/offline fallback stubs. +share_external_data_ = None class OffloadManager: @@ -75,13 +78,16 @@ def __init__(self, config=None, cache_manager=None, model_runner=None): # 保存offloaded请求的相关cache信息 self._offloaded_requests: Dict[str, dict] = {} self._lock = threading.Lock() + self.max_resume_retry = 3 + self._transfer_events: Dict[Tuple[int, str], threading.Event] = {} + self._transfer_results: Dict[Tuple[int, str], list] = {} # 缓存配置信息(延迟初始化) self._cache_config = None self._key_cache_shape = None self._value_cache_shape = None self._num_layers = None - self._tensor_parallel_size = None + self._tensor_parallel_size = getattr(getattr(config, "parallel_config", None), "tensor_parallel_size", 1) self._local_rank = 0 self._device_id = 0 self._cache_dtype = None @@ -94,6 +100,55 @@ def __init__(self, config=None, cache_manager=None, model_runner=None): f"[DEBUG: offload] OffloadManager initialized: enable_offload={self.enable_offload}, " f"min_steps={self.min_steps}, storage_path={self.storage_path}" ) + if self.cache_manager is not None and hasattr(self.cache_manager, "register_transfer_result_handler"): + self.cache_manager.register_transfer_result_handler(self._handle_transfer_result) + + def _transfer_key(self, event_type, task_id: str) -> Tuple[int, str]: + return (event_type.value, task_id) + + def _handle_transfer_result(self, data) -> bool: + event_type = data[0] + if event_type.value not in ( + CacheStatus.DECODE_OFFLOAD.value, + CacheStatus.DECODE_RESUME.value, + CacheStatus.DECODE_CLEANUP.value, + ): + return False + task_id, rank, ok, meta = data[1:] + key = self._transfer_key(event_type, task_id) + with self._lock: + if key not in self._transfer_results: + self._transfer_results[key] = [] + self._transfer_results[key].append( + { + "rank": rank, + "ok": ok, + "meta": meta, + } + ) + if len(self._transfer_results[key]) >= self._tensor_parallel_size: + event = self._transfer_events.get(key) + if event is not None: + event.set() + return True + + def _issue_transfer_task(self, event_type, task): + if self.cache_manager is None or not hasattr(self.cache_manager, "cache_task_queue"): + return None + key = self._transfer_key(event_type, task.task_id) + event = threading.Event() + with self._lock: + 
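# The rendezvous implemented by _handle_transfer_result/_issue_transfer_task,
# reduced to a standalone sketch: the issuing thread blocks on an Event while one
# result per tensor-parallel rank arrives, the last arrival releases it, and the
# aggregate is ok only if every rank reported ok. Illustrative class, not part of
# the patch.
import threading

class RankRendezvous:
    def __init__(self, n_ranks: int):
        self.n_ranks = n_ranks
        self._lock = threading.Lock()
        self._results: list = []
        self._done = threading.Event()

    def post(self, rank: int, ok: bool, meta=None) -> None:
        # called once per rank when its transfer worker finishes
        with self._lock:
            self._results.append({"rank": rank, "ok": ok, "meta": meta})
            if len(self._results) >= self.n_ranks:
                self._done.set()

    def wait(self) -> bool:
        # called by the issuing thread; mirrors the all(item["ok"] ...) reduction above
        self._done.wait()
        return bool(self._results) and all(r["ok"] for r in self._results)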
self._transfer_events[key] = event + self._transfer_results.pop(key, None) + self.cache_manager.cache_task_queue.put_transfer_task((event_type, task)) + event.wait() + with self._lock: + results = self._transfer_results.pop(key, []) + self._transfer_events.pop(key, None) + return { + "ok": bool(results) and all(item["ok"] for item in results), + "results": results, + } def _init_cache_info(self): """初始化cache配置信息(延迟初始化)""" @@ -359,16 +414,7 @@ def can_resume(self, request: Request) -> bool: if offloaded_info is None: return False - # 检查是否存在有效的KV Cache副本 - storage_level = offloaded_info.get("storage_level") - if storage_level == self.STORAGE_LEVEL_CPU: - if offloaded_info.get("kv_cache_cpu") is None: - return False - elif storage_level == self.STORAGE_LEVEL_SSD: - storage_path = offloaded_info.get("storage_path") - if not storage_path or not os.path.exists(storage_path): - return False - else: + if offloaded_info.get("snapshot_handle") is None: return False # 检查GPU内存是否充足 @@ -481,42 +527,16 @@ def offload_req(self, request: Request) -> bool: # 初始化cache信息 self._init_cache_info() - - # 尝试L2 offload (CPU) storage_level = self.STORAGE_LEVEL_CPU - kv_cache_cpu = None - - try: - kv_cache_cpu = self.get_cpu_copy(request) - if kv_cache_cpu is None: - # CPU offload失败,尝试SSD offload - # 注意: SSD offload同样需要先获取数据,这里直接返回失败 - # 如果未来需要SSD offload,需要实现直接GPU->SSD的传输 - offload_logger.error( - f"[DEBUG: offload_req] CPU offload failed for {request.request_id}, " f"no available fallback" - ) - return False - except Exception as e: - offload_logger.error(f"[DEBUG: offload_req] CPU offload failed: {e}") + if self.cache_manager is None: + return False + snapshot_task = DecodeOffloadTask(task_id=request.request_id, gpu_block_ids=list(request.block_tables)) + snapshot_result = self._issue_transfer_task(CacheStatus.DECODE_OFFLOAD, snapshot_task) + if snapshot_result is None or not snapshot_result.get("ok", False): + offload_logger.error( + f"[DEBUG: offload_req] Failed to snapshot request {request.request_id}, result={snapshot_result}" + ) return False - - # 如果需要L3,保存到SSD (当前kv_cache_cpu已包含数据) - storage_path = None - if storage_level == self.STORAGE_LEVEL_SSD: - try: - storage_path = self.save_to_storage(kv_cache_cpu) - if storage_path is None: - offload_logger.error(f"[DEBUG: offload_req] SSD offload failed for {request.request_id}") - # 清理已分配的CPU blocks - if kv_cache_cpu and self.cache_manager: - self.cache_manager.recycle_cpu_blocks(kv_cache_cpu.get("cpu_block_ids", [])) - return False - if kv_cache_cpu is not None: - del kv_cache_cpu - kv_cache_cpu = None - except Exception as e: - offload_logger.error(f"[DEBUG: offload_req] SSD offload failed: {e}") - return False # 保存offload信息 - 在释放GPU blocks之前保存 with self._lock: @@ -533,8 +553,6 @@ def offload_req(self, request: Request) -> bool: ) self._offloaded_requests[request.request_id] = { - "kv_cache_cpu": kv_cache_cpu, - "storage_path": storage_path, "storage_level": storage_level, "num_tokens": request.num_total_tokens, "num_blocks_needed": len(original_block_tables), @@ -545,6 +563,8 @@ def offload_req(self, request: Request) -> bool: "prompt_token_ids_len": request.prompt_token_ids_len, "sampling_params": request.sampling_params, "block_tables": original_block_tables, + "snapshot_handle": request.request_id, + "resume_retry_count": 0, } # 释放GPU blocks @@ -911,7 +931,7 @@ def load_from_storage(self, storage_path: str) -> Optional[dict]: # ==================== Resume接口 ==================== - def resume_decode(self, request: Request) -> Tuple[bool, Optional[int]]: + def 
resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: """ 恢复被offload的请求到GPU @@ -930,28 +950,25 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int]]: """ start_time = time.time() if not self.enable_offload: - return False, None + return False, None, False # 使用锁保护offloaded_requests的读取 with self._lock: if request.request_id not in self._offloaded_requests: offload_logger.warning(f"[DEBUG: resume_decode] Request {request.request_id} is not offloaded") - return False, None + return False, None, True offloaded_info = self._offloaded_requests.get(request.request_id) if offloaded_info is None: - return False, None + return False, None, True # 复制需要的信息,避免长时间持有锁 storage_level = offloaded_info["storage_level"] num_blocks_needed = offloaded_info["num_blocks_needed"] - saved_num_tokens = offloaded_info["num_tokens"] saved_num_computed_tokens = offloaded_info["num_computed_tokens"] saved_need_prefill_tokens = offloaded_info["need_prefill_tokens"] - storage_path = offloaded_info.get("storage_path") - # 对于CPU层级,需要复制kv_cache_cpu引用(用于完整性检查) - kv_cache_cpu_ref = offloaded_info.get("kv_cache_cpu") if storage_level == self.STORAGE_LEVEL_CPU else None - cache_valid_flag = offloaded_info.get("cache_valid", True) + snapshot_handle = offloaded_info.get("snapshot_handle") + resume_retry_count = offloaded_info.get("resume_retry_count", 0) # 复制output_token_ids和need_prefill_tokens用于恢复 output_token_ids = list(offloaded_info.get("output_token_ids", [])) need_prefill_tokens = offloaded_info.get("need_prefill_tokens") @@ -964,100 +981,43 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int]]: f"need_prefill_tokens={saved_need_prefill_tokens}), " f"should recompute instead of resume" ) - # 返回token数,让调用者决定是否重新计算 - return False, saved_num_computed_tokens + return False, saved_num_computed_tokens, True if self.cache_manager is None: - return False, saved_num_computed_tokens + return False, saved_num_computed_tokens, True if not self.cache_manager.can_allocate_gpu_blocks(num_blocks_needed): offload_logger.warning( f"[DEBUG: resume_decode] Insufficient GPU memory for request {request.request_id}, " f"need {num_blocks_needed} blocks" ) - return False, saved_num_computed_tokens - - # 检查cache_valid_flag,如果之前已经标记为无效,直接返回失败 - if not cache_valid_flag: - offload_logger.warning( - f"[DEBUG: resume_decode] Cache for request {request.request_id} is marked as invalid" - ) - return False, saved_num_computed_tokens + should_recompute = resume_retry_count + 1 >= self.max_resume_retry + with self._lock: + if request.request_id in self._offloaded_requests: + self._offloaded_requests[request.request_id]["resume_retry_count"] = resume_retry_count + 1 + return False, saved_num_computed_tokens, should_recompute try: - kv_cache_cpu = None - cache_valid = False - - # 根据存储层级恢复 - if storage_level == self.STORAGE_LEVEL_CPU: - kv_cache_cpu = kv_cache_cpu_ref - if kv_cache_cpu is None: - offload_logger.error(f"[DEBUG: resume_decode] No CPU cache found for {request.request_id}") - else: - # 构建临时的offloaded_info用于验证 - temp_offloaded_info = { - "num_blocks_needed": num_blocks_needed, - "num_tokens": saved_num_tokens, - } - cache_valid = self._verify_cache_integrity(kv_cache_cpu, temp_offloaded_info) - - elif storage_level == self.STORAGE_LEVEL_SSD: - if not storage_path or not os.path.exists(storage_path): - offload_logger.error(f"[DEBUG: resume_decode] No SSD storage path for {request.request_id}") - else: - kv_cache_cpu = self.load_from_storage(storage_path) - if kv_cache_cpu is not None: - # 
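# Worked example of the retry bookkeeping above, with max_resume_retry == 3: each
# failed resume attempt bumps resume_retry_count, and should_recompute flips to
# True on the attempt that exhausts the budget (attempt numbers are illustrative).
#
#   attempt 1 fails: resume_retry_count 0 -> 1, should_recompute = (0 + 1 >= 3) = False
#   attempt 2 fails: resume_retry_count 1 -> 2, should_recompute = (1 + 1 >= 3) = False
#   attempt 3 fails: resume_retry_count 2 -> 3, should_recompute = (2 + 1 >= 3) = True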
构建临时的offloaded_info用于验证 - temp_offloaded_info = { - "num_blocks_needed": num_blocks_needed, - "num_tokens": saved_num_tokens, - } - cache_valid = self._verify_cache_integrity(kv_cache_cpu, temp_offloaded_info) - else: - offload_logger.error(f"[DEBUG: resume_decode] Failed to load from storage: {storage_path}") - - # 验证cache完整性 - if not cache_valid: - offload_logger.error( - f"[DEBUG: resume_decode] Cache integrity check failed for {request.request_id}, " - f"saved_tokens={saved_num_tokens}, cache may be corrupted" - ) - # 清理无效的cache资源 - if kv_cache_cpu is not None and isinstance(kv_cache_cpu, dict): - cpu_block_ids = kv_cache_cpu.get("cpu_block_ids", []) - if cpu_block_ids and self.cache_manager: - self.cache_manager.recycle_cpu_blocks(cpu_block_ids) - - # 更新offloaded_info标记为无效,避免后续再次尝试使用无效的CPU blocks - with self._lock: - if request.request_id in self._offloaded_requests: - offloaded_info = self._offloaded_requests[request.request_id] - offloaded_info["kv_cache_cpu"] = None - offloaded_info["cache_valid"] = False - # 不删除offloaded_info,保留其他元数据供后续使用 - - # 返回token数,让调用者可以重新计算 - return False, saved_num_computed_tokens + if snapshot_handle is None: + return False, saved_num_computed_tokens, True # 分配GPU blocks new_block_ids = self.cache_manager.allocate_gpu_blocks(num_blocks_needed, request.request_id) request.block_tables = new_block_ids - - # 更新kv_cache_cpu中的block_ids为新的分配 - if kv_cache_cpu is not None: - kv_cache_cpu["block_ids"] = new_block_ids - - # 加载cache到GPU - if not self.load_cpu_copy(kv_cache_cpu, request): - offload_logger.error(f"[DEBUG: resume_decode] Failed to load CPU copy to GPU for {request.request_id}") - # 释放已分配的blocks + resume_task = DecodeResumeTask(task_id=snapshot_handle, gpu_block_ids=new_block_ids) + restore_result = self._issue_transfer_task(CacheStatus.DECODE_RESUME, resume_task) + if restore_result is None or not restore_result.get("ok", False): + offload_logger.error( + f"[DEBUG: resume_decode] Failed to restore CPU snapshot for {request.request_id}, " + f"result={restore_result}" + ) self.cache_manager.recycle_gpu_blocks(new_block_ids, request.request_id) request.block_tables = [] - return False, saved_num_computed_tokens - - # 对于SSD层级,清理临时内存 - if storage_level == self.STORAGE_LEVEL_SSD: - del kv_cache_cpu + should_recompute = resume_retry_count + 1 >= self.max_resume_retry + with self._lock: + if request.request_id in self._offloaded_requests: + self._offloaded_requests[request.request_id]["resume_retry_count"] = resume_retry_count + 1 + return False, saved_num_computed_tokens, should_recompute # 恢复请求状态 request.output_token_ids = output_token_ids @@ -1094,15 +1054,20 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int]]: # 尝试预取其他 SSD 数据到 CPU self.prefetch_ssd_to_cpu() - return True, saved_num_computed_tokens + return True, saved_num_computed_tokens, False except Exception as e: elapsed_time = time.time() - start_time + should_recompute = False offload_logger.error( f"[DEBUG: resume_decode] Failed to resume request {request.request_id}: {e}, elapsed_time={elapsed_time:.4f}s" ) - # 失败时保持offload状态,下次可以重试 - return False, saved_num_computed_tokens + with self._lock: + if request.request_id in self._offloaded_requests: + retries = self._offloaded_requests[request.request_id].get("resume_retry_count", 0) + 1 + self._offloaded_requests[request.request_id]["resume_retry_count"] = retries + should_recompute = retries >= self.max_resume_retry + return False, saved_num_computed_tokens, should_recompute def _verify_cache_integrity(self, kv_cache_cpu: dict, 
offloaded_info: dict) -> bool: """ @@ -1164,18 +1129,7 @@ def cleanup_offloaded_request(self, request_id: str) -> None: return offloaded_info = self._offloaded_requests[request_id] - - # 清理CPU内存中的KV cache tensors - kv_cache_cpu = offloaded_info.get("kv_cache_cpu") - if kv_cache_cpu is not None: - # 显式删除CPU tensors释放内存 - for key in ["key_caches", "value_caches", "key_scales", "value_scales"]: - cache_list = kv_cache_cpu.get(key) - if cache_list: - for tensor in cache_list: - del tensor - kv_cache_cpu[key] = None - del offloaded_info["kv_cache_cpu"] + snapshot_handle = offloaded_info.get("snapshot_handle") # 清理SSD存储文件 storage_path = offloaded_info.get("storage_path") @@ -1187,6 +1141,12 @@ def cleanup_offloaded_request(self, request_id: str) -> None: offload_logger.warning(f"[DEBUG: offload] Failed to delete storage file: {e}") self._offloaded_requests.pop(request_id) + if self.cache_manager is not None and snapshot_handle is not None: + try: + self._issue_transfer_task(CacheStatus.DECODE_CLEANUP, DecodeCleanupTask(task_id=snapshot_handle)) + except Exception as e: + offload_logger.warning(f"[DEBUG: offload] Failed to cleanup snapshot {snapshot_handle}: {e}") + with self._lock: offload_logger.info(f"[DEBUG: offload] Cleaned up offloaded request: {request_id}") def get_offloaded_request_count(self) -> int: diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index cfb4440a815..28fb9bda3aa 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -360,12 +360,11 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re if self.config.scheduler_config.splitwise_role == "decode": self.tasks_list[preempted_req.idx] = None self.stop_flags[preempted_req.idx] = True - if preempted_req.request_id in self.requests: - del self.requests[preempted_req.request_id] if preempted_req.request_id in self.req_dict: del self.req_dict[preempted_req.request_id] if not offloaded: self._free_blocks(preempted_req) + self.to_be_rescheduled_request_id_set.add(preempted_req.request_id) llm_logger.info(f"Preemption is triggered! 
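# How a scheduler consumes the (ok, saved_tokens, should_recompute) triple that
# resume_decode now returns; this distills the resource_manager_v1 hunk just below.
# `rm` and `schedule_decode` are hypothetical stand-ins for the surrounding manager.
def handle_preempted_offloaded(rm, request, schedule_decode) -> None:
    ok, saved_tokens, should_recompute = rm.offload_manager.resume_decode(request)
    if ok:
        schedule_decode(request)  # back to running with its KV cache restored
    elif should_recompute:
        # snapshot unusable or retry budget exhausted: drop it and redo the prefill
        request.is_offloaded = False
        rm.offload_manager.cleanup_offloaded_request(request.request_id)
    # else: transient failure (e.g. no free GPU blocks) -- stay offloaded, retry later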
Preempted request id: {preempted_req.request_id}") else: if not offloaded: @@ -993,13 +992,20 @@ def _allocate_decode_and_extend(): elif request.status == RequestStatus.PREEMPTED: # Try to resume offloaded request first if request.is_offloaded and self.offload_manager is not None: - resume_success, _ = self.offload_manager.resume_decode(request) + resume_success, _, should_recompute = self.offload_manager.resume_decode(request) if resume_success: offload_logger.info(f"Resumed offloaded request {request.request_id}") self.waiting.popleft() self.running.append(request) scheduled_reqs.append(self._prepare_decode_task(request)) continue + if should_recompute: + offload_logger.info( + f"Resume retry limit reached or snapshot invalid for {request.request_id}, " + "fallback to recompute" + ) + request.is_offloaded = False + self.offload_manager.cleanup_offloaded_request(request.request_id) else: offload_logger.debug( f"Failed to resume offloaded request {request.request_id}, will retry" @@ -1464,6 +1470,8 @@ def finish_requests(self, request_ids: Union[str, Iterable[str]]): if request is None: llm_logger.error(f"invalid request id: {req_id} self.requests: {self.requests}") continue + if self.offload_manager is not None: + self.offload_manager.cleanup_offloaded_request(req_id) if request in self.waiting: llm_logger.error(f"request {request.request_id} scheduled into waiting list, after finished") continue diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 662b1cfc8a3..a5d57ff79b1 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -201,6 +201,8 @@ def _reschedule_preempt_task_use_zmq(self, datas): batch_id_set.add(data.batch_id) llm_logger.debug(f"_reschedule_preempt_task_use_zmq batch_id_set {batch_id_set}") for request_id in need_to_be_reschedule_req_ids: + if request_id not in self.resource_manager.requests: + continue if ( self.resource_manager.requests[request_id].idx not in batch_id_set ): # No more token generated for preempted request @@ -821,6 +823,7 @@ def _process_batch_output(self): if ( task_id in self.resource_manager.to_be_rescheduled_request_id_set and token_id == PREEMPTED_TOKEN_ID + and task_id in self.resource_manager.requests ): llm_logger.info(f"sync preemption for request_id {task_id} done.") self.resource_manager.reschedule_preempt_task(task_id) From d4686744a60195a0d6a2ea015956e1f76e842c6e Mon Sep 17 00:00:00 2001 From: CyanScholar Date: Tue, 24 Mar 2026 17:00:13 +0800 Subject: [PATCH 3/6] use cache_transfer_manager --- fastdeploy/engine/offload_manager.py | 966 +----------------- .../engine/sched/resource_manager_v1.py | 3 + 2 files changed, 49 insertions(+), 920 deletions(-) diff --git a/fastdeploy/engine/offload_manager.py b/fastdeploy/engine/offload_manager.py index 865b72e4123..f4f96fa642b 100644 --- a/fastdeploy/engine/offload_manager.py +++ b/fastdeploy/engine/offload_manager.py @@ -14,91 +14,48 @@ # limitations under the License. 
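# Patch 3 moves the actual KV movement into the cache_transfer_manager workers, so
# the engine-side snapshot round trip reduces to issuing a task and acting on the
# aggregated per-rank result. A sketch using names from this patch (error handling
# elided; the wrapper function is illustrative):
from fastdeploy.cache_manager.cache_data import CacheStatus
from fastdeploy.cache_manager.cache_tasks import DecodeOffloadTask

def snapshot_then_release(offload_manager, request) -> bool:
    task = DecodeOffloadTask(task_id=request.request_id, gpu_block_ids=list(request.block_tables))
    result = offload_manager._issue_transfer_task(CacheStatus.DECODE_OFFLOAD, task)
    if result is None or not result.get("ok", False):
        return False  # nothing was released; the request keeps running on GPU
    offload_manager.release_gpu_blocks(request)  # safe: every rank snapshotted its shard
    return True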
""" -import os import threading -import uuid from typing import Dict, List, Optional, Tuple -import paddle - from fastdeploy import envs from fastdeploy.cache_manager.cache_data import CacheStatus -from fastdeploy.cache_manager.cache_tasks import ( - DecodeCleanupTask, - DecodeOffloadTask, - DecodeResumeTask, -) +from fastdeploy.cache_manager.cache_tasks import DecodeCleanupTask, DecodeOffloadTask, DecodeResumeTask from fastdeploy.engine.request import Request, RequestStatus from fastdeploy.utils import offload_logger -# Legacy direct-tensor helpers are kept only for SSD/offline fallback stubs. -share_external_data_ = None - class OffloadManager: """ - KV Cache Offload管理器 + Decode request KV cache offload orchestrator. - 职责: - 1. 管理被offload请求的KV Cache - 2. 提供Decode阶段批量offload和resume接口 - 3. 维护offloaded请求队列 - 4. 支持多级卸载策略 (L1 GPU -> L2 CPU -> L3 SSD) + Real KV cache snapshot/restore is executed inside cache_transfer_manager. + This class only manages request-level state, retry policy and task/result + synchronization. """ - # 存储层级常量 STORAGE_LEVEL_CPU = "L2" STORAGE_LEVEL_SSD = "L3" def __init__(self, config=None, cache_manager=None, model_runner=None): - """ - 初始化OffloadManager - - Args: - config: FastDeploy配置对象 - cache_manager: PrefixCacheManager实例 - model_runner: ModelRunner实例 (用于访问 KV cache tensors) - """ self.config = config self.cache_manager = cache_manager - self.model_runner = model_runner # 用于访问 KV cache + self.model_runner = model_runner - # offload开关 self.enable_offload = getattr(config, "enable_decode_offload", False) if config else False - - # TODO:offload策略参数,需要兼容性处理 self.min_steps = 20 - # cpu block大小为8KB self.cpu_offloading_chunk_size = getattr(envs, "FD_CPU_OFFLOAD_CHUNK_SIZE", 8192) - # 默认cpu memory限制为50GB self.cpu_memory_limit = getattr(envs, "FD_CPU_MEMORY_LIMIT", 50 * 1024 * 1024 * 1024) - # ssd存储路径 - self.storage_path = getattr(envs, "FD_OFFLOAD_STORAGE_PATH", "/tmp/fd_offload") + self.max_resume_retry = 3 - # 保存offloaded请求的相关cache信息 self._offloaded_requests: Dict[str, dict] = {} self._lock = threading.Lock() - self.max_resume_retry = 3 self._transfer_events: Dict[Tuple[int, str], threading.Event] = {} self._transfer_results: Dict[Tuple[int, str], list] = {} - - # 缓存配置信息(延迟初始化) - self._cache_config = None - self._key_cache_shape = None - self._value_cache_shape = None - self._num_layers = None self._tensor_parallel_size = getattr(getattr(config, "parallel_config", None), "tensor_parallel_size", 1) - self._local_rank = 0 - self._device_id = 0 - self._cache_dtype = None - - # 确保存储目录存在 - if self.enable_offload and not os.path.exists(self.storage_path): - os.makedirs(self.storage_path, exist_ok=True) offload_logger.info( f"[DEBUG: offload] OffloadManager initialized: enable_offload={self.enable_offload}, " - f"min_steps={self.min_steps}, storage_path={self.storage_path}" + f"min_steps={self.min_steps}" ) if self.cache_manager is not None and hasattr(self.cache_manager, "register_transfer_result_handler"): self.cache_manager.register_transfer_result_handler(self._handle_transfer_result) @@ -114,6 +71,7 @@ def _handle_transfer_result(self, data) -> bool: CacheStatus.DECODE_CLEANUP.value, ): return False + task_id, rank, ok, meta = data[1:] key = self._transfer_key(event_type, task_id) with self._lock: @@ -135,6 +93,7 @@ def _handle_transfer_result(self, data) -> bool: def _issue_transfer_task(self, event_type, task): if self.cache_manager is None or not hasattr(self.cache_manager, "cache_task_queue"): return None + key = self._transfer_key(event_type, task.task_id) event = 
threading.Event() with self._lock: @@ -150,236 +109,18 @@ def _issue_transfer_task(self, event_type, task): "results": results, } - def _init_cache_info(self): - """初始化cache配置信息(延迟初始化)""" - if self._cache_config is not None: - return - - if self.cache_manager is None: - return - - self._cache_config = self.cache_manager.cache_config - self._num_layers = self.config.model_config.num_hidden_layers - self._tensor_parallel_size = getattr(self.config.parallel_config, "tensor_parallel_size", 1) - self._local_rank = getattr(self.config.parallel_config, "tensor_parallel_rank", 0) - - # 从 device_ids 获取实际的设备号(考虑 CUDA_VISIBLE_DEVICES) - device_ids_str = getattr(self.config.parallel_config, "device_ids", "0") - device_ids = device_ids_str.split(",") - local_dp_id = getattr(self.config.parallel_config, "local_data_parallel_id", 0) - # device_id 是当前 rank 对应的实际 GPU 设备号 - rank_in_node = self._local_rank % len(device_ids) - self._device_id = int(device_ids[rank_in_node]) if rank_in_node < len(device_ids) else 0 - - self._cache_dtype = self._cache_config.cache_dtype - - offload_logger.info( - f"[DEBUG: offload] _init_cache_info: local_rank={self._local_rank}, " - f"device_ids={device_ids}, rank_in_node={rank_in_node}, device_id={self._device_id}" - ) - - # 计算KV cache shape - 使用实际的GPU block数量 - # 从cache_config获取总的GPU block数 - total_gpu_blocks = getattr(self._cache_config, "total_block_num", None) - if total_gpu_blocks is None: - # 尝试从cache_manager获取 - total_gpu_blocks = getattr(self.cache_manager, "num_gpu_blocks", None) - if total_gpu_blocks is None: - # 最后尝试从gpu_free_block_list推断 - total_gpu_blocks = len(getattr(self.cache_manager, "gpu_free_block_list", [])) - - if not total_gpu_blocks: - offload_logger.error("[DEBUG: offload] Failed to get total_gpu_blocks, using default 100") - total_gpu_blocks = 100 # 默认值,防止错误 - - key_shape, val_shape = self._compute_kv_cache_shape(total_gpu_blocks) - self._key_cache_shape = key_shape - self._value_cache_shape = val_shape - - offload_logger.info( - f"[DEBUG: offload] Cache info initialized: num_layers={self._num_layers}, " - f"total_gpu_blocks={total_gpu_blocks}, key_shape={self._key_cache_shape}, " - f"value_shape={self._value_cache_shape}" - ) - - def _get_cache_tensor_name(self, layer_id: int, is_key: bool) -> str: - """获取共享内存中cache tensor的名称""" - prefix = "key_caches" if is_key else "value_caches" - return f"{prefix}_{layer_id}_rank{self._local_rank}.device{self._device_id}" - - def _get_cache_scale_tensor_name(self, layer_id: int, is_key: bool) -> str: - """获取共享内存中cache scale tensor的名称(用于量化)""" - prefix = "key_cache_scales" if is_key else "value_cache_scales" - return f"{prefix}_{layer_id}_rank{self._local_rank}.device{self._device_id}" - - def _get_gpu_cache_tensor(self, layer_id: int, is_key: bool) -> Optional[paddle.Tensor]: - """ - 从共享内存获取GPU cache tensor的引用 - - Args: - layer_id: 层ID - is_key: 是否为key cache - - Returns: - paddle.Tensor: GPU cache tensor引用,失败返回None - """ - if share_external_data_ is None: - offload_logger.error("[DEBUG: offload] share_external_data_ is not available") - return None - - try: - tensor_name = self._get_cache_tensor_name(layer_id, is_key) - cache_shape = self._key_cache_shape if is_key else self._value_cache_shape - - if cache_shape is None or len(cache_shape) == 0: - offload_logger.error(f"[DEBUG: offload] cache_shape is None or empty for layer {layer_id}") - return None - - # 创建空tensor并通过share_external_data_绑定到共享内存 - dtype = self._cache_dtype if self._cache_dtype else "bfloat16" - if dtype == "block_wise_fp8": - dtype = 
paddle.float8_e4m3fn - elif dtype == "int4_zp": - dtype = paddle.int8 - elif dtype == "bfloat16": - dtype = paddle.bfloat16 - else: - dtype = paddle.float16 - - empty_tensor = paddle.empty(shape=[], dtype=dtype) - try: - cache_tensor = share_external_data_(empty_tensor, tensor_name, cache_shape, True) - except Exception as e: - offload_logger.error(f"[DEBUG: offload] share_external_data_ FAILED for {tensor_name}: {e}") - raise - - return cache_tensor - - except Exception as e: - offload_logger.error(f"[DEBUG: offload] Failed to get GPU cache tensor for layer {layer_id}: {e}") - return None - - def _get_gpu_cache_scale_tensor(self, layer_id: int, is_key: bool) -> Optional[paddle.Tensor]: - """ - 从共享内存获取GPU cache scale tensor的引用(用于量化) - """ - if share_external_data_ is None: - return None - - try: - tensor_name = self._get_cache_scale_tensor_name(layer_id, is_key) - # scale shape: [num_blocks, num_heads, block_size] - scale_shape = [ - self._key_cache_shape[0], - self._key_cache_shape[1], - self._key_cache_shape[2], - ] - - empty_tensor = paddle.empty(shape=[], dtype=paddle.float32) - scale_tensor = share_external_data_(empty_tensor, tensor_name, scale_shape, True) - - return scale_tensor - - except Exception as e: - offload_logger.error(f"[DEBUG: offload] Failed to get GPU scale tensor for layer {layer_id}: {e}") - return None - - def _compute_kv_cache_shape(self, max_block_num): - """ - 计算 KV Cache 的 shape - - Args: - max_block_num: 最大 block 数量 - - Returns: - tuple: (key_cache_shape, val_cache_shape) - """ - try: - from fastdeploy.model_executor.layers.attention import get_attention_backend - - config = self.cache_manager.config - cache_config = self.cache_manager.cache_config - - attn_cls = get_attention_backend() - tp_size = getattr(config.parallel_config, "tensor_parallel_size", 1) - num_heads = config.model_config.num_attention_heads // tp_size - kv_num_heads = max( - 1, - int(config.model_config.num_key_value_heads) // tp_size, - ) - head_dim = config.model_config.head_dim - - kv_cache_quant_type = None - if ( - config.quant_config - and hasattr(config.quant_config, "kv_cache_quant_type") - and config.quant_config.kv_cache_quant_type is not None - ): - kv_cache_quant_type = config.quant_config.kv_cache_quant_type - - encoder_block_shape_q = 64 - decoder_block_shape_q = 16 - key_cache_shape, value_cache_shape = attn_cls( - config, - kv_num_heads=kv_num_heads, - num_heads=num_heads, - head_dim=head_dim, - encoder_block_shape_q=encoder_block_shape_q, - decoder_block_shape_q=decoder_block_shape_q, - ).get_kv_cache_shape(max_num_blocks=max_block_num, kv_cache_quant_type=kv_cache_quant_type) - - offload_logger.info( - f"[DEBUG: offload] Computed key_cache_shape: {key_cache_shape}, value_cache_shape: {value_cache_shape}" - ) - return key_cache_shape, value_cache_shape - - except Exception as e: - offload_logger.error(f"[DEBUG: offload] Failed to compute kv_cache_shape: {e}") - return None, None - - # ==================== 判断接口 ==================== - def can_offload(self, request: Request) -> bool: - """ - 检查请求是否可以被offload - - 条件: - 1. offload功能已启用 - 2. 请求未被offload过 - 3. 请求有block_tables可被offload - 4. 请求处于decode阶段(num_computed_tokens >= need_prefill_tokens) - 5. need_prefill_tokens已初始化 - 6. 
CPU内存充足 - - 注意: decode阶段判定和存储空间检查由调用者或offload_req负责 - """ - # DEBUG: can_offload入口检查 - offload_logger.debug( - f"[DEBUG: can_offload] Checking request {request.request_id}, " - f"enable_offload={self.enable_offload}, is_offloaded={request.is_offloaded}" - ) - if not self.enable_offload: - offload_logger.debug(f"[DEBUG: can_offload] {request.request_id}: offload disabled") return False - if request.is_offloaded: - offload_logger.debug(f"[DEBUG: can_offload] {request.request_id}: already offloaded") return False - - # 检查是否有可被offload的blocks if not request.block_tables: - offload_logger.debug(f"[DEBUG: can_offload] {request.request_id}: no block_tables") return False - - # 新增:检查need_prefill_tokens是否已初始化 if request.need_prefill_tokens is None: offload_logger.warning( f"[DEBUG: can_offload] {request.request_id}: need_prefill_tokens is None, cannot offload" ) return False - - # 新增:检查请求是否处于decode阶段 if request.num_computed_tokens < request.need_prefill_tokens: offload_logger.warning( f"[DEBUG: can_offload] {request.request_id} is not in decode phase, " @@ -387,70 +128,23 @@ def can_offload(self, request: Request) -> bool: f"need_prefill_tokens={request.need_prefill_tokens}, cannot offload" ) return False - - # 注意:新方法 .to("cpu") 不需要预先分配的 CPU blocks - # 因此移除对 cpu_free_block_list 的检查 - - offload_logger.debug(f"[DEBUG: can_offload] {request.request_id}: can offload = True") return True def can_resume(self, request: Request) -> bool: - """ - 检查请求是否可以被恢复 - - 条件: - 1. offload功能已启用 - 2. 请求存在offloaded信息 - 3. 存在有效的KV Cache副本(cpu_copy或ssd_copy) - 4. GPU内存充足 - """ if not self.enable_offload: return False - if request.request_id not in self._offloaded_requests: return False offloaded_info = self._offloaded_requests.get(request.request_id) - if offloaded_info is None: - return False - - if offloaded_info.get("snapshot_handle") is None: + if offloaded_info is None or offloaded_info.get("snapshot_handle") is None: return False - - # 检查GPU内存是否充足 - num_blocks_needed = offloaded_info.get("num_blocks_needed", 0) if self.cache_manager is None: return False - return self.cache_manager.can_allocate_gpu_blocks(num_blocks_needed) - - def offload_decode( - self, running_requests: List[Request], min_steps: int = 20 - ) -> Tuple[List[Request], List[Request]]: - """ - 批量offload decode请求,直到当前batch能运行min_steps个step为止 - - 注意: 调度策略由外层ResourceManager决定,此函数仅负责: - 1. 检测请求是否处于decode阶段 - 2. 执行offload操作 - 3. 保存KV Cache相关信息 - 4. 
更新请求状态 - - Args: - running_requests: 待offload的请求列表(已按调度策略排序) - min_steps: 最小运行step数(默认20) - - Returns: - Tuple[offloaded_reqs, abort_reqs]: - - offloaded_reqs: 成功offload的请求 - - abort_reqs: offload失败的请求(需要调用者abort处理) - """ - # DEBUG: offload - 入口调试 - offload_logger.info( - f"[DEBUG: offload_decode] offload_decode called, enable_offload={self.enable_offload}, " - f"num_requests={len(running_requests)}" - ) + return self.cache_manager.can_allocate_gpu_blocks(offloaded_info.get("num_blocks_needed", 0)) + def offload_decode(self, running_requests: List[Request], min_steps: int = 20) -> Tuple[List[Request], List[Request]]: if not self.enable_offload: return [], [] @@ -459,35 +153,14 @@ def offload_decode( remaining_count = len(running_requests) for req in running_requests: - # 非decode阶段的请求不应出现在这里,记录warning但不处理 - # 请求状态未修改,调用者可重新调度 - if req.num_computed_tokens < req.need_prefill_tokens: - offload_logger.warning( - f"[DEBUG: offload_decode] Request {req.request_id} is not in decode phase, " - f"num_computed_tokens={req.num_computed_tokens}, " - f"need_prefill_tokens={req.need_prefill_tokens}" - ) - continue - - # DEBUG: offload - can_offload 检查 - can_offload_result = self.can_offload(req) - offload_logger.info( - f"[DEBUG: offload_decode] can_offload({req.request_id})={can_offload_result}, " - f"is_offloaded={req.is_offloaded}, block_tables={len(req.block_tables) if req.block_tables else 0}" - ) - - if not can_offload_result: + if not self.can_offload(req): continue - # 执行offload if self.offload_req(req): offloaded_reqs.append(req) remaining_count -= 1 - offload_logger.info(f"[DEBUG: offload_decode] Successfully offloaded request {req.request_id}") else: - # offload失败,返回给调用者处理(需要abort) abort_reqs.append(req) - offload_logger.warning(f"[DEBUG: offload_decode] Failed to offload request {req.request_id}") if self.cache_manager is not None and remaining_count > 0: block_size = self.cache_manager.cache_config.block_size @@ -495,41 +168,17 @@ def offload_decode( total_blocks_needed = remaining_count * blocks_needed_per_request current_free_blocks = len(getattr(self.cache_manager, "gpu_free_block_list", [])) if current_free_blocks >= total_blocks_needed: - offload_logger.info( - f"[DEBUG: offload_decode] Memory sufficient after offloading " - f"{len(offloaded_reqs)} requests, remaining={remaining_count}, " - f"free_blocks={current_free_blocks}, needed={total_blocks_needed}" - ) break return offloaded_reqs, abort_reqs - # ==================== 单请求多级Offload接口 ==================== - def offload_req(self, request: Request) -> bool: - """ - 指定请求触发多级卸载(L1→L2→L3) - - 执行步骤: - 1. 检查请求是否处于decode阶段 - 2. 尝试L2 offload(CPU内存) - 3. 如果L2内存不足,触发L3 offload(SSD存储) - 4. 释放对应GPU blocks - 5. 
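# Worked example for the free-block early-exit check above. The line deriving
# blocks_needed_per_request sits outside this hunk; assuming it reserves
# min_steps // block_size + 1 blocks of headroom per surviving request, then with
# block_size=64, min_steps=20 and 8 requests still resident:
#   blocks_needed_per_request = 20 // 64 + 1 = 1
#   total_blocks_needed       = 8 * 1        = 8
# so the offload loop can stop as soon as 8 GPU blocks are free.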
更新请求状态 - """ - if not self.enable_offload: + if not self.enable_offload or self.cache_manager is None: return False - - # 检查是否已经被offload if request.is_offloaded: offload_logger.warning(f"[DEBUG: offload_req] Request {request.request_id} already offloaded") return False - # 初始化cache信息 - self._init_cache_info() - storage_level = self.STORAGE_LEVEL_CPU - if self.cache_manager is None: - return False snapshot_task = DecodeOffloadTask(task_id=request.request_id, gpu_block_ids=list(request.block_tables)) snapshot_result = self._issue_transfer_task(CacheStatus.DECODE_OFFLOAD, snapshot_task) if snapshot_result is None or not snapshot_result.get("ok", False): @@ -538,27 +187,18 @@ def offload_req(self, request: Request) -> bool: ) return False - # 保存offload信息 - 在释放GPU blocks之前保存 with self._lock: + need_prefill_tokens = request.need_prefill_tokens + if need_prefill_tokens is None: + need_prefill_tokens = request.prompt_token_ids_len if request.prompt_token_ids_len else 0 original_block_tables = list(request.block_tables) if request.block_tables else [] - - # 新增:确保need_prefill_tokens不为None,提供默认值 - need_prefill_tokens_value = request.need_prefill_tokens - if need_prefill_tokens_value is None: - # 如果need_prefill_tokens未初始化,使用prompt_token_ids_len作为默认值 - need_prefill_tokens_value = request.prompt_token_ids_len if request.prompt_token_ids_len else 0 - offload_logger.warning( - f"[DEBUG: offload_req] Request {request.request_id} need_prefill_tokens is None during offload, " - f"using default value: {need_prefill_tokens_value}" - ) - self._offloaded_requests[request.request_id] = { - "storage_level": storage_level, + "storage_level": self.STORAGE_LEVEL_CPU, "num_tokens": request.num_total_tokens, "num_blocks_needed": len(original_block_tables), "output_token_ids": list(request.output_token_ids), "num_computed_tokens": request.num_computed_tokens, - "need_prefill_tokens": need_prefill_tokens_value, + "need_prefill_tokens": need_prefill_tokens, "prompt_token_ids": list(request.prompt_token_ids) if request.prompt_token_ids else None, "prompt_token_ids_len": request.prompt_token_ids_len, "sampling_params": request.sampling_params, @@ -567,430 +207,68 @@ def offload_req(self, request: Request) -> bool: "resume_retry_count": 0, } - # 释放GPU blocks self.release_gpu_blocks(request) - - # 更新请求状态 request.status = RequestStatus.PREEMPTED request.is_offloaded = True - offload_logger.info( - f"[DEBUG: offload_req] Request {request.request_id} offloaded to {storage_level}, " - f"num_tokens={request.num_total_tokens}, output_tokens={len(request.output_token_ids)}, " + f"[DEBUG: offload_req] Request {request.request_id} offloaded to {self.STORAGE_LEVEL_CPU}, " f"blocks_needed={len(original_block_tables)}" ) - return True def offload_kv_cache(self, request: Request, target_level: str = "L2") -> bool: """ - 调用多级memory offload工具函数,并释放相应GPU blocks - - 用于扩展性调用,可单独对指定请求进行KV Cache offload - - Args: - request: 需要offload的请求 - target_level: 目标存储层级("L2"=CPU, "L3"=SSD) - - Returns: - bool: offload是否成功 + Compatibility shim for future multi-level offload. 
""" - try: - if target_level == self.STORAGE_LEVEL_CPU: - kv_cache_cpu = self.get_cpu_copy(request) - return kv_cache_cpu is not None - elif target_level == self.STORAGE_LEVEL_SSD: - kv_cache_cpu = self.get_cpu_copy(request) - storage_path = self.save_to_storage(kv_cache_cpu) - if kv_cache_cpu is not None: - del kv_cache_cpu - return storage_path is not None - else: - offload_logger.error(f"[DEBUG: offload_kv_cache] Invalid target_level: {target_level}") - return False - except Exception as e: - offload_logger.error(f"[DEBUG: offload_kv_cache] offload_kv_cache failed: {e}") + if target_level == self.STORAGE_LEVEL_CPU: + return self.offload_req(request) + if target_level == self.STORAGE_LEVEL_SSD: + offload_logger.warning("[DEBUG: offload_kv_cache] SSD offload is not implemented in the first version") return False + offload_logger.error(f"[DEBUG: offload_kv_cache] Invalid target_level: {target_level}") + return False def release_gpu_blocks(self, request: Request) -> None: if self.cache_manager is None: return - if request.block_tables: blocks_to_release = list(request.block_tables) - offload_logger.info( - f"[DEBUG: release_gpu_blocks] Releasing {len(blocks_to_release)} blocks for request {request.request_id}" - ) self.cache_manager.recycle_gpu_blocks(blocks_to_release, request.request_id) request.block_tables = [] - # ==================== CPU Memory Offload接口 ==================== - - def get_cpu_copy(self, request: Request) -> Optional[dict]: - """ - 从GPU获取KV Cache的CPU副本 - - 使用 paddle.Tensor.to("cpu") 将KV cache从GPU复制到CPU内存 - - Args: - request: 需要offload的请求 - - Returns: - dict: 包含CPU上KV cache数据的字典,失败返回None - { - "key_caches": List[paddle.Tensor], # CPU上的key cache列表 - "value_caches": List[paddle.Tensor], # CPU上的value cache列表 - "key_scales": List[paddle.Tensor], # 可选,用于量化 - "value_scales": List[paddle.Tensor], # 可选,用于量化 - "block_ids": List[int], # 对应的block IDs - "num_blocks": int, - } - """ - import time - - start_time = time.time() - - if not request.block_tables: - offload_logger.warning(f"[DEBUG: get_cpu_copy] {request.request_id}: no block_tables") - return None - - self._init_cache_info() - - if self._key_cache_shape is None: - offload_logger.error("[DEBUG: get_cpu_copy] key_cache_shape is not initialized") - return None - - try: - block_ids = list(request.block_tables) - num_blocks = len(block_ids) - - key_caches_cpu = [] - value_caches_cpu = [] - key_scales_cpu = [] - value_scales_cpu = [] - - offload_logger.info( - f"[DEBUG: get_cpu_copy] Copying KV cache for request {request.request_id}, " - f"num_layers={self._num_layers}, num_blocks={num_blocks}, block_ids={block_ids}" - ) - - for layer_id in range(self._num_layers): - # 获取GPU上的key cache - key_cache_gpu = self._get_gpu_cache_tensor(layer_id, is_key=True) - if key_cache_gpu is None: - offload_logger.error(f"[DEBUG: get_cpu_copy] Failed to get key cache for layer {layer_id}") - return None - - # 获取需要的blocks数据 - key_cache_blocks = [] - for block_id in block_ids: - if block_id < key_cache_gpu.shape[0]: - key_cache_blocks.append(key_cache_gpu[block_id]) - else: - offload_logger.error( - f"[DEBUG: get_cpu_copy] Block {block_id} out of range for key cache (shape={key_cache_gpu.shape})" - ) - return None - - # 拼接并复制到CPU - key_cache_layer = ( - paddle.stack(key_cache_blocks) if len(key_cache_blocks) > 1 else key_cache_blocks[0].unsqueeze(0) - ) - key_cache_cpu = key_cache_layer.to("cpu") - key_caches_cpu.append(key_cache_cpu) - - # 获取value cache(如果有) - if self._value_cache_shape and len(self._value_cache_shape) > 0: - value_cache_gpu = 
self._get_gpu_cache_tensor(layer_id, is_key=False) - if value_cache_gpu is not None: - value_cache_blocks = [] - for block_id in block_ids: - if block_id < value_cache_gpu.shape[0]: - value_cache_blocks.append(value_cache_gpu[block_id]) - else: - offload_logger.error( - f"[DEBUG: get_cpu_copy] Block {block_id} out of range for value cache" - ) - return None - - value_cache_layer = ( - paddle.stack(value_cache_blocks) - if len(value_cache_blocks) > 1 - else value_cache_blocks[0].unsqueeze(0) - ) - value_cache_cpu = value_cache_layer.to("cpu") - value_caches_cpu.append(value_cache_cpu) - - # 获取scale tensors(用于量化) - if self._cache_dtype == "block_wise_fp8": - key_scale_gpu = self._get_gpu_cache_scale_tensor(layer_id, is_key=True) - if key_scale_gpu is not None: - key_scale_blocks = [key_scale_gpu[block_id] for block_id in block_ids] - key_scale_layer = ( - paddle.stack(key_scale_blocks) - if len(key_scale_blocks) > 1 - else key_scale_blocks[0].unsqueeze(0) - ) - key_scales_cpu.append(key_scale_layer.to("cpu")) - - value_scale_gpu = self._get_gpu_cache_scale_tensor(layer_id, is_key=False) - if value_scale_gpu is not None: - value_scale_blocks = [value_scale_gpu[block_id] for block_id in block_ids] - value_scale_layer = ( - paddle.stack(value_scale_blocks) - if len(value_scale_blocks) > 1 - else value_scale_blocks[0].unsqueeze(0) - ) - value_scales_cpu.append(value_scale_layer.to("cpu")) - - result = { - "key_caches": key_caches_cpu, - "value_caches": value_caches_cpu, - "key_scales": key_scales_cpu if key_scales_cpu else None, - "value_scales": value_scales_cpu if value_scales_cpu else None, - "block_ids": block_ids, - "num_blocks": num_blocks, - } - - elapsed_time = time.time() - start_time - offload_logger.info( - f"[DEBUG: get_cpu_copy] Successfully copied KV cache to CPU for request {request.request_id}, " - f"key_cache_shape={key_caches_cpu[0].shape if key_caches_cpu else None}, " - f"elapsed_time={elapsed_time:.4f}s" - ) - - return result - - except Exception as e: - elapsed_time = time.time() - start_time - offload_logger.error( - f"[DEBUG: get_cpu_copy] Failed to copy KV cache to CPU: {e}, elapsed_time={elapsed_time:.4f}s" - ) - return None - - def load_cpu_copy(self, kv_cache_cpu: dict, request: Request) -> bool: - """ - 将CPU上的KV Cache加载回GPU - - 使用 paddle.Tensor.to(device) 将KV cache从CPU复制回GPU共享内存 - - Args: - kv_cache_cpu: CPU上的KV Cache信息(get_cpu_copy返回的字典) - request: 需要恢复的请求 - - Returns: - bool: 加载是否成功 - """ - if kv_cache_cpu is None: - offload_logger.error("[DEBUG: load_cpu_copy] kv_cache_cpu is None") - return False - - if not request.block_tables: - offload_logger.error(f"[DEBUG: load_cpu_copy] {request.request_id}: no block_tables allocated") - return False - - start_time = time.time() - try: - key_caches_cpu = kv_cache_cpu.get("key_caches") - value_caches_cpu = kv_cache_cpu.get("value_caches") - key_scales_cpu = kv_cache_cpu.get("key_scales") - value_scales_cpu = kv_cache_cpu.get("value_scales") - - if not key_caches_cpu: - offload_logger.error("[DEBUG: load_cpu_copy] key_caches is empty") - return False - - device = f"gpu:{self._device_id}" - new_block_ids = list(request.block_tables) - - offload_logger.info( - f"[DEBUG: load_cpu_copy] Loading KV cache from CPU for request {request.request_id}, " - f"num_layers={len(key_caches_cpu)}, num_blocks={len(new_block_ids)}" - ) - - for layer_id, key_cache_cpu in enumerate(key_caches_cpu): - # 获取GPU上的key cache - key_cache_gpu = self._get_gpu_cache_tensor(layer_id, is_key=True) - if key_cache_gpu is None: - offload_logger.error(f"[DEBUG: 
load_cpu_copy] Failed to get GPU key cache for layer {layer_id}") - return False - - # 复制到GPU - key_cache_gpu_data = key_cache_cpu.to(device) - - # 写入到新的block IDs - for idx, block_id in enumerate(new_block_ids): - if block_id < key_cache_gpu.shape[0] and idx < key_cache_gpu_data.shape[0]: - key_cache_gpu[block_id] = key_cache_gpu_data[idx] - else: - offload_logger.error( - f"[DEBUG: load_cpu_copy] Block ID out of range: block_id={block_id}, " - f"gpu_shape={key_cache_gpu.shape}, idx={idx}" - ) - return False - - # 复制value cache(如果有) - if value_caches_cpu and layer_id < len(value_caches_cpu): - value_cache_cpu = value_caches_cpu[layer_id] - value_cache_gpu = self._get_gpu_cache_tensor(layer_id, is_key=False) - if value_cache_gpu is not None: - value_cache_gpu_data = value_cache_cpu.to(device) - for idx, block_id in enumerate(new_block_ids): - if block_id < value_cache_gpu.shape[0] and idx < value_cache_gpu_data.shape[0]: - value_cache_gpu[block_id] = value_cache_gpu_data[idx] - else: - offload_logger.error( - f"[DEBUG: load_cpu_copy] Block ID out of range for value: block_id={block_id}" - ) - return False - - # 复制scales(用于量化) - if key_scales_cpu and layer_id < len(key_scales_cpu): - key_scale_cpu = key_scales_cpu[layer_id] - key_scale_gpu = self._get_gpu_cache_scale_tensor(layer_id, is_key=True) - if key_scale_gpu is not None: - key_scale_gpu_data = key_scale_cpu.to(device) - for idx, block_id in enumerate(new_block_ids): - if block_id < key_scale_gpu.shape[0] and idx < key_scale_gpu_data.shape[0]: - key_scale_gpu[block_id] = key_scale_gpu_data[idx] - - if value_scales_cpu and layer_id < len(value_scales_cpu): - value_scale_cpu = value_scales_cpu[layer_id] - value_scale_gpu = self._get_gpu_cache_scale_tensor(layer_id, is_key=False) - if value_scale_gpu is not None: - value_scale_gpu_data = value_scale_cpu.to(device) - for idx, block_id in enumerate(new_block_ids): - if block_id < value_scale_gpu.shape[0] and idx < value_scale_gpu_data.shape[0]: - value_scale_gpu[block_id] = value_scale_gpu_data[idx] - - elapsed_time = time.time() - start_time - offload_logger.info( - f"[DEBUG: load_cpu_copy] Successfully loaded KV cache to GPU for request {request.request_id}, " - f"elapsed_time={elapsed_time:.4f}s" - ) - return True - - except Exception as e: - elapsed_time = time.time() - start_time - offload_logger.error( - f"[DEBUG: load_cpu_copy] Failed to load KV cache to GPU: {e}, elapsed_time={elapsed_time:.4f}s" - ) - return False - - # ==================== SSD Storage Offload接口 ==================== - def save_to_storage(self, kv_cache_cpu) -> Optional[str]: - """ - 将CPU上的KV Cache保存到SSD存储 - - Args: - kv_cache_cpu: CPU上的KV Cache信息 - - Returns: - str: 存储文件路径,失败返回None - """ - try: - if kv_cache_cpu is None: - return None - - storage_file = f"kv_cache_{uuid.uuid4().hex}.pdparams" - storage_path = os.path.join(self.storage_path, storage_file) - - # 保存KV Cache信息到文件 - paddle.save(kv_cache_cpu, storage_path) - - offload_logger.info(f"[DEBUG: offload_save_to_storage] Saved KV cache to {storage_path}") - return storage_path - - except Exception as e: - offload_logger.error(f"[DEBUG: offload_save_to_storage] save_to_storage failed: {e}") - return None + """Compatibility placeholder for future SSD offload support.""" + offload_logger.warning("[DEBUG: save_to_storage] SSD offload is not implemented in the first version") + return None def load_from_storage(self, storage_path: str) -> Optional[dict]: - """ - 从SSD存储加载KV Cache到CPU - - Args: - storage_path: 存储文件路径 - - Returns: - dict: CPU上的KV Cache信息,失败返回None - """ 
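# The placeholders above keep the SSD tier out of this first version; the deleted
# code below shows the shape a future implementation could take. A minimal sketch
# mirroring that removed paddle.save/paddle.load path (directory and file naming
# are illustrative, taken from the removed defaults):
import os
import uuid

import paddle

def save_snapshot_to_ssd(kv_cache_cpu: dict, storage_dir: str = "/tmp/fd_offload") -> str:
    os.makedirs(storage_dir, exist_ok=True)
    path = os.path.join(storage_dir, f"kv_cache_{uuid.uuid4().hex}.pdparams")
    paddle.save(kv_cache_cpu, path)  # dict of CPU tensors plus block metadata
    return path

def load_snapshot_from_ssd(path: str) -> dict:
    return paddle.load(path)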
- try: - if not os.path.exists(storage_path): - offload_logger.error(f"[DEBUG: off_load_save_to_storage] Storage file not found: {storage_path}") - return None - - kv_cache_cpu = paddle.load(storage_path) - offload_logger.info(f"[DEBUG: off_load_save_to_storage] Loaded KV cache from {storage_path}") - return kv_cache_cpu - - except Exception as e: - offload_logger.error(f"[DEBUG: off_load_save_to_storage] load_from_storage failed: {e}") - return None - - # ==================== Resume接口 ==================== + """Compatibility placeholder for future SSD resume support.""" + offload_logger.warning("[DEBUG: load_from_storage] SSD resume is not implemented in the first version") + return None def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: - """ - 恢复被offload的请求到GPU - - 执行步骤: - 1. 检查GPU是否有足够内存 - 2. 根据存储层级选择恢复路径(L2/L3) - 3. 验证cache数据完整性 - 4. 检查是否为decode阶段(token_num > prefill_token_num) - 5. 分配GPU blocks并加载数据 - 6. 恢复请求状态 - - Returns: - Tuple[bool, Optional[int]]: - - bool: resume是否成功 - - int: cache中的token数量(即使resume失败也返回,用于重新计算) - """ - start_time = time.time() if not self.enable_offload: return False, None, False - # 使用锁保护offloaded_requests的读取 with self._lock: - if request.request_id not in self._offloaded_requests: - offload_logger.warning(f"[DEBUG: resume_decode] Request {request.request_id} is not offloaded") - return False, None, True - offloaded_info = self._offloaded_requests.get(request.request_id) if offloaded_info is None: + offload_logger.warning(f"[DEBUG: resume_decode] Request {request.request_id} is not offloaded") return False, None, True - # 复制需要的信息,避免长时间持有锁 - storage_level = offloaded_info["storage_level"] num_blocks_needed = offloaded_info["num_blocks_needed"] saved_num_computed_tokens = offloaded_info["num_computed_tokens"] saved_need_prefill_tokens = offloaded_info["need_prefill_tokens"] snapshot_handle = offloaded_info.get("snapshot_handle") resume_retry_count = offloaded_info.get("resume_retry_count", 0) - # 复制output_token_ids和need_prefill_tokens用于恢复 output_token_ids = list(offloaded_info.get("output_token_ids", [])) need_prefill_tokens = offloaded_info.get("need_prefill_tokens") - # 检查是否为decode阶段 if saved_num_computed_tokens <= saved_need_prefill_tokens: - offload_logger.warning( - f"[DEBUG: resume_decode] Request {request.request_id} is not in decode phase " - f"(num_computed_tokens={saved_num_computed_tokens}, " - f"need_prefill_tokens={saved_need_prefill_tokens}), " - f"should recompute instead of resume" - ) return False, saved_num_computed_tokens, True - if self.cache_manager is None: return False, saved_num_computed_tokens, True - if not self.cache_manager.can_allocate_gpu_blocks(num_blocks_needed): - offload_logger.warning( - f"[DEBUG: resume_decode] Insufficient GPU memory for request {request.request_id}, " - f"need {num_blocks_needed} blocks" - ) should_recompute = resume_retry_count + 1 >= self.max_resume_retry with self._lock: if request.request_id in self._offloaded_requests: @@ -1001,16 +279,11 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: if snapshot_handle is None: return False, saved_num_computed_tokens, True - # 分配GPU blocks new_block_ids = self.cache_manager.allocate_gpu_blocks(num_blocks_needed, request.request_id) request.block_tables = new_block_ids resume_task = DecodeResumeTask(task_id=snapshot_handle, gpu_block_ids=new_block_ids) - restore_result = self._issue_transfer_task(CacheStatus.DECODE_RESUME, resume_task) - if restore_result is None or not restore_result.get("ok", False): - 
offload_logger.error( - f"[DEBUG: resume_decode] Failed to restore CPU snapshot for {request.request_id}, " - f"result={restore_result}" - ) + resume_result = self._issue_transfer_task(CacheStatus.DECODE_RESUME, resume_task) + if resume_result is None or not resume_result.get("ok", False): self.cache_manager.recycle_gpu_blocks(new_block_ids, request.request_id) request.block_tables = [] should_recompute = resume_retry_count + 1 >= self.max_resume_retry @@ -1019,49 +292,20 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: self._offloaded_requests[request.request_id]["resume_retry_count"] = resume_retry_count + 1 return False, saved_num_computed_tokens, should_recompute - # 恢复请求状态 request.output_token_ids = output_token_ids request.num_computed_tokens = saved_num_computed_tokens request.need_prefill_tokens = need_prefill_tokens request.status = RequestStatus.RUNNING request.is_offloaded = False - # 在 resume 成功时,清理 abort 标志,避免后续生成错误的 RequestOutput - # 注意:Request 类没有 outputs 属性,outputs 是 RequestOutput 的属性 - # 这里我们设置一个标记,表示该请求已成功恢复,后续处理不应生成 abort 的 RequestOutput - offload_logger.info( - f"[DEBUG: resume_decode] Request {request.request_id} resumed successfully, " - f"output_tokens={len(output_token_ids)}, idx={request.idx}" - ) - - # 清理offloaded信息 with self._lock: self._offloaded_requests.pop(request.request_id, None) - # 清理SSD存储文件 - if storage_level == self.STORAGE_LEVEL_SSD and storage_path: - try: - os.remove(storage_path) - except Exception as e: - offload_logger.warning(f"[DEBUG: resume_decode] Failed to delete storage file: {e}") - - elapsed_time = time.time() - start_time - offload_logger.info( - f"[DEBUG: resume_decode] Resumed request {request.request_id} from {storage_level}, " - f"output_tokens={len(request.output_token_ids)}, elapsed_time={elapsed_time:.4f}s" - ) - - # 尝试预取其他 SSD 数据到 CPU - self.prefetch_ssd_to_cpu() - + offload_logger.info(f"[DEBUG: resume_decode] Request {request.request_id} resumed successfully") return True, saved_num_computed_tokens, False - except Exception as e: - elapsed_time = time.time() - start_time should_recompute = False - offload_logger.error( - f"[DEBUG: resume_decode] Failed to resume request {request.request_id}: {e}, elapsed_time={elapsed_time:.4f}s" - ) + offload_logger.error(f"[DEBUG: resume_decode] Failed to resume request {request.request_id}: {e}") with self._lock: if request.request_id in self._offloaded_requests: retries = self._offloaded_requests[request.request_id].get("resume_retry_count", 0) + 1 @@ -1069,146 +313,28 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: should_recompute = retries >= self.max_resume_retry return False, saved_num_computed_tokens, should_recompute - def _verify_cache_integrity(self, kv_cache_cpu: dict, offloaded_info: dict) -> bool: - """ - 验证cache数据的完整性 - - Args: - kv_cache_cpu: CPU上的KV Cache信息 - offloaded_info: 保存的offload信息 - - Returns: - bool: cache是否有效 - """ - if kv_cache_cpu is None: - return False - - # 检查必要的字段 - cpu_block_ids = kv_cache_cpu.get("block_ids", []) - num_blocks = kv_cache_cpu.get("num_blocks", 0) - - if not cpu_block_ids or num_blocks == 0: - offload_logger.warning("[DEBUG: offload] Cache integrity check: missing block_ids or num_blocks") - return False - - # 检查block数量是否匹配 - expected_num_blocks = offloaded_info.get("num_blocks_needed", 0) - if len(cpu_block_ids) != num_blocks or num_blocks != expected_num_blocks: - offload_logger.warning( - f"[DEBUG: offload] Cache integrity check: block count mismatch, " - 
f"expected={expected_num_blocks}, actual={num_blocks}" - ) - return False - - # 检查key_caches是否存在 - key_caches = kv_cache_cpu.get("key_caches") - if not key_caches or len(key_caches) == 0: - offload_logger.warning("[DEBUG: offload] Cache integrity check: missing key_caches") - return False - - # 检查token数量是否一致 - saved_num_tokens = offloaded_info.get("num_tokens", 0) - block_size = self.cache_manager.cache_config.block_size if self.cache_manager else 64 - actual_max_tokens = num_blocks * block_size - if saved_num_tokens > actual_max_tokens: - offload_logger.warning( - f"[DEBUG: offload] Cache integrity check: token count exceeds capacity, " - f"saved_tokens={saved_num_tokens}, max_capacity={actual_max_tokens}" - ) - return False - - offload_logger.debug(f"Cache integrity check passed: num_blocks={num_blocks}, num_tokens={saved_num_tokens}") - return True - - # ==================== 辅助接口 ==================== - def cleanup_offloaded_request(self, request_id: str) -> None: - """清理被offload请求的缓存(请求完成时调用)""" with self._lock: - if request_id not in self._offloaded_requests: - return - - offloaded_info = self._offloaded_requests[request_id] - snapshot_handle = offloaded_info.get("snapshot_handle") - - # 清理SSD存储文件 - storage_path = offloaded_info.get("storage_path") - if storage_path and os.path.exists(storage_path): - try: - os.remove(storage_path) - offload_logger.info(f"[DEBUG: offload] Deleted storage file: {storage_path}") - except Exception as e: - offload_logger.warning(f"[DEBUG: offload] Failed to delete storage file: {e}") + offloaded_info = self._offloaded_requests.pop(request_id, None) + if offloaded_info is None: + return - self._offloaded_requests.pop(request_id) + snapshot_handle = offloaded_info.get("snapshot_handle") if self.cache_manager is not None and snapshot_handle is not None: try: self._issue_transfer_task(CacheStatus.DECODE_CLEANUP, DecodeCleanupTask(task_id=snapshot_handle)) except Exception as e: offload_logger.warning(f"[DEBUG: offload] Failed to cleanup snapshot {snapshot_handle}: {e}") - with self._lock: - offload_logger.info(f"[DEBUG: offload] Cleaned up offloaded request: {request_id}") + offload_logger.info(f"[DEBUG: offload] Cleaned up offloaded request: {request_id}") def get_offloaded_request_count(self) -> int: - """获取当前offloaded的请求数量""" with self._lock: return len(self._offloaded_requests) def get_offloaded_request_ids(self) -> List[str]: - """获取当前所有offloaded的请求ID列表""" with self._lock: return list(self._offloaded_requests.keys()) def prefetch_ssd_to_cpu(self) -> int: - """ - 后台预取:将 SSD 上的 KV Cache 预取到 CPU 内存 - - 当 CPU 内存有空闲时调用,减少 resume 时的延迟 - - Returns: - int: 成功预取的请求数量 - """ - if not self.enable_offload or self.cache_manager is None: - return 0 - - prefetched_count = 0 - - with self._lock: - # 找出存储在 SSD 上且还没有 CPU copy 的请求 - ssd_requests = [ - (req_id, info) - for req_id, info in self._offloaded_requests.items() - if info.get("storage_level") == self.STORAGE_LEVEL_SSD and info.get("kv_cache_cpu") is None - ] - - for req_id, info in ssd_requests: - storage_path = info.get("storage_path") - if not storage_path or not os.path.exists(storage_path): - continue - - # 检查 CPU 内存是否充足 - num_blocks = info.get("num_blocks_needed", 0) - if num_blocks > len(self.cache_manager.cpu_free_block_list): - break # CPU 内存不足,停止预取 - - try: - # 从 SSD 加载到 CPU - kv_cache_cpu = self.load_from_storage(storage_path) - if kv_cache_cpu is None: - continue - - # 更新 offloaded 信息 - with self._lock: - if req_id in self._offloaded_requests: - self._offloaded_requests[req_id]["kv_cache_cpu"] = 
kv_cache_cpu - self._offloaded_requests[req_id]["storage_level"] = self.STORAGE_LEVEL_CPU - - prefetched_count += 1 - offload_logger.info( - f"[DEBUG: offload_prefetch_ssd_to_cpu] Prefetched request {req_id} from SSD to CPU" - ) - - except Exception as e: - offload_logger.warning(f"[DEBUG: offload_prefetch_ssd_to_cpu] Failed to prefetch {req_id}: {e}") - - return prefetched_count + """Compatibility placeholder for future SSD prefetch support.""" + return 0 diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 28fb9bda3aa..8aed8f98390 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -996,6 +996,9 @@ def _allocate_decode_and_extend(): if resume_success: offload_logger.info(f"Resumed offloaded request {request.request_id}") self.waiting.popleft() + self.tasks_list[request.idx] = request + self.stop_flags[request.idx] = False + self.req_dict[request.request_id] = request.idx self.running.append(request) scheduled_reqs.append(self._prepare_decode_task(request)) continue From 97fcc761d7ce54df4ba0d4a4190f1dd36b7162b6 Mon Sep 17 00:00:00 2001 From: CyanScholar Date: Wed, 25 Mar 2026 03:14:39 +0000 Subject: [PATCH 4/6] use transfer_manager --- fastdeploy/engine/offload_manager.py | 27 ++++++- .../engine/sched/resource_manager_v1.py | 38 ++++++++- fastdeploy/output/token_processor.py | 30 +++++++ tests/engine/test_resource_manager_v1.py | 78 +++++++++++++++++++ tests/output/test_token_processor.py | 33 ++++++++ 5 files changed, 198 insertions(+), 8 deletions(-) diff --git a/fastdeploy/engine/offload_manager.py b/fastdeploy/engine/offload_manager.py index f4f96fa642b..60f38ae4e9c 100644 --- a/fastdeploy/engine/offload_manager.py +++ b/fastdeploy/engine/offload_manager.py @@ -15,6 +15,7 @@ """ import threading +import time from typing import Dict, List, Optional, Tuple from fastdeploy import envs @@ -179,11 +180,14 @@ def offload_req(self, request: Request) -> bool: offload_logger.warning(f"[DEBUG: offload_req] Request {request.request_id} already offloaded") return False + start_time = time.perf_counter() snapshot_task = DecodeOffloadTask(task_id=request.request_id, gpu_block_ids=list(request.block_tables)) snapshot_result = self._issue_transfer_task(CacheStatus.DECODE_OFFLOAD, snapshot_task) if snapshot_result is None or not snapshot_result.get("ok", False): + elapsed_ms = (time.perf_counter() - start_time) * 1000 offload_logger.error( - f"[DEBUG: offload_req] Failed to snapshot request {request.request_id}, result={snapshot_result}" + f"[DEBUG: offload_req] Failed to snapshot request {request.request_id}, " + f"elapsed_ms={elapsed_ms:.2f}, result={snapshot_result}" ) return False @@ -210,9 +214,10 @@ def offload_req(self, request: Request) -> bool: self.release_gpu_blocks(request) request.status = RequestStatus.PREEMPTED request.is_offloaded = True + elapsed_ms = (time.perf_counter() - start_time) * 1000 offload_logger.info( f"[DEBUG: offload_req] Request {request.request_id} offloaded to {self.STORAGE_LEVEL_CPU}, " - f"blocks_needed={len(original_block_tables)}" + f"blocks_needed={len(original_block_tables)}, offload_time_ms={elapsed_ms:.2f}" ) return True @@ -250,6 +255,7 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: if not self.enable_offload: return False, None, False + start_time = time.perf_counter() with self._lock: offloaded_info = self._offloaded_requests.get(request.request_id) if offloaded_info is None: @@ -284,12 +290,17 @@ 
def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: resume_task = DecodeResumeTask(task_id=snapshot_handle, gpu_block_ids=new_block_ids) resume_result = self._issue_transfer_task(CacheStatus.DECODE_RESUME, resume_task) if resume_result is None or not resume_result.get("ok", False): + elapsed_ms = (time.perf_counter() - start_time) * 1000 self.cache_manager.recycle_gpu_blocks(new_block_ids, request.request_id) request.block_tables = [] should_recompute = resume_retry_count + 1 >= self.max_resume_retry with self._lock: if request.request_id in self._offloaded_requests: self._offloaded_requests[request.request_id]["resume_retry_count"] = resume_retry_count + 1 + offload_logger.warning( + f"[DEBUG: resume_decode] Resume transfer failed for {request.request_id}, " + f"elapsed_ms={elapsed_ms:.2f}, should_recompute={should_recompute}, result={resume_result}" + ) return False, saved_num_computed_tokens, should_recompute request.output_token_ids = output_token_ids @@ -301,11 +312,19 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: with self._lock: self._offloaded_requests.pop(request.request_id, None) - offload_logger.info(f"[DEBUG: resume_decode] Request {request.request_id} resumed successfully") + elapsed_ms = (time.perf_counter() - start_time) * 1000 + offload_logger.info( + f"[DEBUG: resume_decode] Request {request.request_id} resumed successfully, " + f"resume_time_ms={elapsed_ms:.2f}" + ) return True, saved_num_computed_tokens, False except Exception as e: should_recompute = False - offload_logger.error(f"[DEBUG: resume_decode] Failed to resume request {request.request_id}: {e}") + elapsed_ms = (time.perf_counter() - start_time) * 1000 + offload_logger.error( + f"[DEBUG: resume_decode] Failed to resume request {request.request_id}, " + f"elapsed_ms={elapsed_ms:.2f}: {e}" + ) with self._lock: if request.request_id in self._offloaded_requests: retries = self._offloaded_requests[request.request_id].get("resume_retry_count", 0) + 1 diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index 8aed8f98390..bff01483bb0 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -258,6 +258,27 @@ def _prepare_decode_task(self, request): def _prepare_preempt_task(self, request): return ScheduledPreemptTask(idx=request.idx, request_id=request.request_id) + def _get_pending_preempt_slots(self) -> set[int]: + pending_slots = set() + request_id_set = getattr(self, "to_be_rescheduled_request_id_set", set()) + requests = getattr(self, "requests", {}) + for request_id in request_id_set: + request = requests.get(request_id) + if request is not None and request.idx is not None: + pending_slots.add(request.idx) + return pending_slots + + def _assign_rescheduled_slot(self, request): + allocated_position = self.get_available_position() + request.idx = allocated_position + self.tasks_list[allocated_position] = request + self.stop_flags[allocated_position] = False + self.req_dict[request.request_id] = allocated_position + return allocated_position + + def available_batch(self): + return max(super().available_batch() - len(self._get_pending_preempt_slots()), 0) + def reschedule_preempt_task(self, request_id, process_func=None): with self.lock: llm_logger.debug(f"reschedule {request_id} into waiting queue") @@ -267,6 +288,7 @@ def reschedule_preempt_task(self, request_id, process_func=None): request.metrics.preempted_count += 1 if process_func is 
not None: process_func(request) + request.idx = None llm_logger.debug(f"self.waiting append request:{request.request_id},req.type:{request.status}") self.waiting.appendleft(request) self.to_be_rescheduled_request_id_set.remove(request_id) @@ -503,6 +525,10 @@ def _get_num_new_tokens(self, request, token_budget): f"need_prefill={request.need_prefill_tokens}, computed={request.num_computed_tokens}" ) num_new_tokens = min(num_new_tokens, token_budget) + decode_chunk_limit = None + if self.config.scheduler_config.splitwise_role == "decode": + decode_chunk_limit = self.config.get_max_chunk_tokens(self.config.model_config.mm_max_tokens_per_item) + num_new_tokens = min(num_new_tokens, decode_chunk_limit) # Deterministic mode: align chunk boundaries to split_kv_size # This ensures batch-invariant attention by making each chunk @@ -543,6 +569,8 @@ def _get_num_new_tokens(self, request, token_budget): request.with_image = False if not self.config.model_config.enable_mm: + if decode_chunk_limit is not None: + num_new_tokens = min(num_new_tokens, decode_chunk_limit) return num_new_tokens inputs = request.multimodal_inputs @@ -713,6 +741,8 @@ def _compute_audio_prefix_count(end_idx, end_patch_idx): request.evict_mm_hashes = self.encoder_cache.apply_cache(cur_mm_hashes, cur_mm_positions) # Compatible with scenarios without images and videos. + if decode_chunk_limit is not None: + num_new_tokens = min(num_new_tokens, decode_chunk_limit) return num_new_tokens def exist_mm_prefill(self, scheduled_reqs): @@ -996,9 +1026,7 @@ def _allocate_decode_and_extend(): if resume_success: offload_logger.info(f"Resumed offloaded request {request.request_id}") self.waiting.popleft() - self.tasks_list[request.idx] = request - self.stop_flags[request.idx] = False - self.req_dict[request.request_id] = request.idx + self._assign_rescheduled_slot(request) self.running.append(request) scheduled_reqs.append(self._prepare_decode_task(request)) continue @@ -1055,6 +1083,7 @@ def _allocate_decode_and_extend(): ) request.block_tables.extend(extra_gpu_block_ids) self.waiting.popleft() + self._assign_rescheduled_slot(request) self.running.append(request) scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) token_budget -= num_new_tokens @@ -1199,9 +1228,10 @@ def download_bos_features(bos_client, features_urls): inputs["audio_features"] = result def get_available_position(self) -> int: + pending_preempt_slots = self._get_pending_preempt_slots() position = 0 while position < self.max_num_seqs: - if self.stop_flags[position] is True: + if self.stop_flags[position] is True and position not in pending_preempt_slots: return position position += 1 raise RuntimeError("No available position is available for new request") diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index a5d57ff79b1..bff28de71ff 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -214,6 +214,15 @@ def _reschedule_preempt_task_use_zmq(self, datas): f"finish reschedule_preempt_task request_id {request_id} at {self.resource_manager.requests[request_id].idx}" ) + def _find_pending_preempt_request_id_by_idx(self, idx: int): + if not envs.ENABLE_V1_KVCACHE_SCHEDULER: + return None + for request_id in list(self.resource_manager.to_be_rescheduled_request_id_set): + request = self.resource_manager.requests.get(request_id) + if request is not None and getattr(request, "idx", None) == idx: + return request_id + return None + def _process_per_token(self, task, batch_id: int, 
token_ids: np.ndarray, result: RequestOutput, is_prefill: bool): """ process output token by token @@ -280,6 +289,13 @@ def _process_batch_output_use_zmq(self, receive_datas): for _, stream_data in enumerate(receive_datas): i = stream_data.batch_id if self.resource_manager.stop_flags[i]: + pending_preempt_request_id = self._find_pending_preempt_request_id_by_idx(i) + if pending_preempt_request_id is None: + continue + token_ids = stream_data.tokens + if token_ids is not None and token_ids[-1] == PREEMPTED_TOKEN_ID: + llm_logger.info(f"sync preemption for request_id {pending_preempt_request_id} done.") + self.resource_manager.reschedule_preempt_task(pending_preempt_request_id) continue task: Request = self.resource_manager.tasks_list[i] @@ -742,7 +758,21 @@ def _process_batch_output(self): batch_result = list() # reschedule for i in range(batch): + pending_preempt_request_id = None if self.resource_manager.stop_flags[i]: + pending_preempt_request_id = self._find_pending_preempt_request_id_by_idx(i) + if pending_preempt_request_id is None: + continue + if self.cfg.speculative_config.method: + if accept_num[i] == PREEMPTED_TOKEN_ID: + llm_logger.info(f"sync preemption for request_id {pending_preempt_request_id} done.") + self.resource_manager.reschedule_preempt_task(pending_preempt_request_id) + continue + + token_id = int(tokens[i, 0]) + if token_id == PREEMPTED_TOKEN_ID: + llm_logger.info(f"sync preemption for request_id {pending_preempt_request_id} done.") + self.resource_manager.reschedule_preempt_task(pending_preempt_request_id) continue recovery_stop = False diff --git a/tests/engine/test_resource_manager_v1.py b/tests/engine/test_resource_manager_v1.py index 0031a2e4f69..f820254542d 100644 --- a/tests/engine/test_resource_manager_v1.py +++ b/tests/engine/test_resource_manager_v1.py @@ -99,6 +99,84 @@ def test_preempted_all_with_normal_requests(self): self.assertEqual(len(self.manager.waiting), 0) self.assertEqual(len(self.manager.to_be_rescheduled_request_id_set), 2) + def test_schedule_resumed_offloaded_request_reassigns_slot(self): + engine_args = EngineArgs( + model=MODEL_NAME, + max_model_len=8192, + tensor_parallel_size=1, + engine_worker_queue_port=int(os.getenv("FD_ENGINE_QUEUE_PORT", "6778")), + cache_queue_port=int(os.getenv("FD_CACHE_QUEUE_PORT", "6779")), + ) + mock_config = engine_args.create_engine_config() + manager = ResourceManagerV1( + max_num_seqs=2, + config=mock_config, + tensor_parallel_size=1, + splitwise_role="decode", + local_data_parallel_id=0, + ) + manager.cache_manager = Mock() + manager.cache_manager.can_allocate_gpu_blocks.return_value = True + manager.offload_manager = Mock() + + request = Mock(spec=Request) + request.request_id = "req-offloaded" + request.status = RequestStatus.PREEMPTED + request.is_offloaded = True + request.idx = 0 + request.block_tables = [7, 8] + request.num_total_tokens = 32 + request.need_prefill_tokens = 16 + request.num_computed_tokens = 32 + + def _resume(req): + req.block_tables = [11, 12] + return True, req.num_computed_tokens, False + + manager.offload_manager.resume_decode.side_effect = _resume + manager.requests[request.request_id] = request + manager.waiting.append(request) + manager.tasks_list[0] = Mock() + manager.stop_flags[0] = False + manager.tasks_list[1] = None + manager.stop_flags[1] = True + + scheduled_reqs, error_reqs = manager.schedule() + + self.assertEqual(error_reqs, []) + self.assertEqual(len(scheduled_reqs), 1) + self.assertEqual(scheduled_reqs[0].request_id, request.request_id) + 
self.assertEqual(scheduled_reqs[0].idx, 1) + self.assertEqual(request.idx, 1) + self.assertEqual(manager.req_dict[request.request_id], 1) + self.assertIs(manager.tasks_list[1], request) + self.assertIsNot(manager.tasks_list[0], request) + self.assertFalse(manager.stop_flags[1]) + manager.need_block_num_signal.clear() + + def test_pending_preempt_slot_is_reserved_until_ack(self): + request = Mock(spec=Request) + request.request_id = "req-pending" + request.idx = 0 + request.status = RequestStatus.PREEMPTED + request.has_been_preempted_before = False + request.metrics = Mock() + request.metrics.preempted_count = 0 + + self.manager.stop_flags = [True, True, True, True] + self.manager.requests[request.request_id] = request + self.manager.to_be_rescheduled_request_id_set.add(request.request_id) + + self.assertEqual(self.manager.available_batch(), 3) + self.assertEqual(self.manager.get_available_position(), 1) + + self.manager.reschedule_preempt_task(request.request_id) + + self.assertEqual(request.idx, None) + self.assertEqual(self.manager.available_batch(), 4) + self.assertEqual(self.manager.get_available_position(), 0) + self.assertEqual(self.manager.waiting[0], request) + if __name__ == "__main__": unittest.main() diff --git a/tests/output/test_token_processor.py b/tests/output/test_token_processor.py index e8ff821a268..e49ee1ace41 100644 --- a/tests/output/test_token_processor.py +++ b/tests/output/test_token_processor.py @@ -25,6 +25,7 @@ import pytest from fastdeploy import envs +from fastdeploy.config import PREEMPTED_TOKEN_ID from fastdeploy.engine.request import Request, RequestMetrics, RequestOutput from fastdeploy.output import token_processor from fastdeploy.output.token_processor import ( @@ -253,6 +254,20 @@ def test_reschedule_preempt_task_use_zmq_reschedules_missing_batch(): assert "reschedule-req-a" in rm.recycled +def test_process_batch_output_use_zmq_reschedules_preempted_stopped_slot(): + processor, rm, _, _ = _make_processor() + rm.stop_flags[0] = True + rm.to_be_rescheduled_request_id_set = {"req-a"} + rm.requests = {"req-a": types.SimpleNamespace(idx=0)} + receive_datas = [types.SimpleNamespace(batch_id=0, tokens=np.array([PREEMPTED_TOKEN_ID]), pooler_output=None)] + + with mock.patch.object(envs, "ENABLE_V1_KVCACHE_SCHEDULER", True): + batch_result = processor._process_batch_output_use_zmq(receive_datas) + + assert batch_result == [] + assert "reschedule-req-a" in rm.recycled + + def test_process_batch_draft_tokens_collects_top_logprobs(): processor, rm, _, _ = _make_processor(speculative_method="mtp", enable_logprob=True) rm.tasks_list[0] = types.SimpleNamespace(request_id="task-0", block_tables=[1]) @@ -996,6 +1011,24 @@ def test_process_batch_output_skips_already_stopped_slot(): assert processor.cached_generated_tokens.put_results.called +def test_process_batch_output_reschedules_stopped_slot_negative_token(): + processor, rm, _, _ = _make_processor() + task_id = "req-stopped-neg" + rm.stop_flags[0] = True + rm.requests = {task_id: types.SimpleNamespace(idx=0)} + rm.to_be_rescheduled_request_id_set = {task_id} + processor.output_tokens[1, 0] = 1 + processor.output_tokens[2, 0] = -9 + + with ( + mock.patch.object(envs, "ENABLE_V1_KVCACHE_SCHEDULER", True), + mock.patch.object(token_processor, "main_process_metrics", _Metrics()), + ): + processor._process_batch_output() + + assert rm.recycled[-1] == f"reschedule-{task_id}" + + def test_process_batch_output_speculative_negative_token_reschedules(): processor, rm, _, _ = _make_processor(speculative_method="mtp") task_id = 
"req-spec-neg" From 7c08a7cf33aeaab3ff85a5b9989ac5f13beb3ec9 Mon Sep 17 00:00:00 2001 From: CyanScholar Date: Wed, 25 Mar 2026 09:55:59 +0000 Subject: [PATCH 5/6] use transfer_manager --- fastdeploy/engine/common_engine.py | 21 ++++++- fastdeploy/engine/offload_manager.py | 59 +++++++++---------- .../engine/sched/resource_manager_v1.py | 32 ++++++---- fastdeploy/output/token_processor.py | 13 ++++ tests/engine/test_resource_manager_v1.py | 2 +- 5 files changed, 81 insertions(+), 46 deletions(-) diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 8948a330df0..df707eddeff 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -1028,9 +1028,24 @@ def _fetch_request(): if self.cfg.scheduler_config.splitwise_role == "decode": for task in tasks: if task.task_type == RequestType.PREEMPTED: - self.llm_logger.info( - f"{task.request_id} decode request is preempted and waiting for reschedule." - ) + req = self.resource_manager.requests.get(task.request_id) + if req is not None and req.is_offloaded: + self.llm_logger.info( + f"{task.request_id} decode request is preempted and offloaded, waiting for resume." + ) + else: + msg = f"{task.request_id} decode not enough blocks, need to be rescheduled." + self.llm_logger.error(msg) + self.scheduler.put_results( + [ + RequestOutput( + request_id=task.request_id, + finished=True, + error_code=500, + error_msg=msg, + ) + ] + ) self.resource_manager.get_real_bsz() for task in tasks: if task.task_type == RequestType.PREFILL: diff --git a/fastdeploy/engine/offload_manager.py b/fastdeploy/engine/offload_manager.py index 60f38ae4e9c..dd2e17070b0 100644 --- a/fastdeploy/engine/offload_manager.py +++ b/fastdeploy/engine/offload_manager.py @@ -46,7 +46,6 @@ def __init__(self, config=None, cache_manager=None, model_runner=None): self.min_steps = 20 self.cpu_offloading_chunk_size = getattr(envs, "FD_CPU_OFFLOAD_CHUNK_SIZE", 8192) self.cpu_memory_limit = getattr(envs, "FD_CPU_MEMORY_LIMIT", 50 * 1024 * 1024 * 1024) - self.max_resume_retry = 3 self._offloaded_requests: Dict[str, dict] = {} self._lock = threading.Lock() @@ -101,7 +100,13 @@ def _issue_transfer_task(self, event_type, task): self._transfer_events[key] = event self._transfer_results.pop(key, None) self.cache_manager.cache_task_queue.put_transfer_task((event_type, task)) - event.wait() + event.wait(timeout=30) + if not event.is_set(): + offload_logger.error(f"Transfer task {task.task_id} timed out after 30s") + with self._lock: + self._transfer_results.pop(key, None) + self._transfer_events.pop(key, None) + return None with self._lock: results = self._transfer_results.pop(key, []) self._transfer_events.pop(key, None) @@ -208,11 +213,9 @@ def offload_req(self, request: Request) -> bool: "sampling_params": request.sampling_params, "block_tables": original_block_tables, "snapshot_handle": request.request_id, - "resume_retry_count": 0, } self.release_gpu_blocks(request) - request.status = RequestStatus.PREEMPTED request.is_offloaded = True elapsed_ms = (time.perf_counter() - start_time) * 1000 offload_logger.info( @@ -251,39 +254,45 @@ def load_from_storage(self, storage_path: str) -> Optional[dict]: offload_logger.warning("[DEBUG: load_from_storage] SSD resume is not implemented in the first version") return None - def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: + def resume_decode(self, request: Request) -> Tuple[bool, Optional[int]]: if not self.enable_offload: - return False, None, False + return False, 
None start_time = time.perf_counter() with self._lock: offloaded_info = self._offloaded_requests.get(request.request_id) if offloaded_info is None: offload_logger.warning(f"[DEBUG: resume_decode] Request {request.request_id} is not offloaded") - return False, None, True + return False, None num_blocks_needed = offloaded_info["num_blocks_needed"] saved_num_computed_tokens = offloaded_info["num_computed_tokens"] saved_need_prefill_tokens = offloaded_info["need_prefill_tokens"] snapshot_handle = offloaded_info.get("snapshot_handle") - resume_retry_count = offloaded_info.get("resume_retry_count", 0) output_token_ids = list(offloaded_info.get("output_token_ids", [])) need_prefill_tokens = offloaded_info.get("need_prefill_tokens") if saved_num_computed_tokens <= saved_need_prefill_tokens: - return False, saved_num_computed_tokens, True + offload_logger.warning( + f"[DEBUG: resume_decode] Request {request.request_id} has invalid state: " + f"num_computed_tokens={saved_num_computed_tokens} <= need_prefill_tokens={saved_need_prefill_tokens}" + ) + return False, saved_num_computed_tokens if self.cache_manager is None: - return False, saved_num_computed_tokens, True + return False, saved_num_computed_tokens if not self.cache_manager.can_allocate_gpu_blocks(num_blocks_needed): - should_recompute = resume_retry_count + 1 >= self.max_resume_retry - with self._lock: - if request.request_id in self._offloaded_requests: - self._offloaded_requests[request.request_id]["resume_retry_count"] = resume_retry_count + 1 - return False, saved_num_computed_tokens, should_recompute + offload_logger.debug( + f"[DEBUG: resume_decode] Not enough GPU blocks for {request.request_id}, " + f"need={num_blocks_needed}, will retry later" + ) + return False, saved_num_computed_tokens try: if snapshot_handle is None: - return False, saved_num_computed_tokens, True + offload_logger.warning( + f"[DEBUG: resume_decode] Request {request.request_id} has no snapshot handle" + ) + return False, saved_num_computed_tokens new_block_ids = self.cache_manager.allocate_gpu_blocks(num_blocks_needed, request.request_id) request.block_tables = new_block_ids @@ -293,15 +302,11 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: elapsed_ms = (time.perf_counter() - start_time) * 1000 self.cache_manager.recycle_gpu_blocks(new_block_ids, request.request_id) request.block_tables = [] - should_recompute = resume_retry_count + 1 >= self.max_resume_retry - with self._lock: - if request.request_id in self._offloaded_requests: - self._offloaded_requests[request.request_id]["resume_retry_count"] = resume_retry_count + 1 offload_logger.warning( f"[DEBUG: resume_decode] Resume transfer failed for {request.request_id}, " - f"elapsed_ms={elapsed_ms:.2f}, should_recompute={should_recompute}, result={resume_result}" + f"elapsed_ms={elapsed_ms:.2f}, result={resume_result}" ) - return False, saved_num_computed_tokens, should_recompute + return False, saved_num_computed_tokens request.output_token_ids = output_token_ids request.num_computed_tokens = saved_num_computed_tokens @@ -317,20 +322,14 @@ def resume_decode(self, request: Request) -> Tuple[bool, Optional[int], bool]: f"[DEBUG: resume_decode] Request {request.request_id} resumed successfully, " f"resume_time_ms={elapsed_ms:.2f}" ) - return True, saved_num_computed_tokens, False + return True, saved_num_computed_tokens except Exception as e: - should_recompute = False elapsed_ms = (time.perf_counter() - start_time) * 1000 offload_logger.error( f"[DEBUG: resume_decode] Failed to 
resume request {request.request_id}, " f"elapsed_ms={elapsed_ms:.2f}: {e}" ) - with self._lock: - if request.request_id in self._offloaded_requests: - retries = self._offloaded_requests[request.request_id].get("resume_retry_count", 0) + 1 - self._offloaded_requests[request.request_id]["resume_retry_count"] = retries - should_recompute = retries >= self.max_resume_retry - return False, saved_num_computed_tokens, should_recompute + return False, saved_num_computed_tokens def cleanup_offloaded_request(self, request_id: str) -> None: with self._lock: diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index bff01483bb0..6969f3e3975 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -378,7 +378,8 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re ) preempted_req.status = RequestStatus.PREEMPTED - preempted_req.num_computed_tokens = 0 + if not offloaded: + preempted_req.num_computed_tokens = 0 if self.config.scheduler_config.splitwise_role == "decode": self.tasks_list[preempted_req.idx] = None self.stop_flags[preempted_req.idx] = True @@ -1022,7 +1023,23 @@ def _allocate_decode_and_extend(): elif request.status == RequestStatus.PREEMPTED: # Try to resume offloaded request first if request.is_offloaded and self.offload_manager is not None: - resume_success, _, should_recompute = self.offload_manager.resume_decode(request) + # Only attempt resume when running requests have finished + # or there are enough free blocks to sustain all requests. + # This prevents thrashing (immediate re-preempt after resume). + if len(self.running) > 0: + offloaded_info = self.offload_manager._offloaded_requests.get(request.request_id) + num_blocks_for_resume = offloaded_info["num_blocks_needed"] if offloaded_info else 0 + block_size = self.cache_manager.cache_config.block_size + min_steps = self.offload_manager.min_steps + blocks_per_step = (min_steps + block_size - 1) // block_size + total_running_after_resume = len(self.running) + 1 + total_blocks_needed = num_blocks_for_resume + total_running_after_resume * blocks_per_step + free_blocks = len(getattr(self.cache_manager, "gpu_free_block_list", [])) + if free_blocks < total_blocks_needed: + # Not enough blocks to resume without thrashing, wait for running to finish + break + + resume_success, _ = self.offload_manager.resume_decode(request) if resume_success: offload_logger.info(f"Resumed offloaded request {request.request_id}") self.waiting.popleft() @@ -1030,20 +1047,11 @@ def _allocate_decode_and_extend(): self.running.append(request) scheduled_reqs.append(self._prepare_decode_task(request)) continue - if should_recompute: - offload_logger.info( - f"Resume retry limit reached or snapshot invalid for {request.request_id}, " - "fallback to recompute" - ) - request.is_offloaded = False - self.offload_manager.cleanup_offloaded_request(request.request_id) else: offload_logger.debug( f"Failed to resume offloaded request {request.request_id}, will retry" ) - skip_requests.append(request) - self.waiting.popleft() - continue + break request.need_prefill_tokens = ( request.num_total_tokens diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index bff28de71ff..b5f72860601 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -1012,6 +1012,19 @@ def _process_batch_output(self): llm_logger.debug(f"get response from infer: {result}") 
                batch_result.append(result)
 
+        # Reschedule preempted requests whose idx >= batch (not covered by range(batch))
+        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
+            batch_id_set = set(range(batch))
+            for request_id in list(self.resource_manager.to_be_rescheduled_request_id_set):
+                if request_id not in self.resource_manager.requests:
+                    continue
+                req = self.resource_manager.requests[request_id]
+                if getattr(req, "idx", None) not in batch_id_set:
+                    llm_logger.debug(
+                        f"reschedule_preempt_task request_id {request_id} at idx {getattr(req, 'idx', None)} (out of batch range {batch})"
+                    )
+                    self.resource_manager.reschedule_preempt_task(request_id)
+
         if self.cfg.speculative_config.method:
             self._record_speculative_decoding_metrics(accept_num)
         self.postprocess(batch_result, mtype)
diff --git a/tests/engine/test_resource_manager_v1.py b/tests/engine/test_resource_manager_v1.py
index f820254542d..3e6459dacbe 100644
--- a/tests/engine/test_resource_manager_v1.py
+++ b/tests/engine/test_resource_manager_v1.py
@@ -131,7 +131,7 @@ def test_schedule_resumed_offloaded_request_reassigns_slot(self):
 
         def _resume(req):
             req.block_tables = [11, 12]
-            return True, req.num_computed_tokens, False
+            return True, req.num_computed_tokens
 
         manager.offload_manager.resume_decode.side_effect = _resume
         manager.requests[request.request_id] = request

From 75b1013e54c869b9868f08d9066171ebb55ad635 Mon Sep 17 00:00:00 2001
From: CyanScholar
Date: Wed, 25 Mar 2026 12:40:16 +0000
Subject: [PATCH 6/6] use transfer_manager

---
 fastdeploy/engine/sched/resource_manager_v1.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index 6969f3e3975..3db479710a7 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -271,6 +271,8 @@ def _get_pending_preempt_slots(self) -> set[int]:
     def _assign_rescheduled_slot(self, request):
-        allocated_position = self.get_available_position()
-        request.idx = allocated_position
+        allocated_position = request.idx
+        if allocated_position is None:
+            allocated_position = self.get_available_position()
+            request.idx = allocated_position
         self.tasks_list[allocated_position] = request
         self.stop_flags[allocated_position] = False
         self.req_dict[request.request_id] = allocated_position
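
For reviewers, a self-contained sketch of the slot-reservation protocol that PATCH 4/6 introduces and PATCH 6/6 finishes: a preempted slot stays reserved until the worker acknowledges the preemption, reschedule_preempt_task clears the request's idx, and _assign_rescheduled_slot reuses an idx only when the request still holds one. ToyScheduler, its method names, and the explicit ack step are illustrative stand-ins for ResourceManagerV1 and the PREEMPTED_TOKEN_ID signal, not the actual API.

    # Minimal sketch of the slot-reservation protocol above.
    # ToyScheduler is a hypothetical stand-in for ResourceManagerV1; only the
    # idx bookkeeping is modeled, not KV-cache blocks or the waiting queue.
    class ToyScheduler:
        def __init__(self, max_num_seqs: int):
            self.max_num_seqs = max_num_seqs
            self.tasks_list = [None] * max_num_seqs
            self.stop_flags = [True] * max_num_seqs  # True == slot is free
            self.pending_preempt = {}  # request_id -> idx awaiting worker ack

        def _pending_slots(self) -> set:
            # Mirrors _get_pending_preempt_slots(): slots whose stop flag is
            # set but whose preemption the worker has not acknowledged yet.
            return {idx for idx in self.pending_preempt.values() if idx is not None}

        def available_batch(self) -> int:
            free = sum(self.stop_flags)
            return max(free - len(self._pending_slots()), 0)

        def get_available_position(self) -> int:
            reserved = self._pending_slots()
            for idx in range(self.max_num_seqs):
                if self.stop_flags[idx] and idx not in reserved:
                    return idx
            raise RuntimeError("no free slot")

        def preempt(self, request_id: str, idx: int) -> None:
            # The slot is marked free for the worker, but stays reserved
            # here until the preemption is acknowledged.
            self.tasks_list[idx] = None
            self.stop_flags[idx] = True
            self.pending_preempt[request_id] = idx

        def ack_preemption(self, request_id: str) -> None:
            # Worker confirmed the slot produces no more tokens; release it.
            self.pending_preempt.pop(request_id, None)

        def assign_rescheduled_slot(self, request_id: str, idx=None) -> int:
            # Same shape as the fixed _assign_rescheduled_slot(): reuse idx
            # when the request still holds one, otherwise allocate afresh.
            if idx is None:
                idx = self.get_available_position()
            self.tasks_list[idx] = request_id
            self.stop_flags[idx] = False
            return idx

    sched = ToyScheduler(max_num_seqs=2)
    sched.assign_rescheduled_slot("req-a", idx=0)
    sched.preempt("req-a", idx=0)
    assert sched.available_batch() == 1       # slot 0 reserved until the ack
    assert sched.get_available_position() == 1
    sched.ack_preemption("req-a")
    assert sched.available_batch() == 2       # slot 0 is reusable again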
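The resume gate added in PATCH 5/6's _allocate_decode_and_extend reduces to a block-budget inequality: resume only when the freed blocks can hold the restored KV cache plus at least min_steps further decode steps for every running request. The helper below restates that arithmetic under the defaults visible in this series (block_size=64, min_steps=20); the function name is illustrative, since the real code inlines the check.

    # Hedged sketch of the anti-thrashing resume check from PATCH 5/6.
    def can_resume_without_thrashing(
        num_blocks_for_resume: int,  # blocks the offloaded request needs back
        num_running: int,            # currently running requests
        free_blocks: int,            # free GPU KV-cache blocks right now
        block_size: int = 64,        # tokens per KV-cache block (assumed default)
        min_steps: int = 20,         # decode steps each request should sustain
    ) -> bool:
        # ceil(min_steps / block_size): blocks one request consumes over min_steps
        blocks_per_request = (min_steps + block_size - 1) // block_size
        running_after_resume = num_running + 1
        needed = num_blocks_for_resume + running_after_resume * blocks_per_request
        return free_blocks >= needed

    # Example: resume needs 8 blocks back, 3 requests running, ceil(20/64) = 1,
    # so the budget is 8 + 4 * 1 = 12 blocks; with only 10 free the resume waits.
    assert not can_resume_without_thrashing(8, 3, 10)
    assert can_resume_without_thrashing(8, 3, 12)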