diff --git a/lightllm/common/kv_trans_kernel/nixl_kv_trans.py b/lightllm/common/kv_trans_kernel/nixl_kv_trans.py index c753a85c8e..f95cebbc18 100644 --- a/lightllm/common/kv_trans_kernel/nixl_kv_trans.py +++ b/lightllm/common/kv_trans_kernel/nixl_kv_trans.py @@ -10,6 +10,8 @@ @triton.jit def _page_io( mem_index_ptr, + token_num, + page_write_head_num, k_page_ptr, k_page_stride_size, k_page_stride_layer_num, @@ -45,88 +47,91 @@ def _page_io( k_stride_size = tl.cast(k_stride_size, dtype=tl.int64) v_stride_size = tl.cast(v_stride_size, dtype=tl.int64) - tid = tl.program_id(0) - kv_head_id = tl.program_id(1) - page_head_id = page_head_start + kv_head_id + start_index = tl.program_id(0) + grid_num = tl.num_programs(0) - mem_index = tl.load(mem_index_ptr + tid) - off_dim = tl.arange(0, HEAD_DIM_BLOCK) - if NEED_MASK: - mask = off_dim < head_dim - else: - mask = None + for tid in tl.range(start_index, token_num, step=grid_num): + for kv_head_id in tl.range(page_write_head_num): - for layer_index in tl.range(layer_num, num_stages=3): - if IS_WRITE: - k_tensor = tl.load( - k_ptr - + layer_index * k_stride_layer_num - + mem_index * k_stride_size - + kv_head_id * k_stride_head - + off_dim * k_stride_dim, - mask=mask, - ) - v_tensor = tl.load( - v_ptr - + layer_index * v_stride_layer_num - + mem_index * v_stride_size - + kv_head_id * v_stride_head - + off_dim * v_stride_dim, - mask=mask, - ) - tl.store( - k_page_ptr - + tid * k_page_stride_size - + layer_index * k_page_stride_layer_num - + page_head_id * k_page_stride_head - + off_dim * k_page_stride_dim, - k_tensor, - mask=mask, - ) - tl.store( - v_page_ptr - + tid * v_page_stride_size - + layer_index * v_page_stride_layer_num - + page_head_id * v_page_stride_head - + off_dim * v_page_stride_dim, - v_tensor, - mask=mask, - ) - else: - k_page_tensor = tl.load( - k_page_ptr - + tid * k_page_stride_size - + layer_index * k_page_stride_layer_num - + page_head_id * k_page_stride_head - + off_dim * k_page_stride_dim, - mask=mask, - ) - v_page_tensor = tl.load( - v_page_ptr - + tid * v_page_stride_size - + layer_index * v_page_stride_layer_num - + page_head_id * v_page_stride_head - + off_dim * v_page_stride_dim, - mask=mask, - ) - tl.store( - k_ptr - + layer_index * k_stride_layer_num - + mem_index * k_stride_size - + kv_head_id * k_stride_head - + off_dim * k_stride_dim, - k_page_tensor, - mask=mask, - ) - tl.store( - v_ptr - + layer_index * v_stride_layer_num - + mem_index * v_stride_size - + kv_head_id * v_stride_head - + off_dim * v_stride_dim, - v_page_tensor, - mask=mask, - ) + page_head_id = page_head_start + kv_head_id + mem_index = tl.load(mem_index_ptr + tid) + off_dim = tl.arange(0, HEAD_DIM_BLOCK) + if NEED_MASK: + mask = off_dim < head_dim + else: + mask = None + + for layer_index in tl.range(layer_num, num_stages=3): + if IS_WRITE: + k_tensor = tl.load( + k_ptr + + layer_index * k_stride_layer_num + + mem_index * k_stride_size + + kv_head_id * k_stride_head + + off_dim, + mask=mask, + ) + v_tensor = tl.load( + v_ptr + + layer_index * v_stride_layer_num + + mem_index * v_stride_size + + kv_head_id * v_stride_head + + off_dim, + mask=mask, + ) + tl.store( + k_page_ptr + + tid * k_page_stride_size + + layer_index * k_page_stride_layer_num + + page_head_id * k_page_stride_head + + off_dim, + k_tensor, + mask=mask, + ) + tl.store( + v_page_ptr + + tid * v_page_stride_size + + layer_index * v_page_stride_layer_num + + page_head_id * v_page_stride_head + + off_dim, + v_tensor, + mask=mask, + ) + else: + k_page_tensor = tl.load( + k_page_ptr + + tid * k_page_stride_size + + layer_index * k_page_stride_layer_num + + page_head_id * k_page_stride_head + + off_dim, + mask=mask, + ) + v_page_tensor = tl.load( + v_page_ptr + + tid * v_page_stride_size + + layer_index * v_page_stride_layer_num + + page_head_id * v_page_stride_head + + off_dim, + mask=mask, + ) + tl.store( + k_ptr + + layer_index * k_stride_layer_num + + mem_index * k_stride_size + + kv_head_id * k_stride_head + + off_dim, + k_page_tensor, + mask=mask, + ) + tl.store( + v_ptr + + layer_index * v_stride_layer_num + + mem_index * v_stride_size + + kv_head_id * v_stride_head + + off_dim, + v_page_tensor, + mask=mask, + ) return @@ -169,10 +174,17 @@ def page_io( page_head_start = tp_index * (page_write_head_num) token_num = len(mem_indexes) - grid = (token_num, page_write_head_num) + grid = (128,) + + assert k_page_tensor.stride(3) == 1 + assert v_page_tensor.stride(3) == 1 + assert k_buffer.stride(3) == 1 + assert v_buffer.stride(3) == 1 _page_io[grid]( mem_index_ptr=mem_indexes, + token_num=token_num, + page_write_head_num=page_write_head_num, k_page_ptr=k_page_tensor, k_page_stride_size=k_page_tensor.stride(0), k_page_stride_layer_num=k_page_tensor.stride(1), @@ -207,6 +219,7 @@ def page_io( @triton.jit def _mla_page_io( mem_index_ptr, + token_num, page_ptr, page_stride_size, page_stride_layer_num, @@ -227,52 +240,54 @@ def _mla_page_io( kv_stride_layer_num = tl.cast(kv_stride_layer_num, dtype=tl.int64) kv_stride_size = tl.cast(kv_stride_size, dtype=tl.int64) - tid = tl.program_id(0) + start_index = tl.program_id(0) + grid_num = tl.num_programs(0) - mem_index = tl.load(mem_index_ptr + tid) - off_dim = tl.arange(0, HEAD_DIM_BLOCK) - if NEED_MASK: - mask = off_dim < head_dim - else: - mask = None - - for layer_index in tl.range(layer_num, num_stages=3): - if IS_WRITE: - kv_tensor = tl.load( - kv_ptr - + layer_index * kv_stride_layer_num - + mem_index * kv_stride_size - + 0 * kv_stride_head - + off_dim * kv_stride_dim, - mask=mask, - ) - tl.store( - page_ptr - + tid * page_stride_size - + layer_index * page_stride_layer_num - + 0 * page_stride_head - + off_dim * page_stride_dim, - kv_tensor, - mask=mask, - ) + for tid in tl.range(start_index, token_num, step=grid_num): + mem_index = tl.load(mem_index_ptr + tid) + off_dim = tl.arange(0, HEAD_DIM_BLOCK) + if NEED_MASK: + mask = off_dim < head_dim else: - page_tensor = tl.load( - page_ptr - + tid * page_stride_size - + layer_index * page_stride_layer_num - + 0 * page_stride_head - + off_dim * page_stride_dim, - mask=mask, - ) - tl.store( - kv_ptr - + layer_index * kv_stride_layer_num - + mem_index * kv_stride_size - + 0 * kv_stride_head - + off_dim * kv_stride_dim, - page_tensor, - mask=mask, - ) + mask = None + + for layer_index in tl.range(layer_num, num_stages=3): + if IS_WRITE: + kv_tensor = tl.load( + kv_ptr + + layer_index * kv_stride_layer_num + + mem_index * kv_stride_size + + 0 * kv_stride_head + + off_dim * kv_stride_dim, + mask=mask, + ) + tl.store( + page_ptr + + tid * page_stride_size + + layer_index * page_stride_layer_num + + 0 * page_stride_head + + off_dim * page_stride_dim, + kv_tensor, + mask=mask, + ) + else: + page_tensor = tl.load( + page_ptr + + tid * page_stride_size + + layer_index * page_stride_layer_num + + 0 * page_stride_head + + off_dim * page_stride_dim, + mask=mask, + ) + tl.store( + kv_ptr + + layer_index * kv_stride_layer_num + + mem_index * kv_stride_size + + 0 * kv_stride_head + + off_dim * kv_stride_dim, + page_tensor, + mask=mask, + ) return @@ -290,10 +305,11 @@ def mla_page_io(mem_indexes: torch.Tensor, page_tensor: torch.Tensor, kv_buffer: assert page_head_num == kv_head_num == 1 token_num = len(mem_indexes) - grid = (token_num,) + grid = (64,) _mla_page_io[grid]( mem_index_ptr=mem_indexes, + token_num=token_num, page_ptr=page_tensor, page_stride_size=page_tensor.stride(0), page_stride_layer_num=page_tensor.stride(1), diff --git a/lightllm/server/httpserver_for_pd_master/manager.py b/lightllm/server/httpserver_for_pd_master/manager.py index 307a3d48a8..7b4b8ccaad 100644 --- a/lightllm/server/httpserver_for_pd_master/manager.py +++ b/lightllm/server/httpserver_for_pd_master/manager.py @@ -331,7 +331,7 @@ async def fetch_nixl_stream( ) try: - await asyncio.wait_for(up_status_event.wait(), timeout=60) + await asyncio.wait_for(up_status_event.wait(), timeout=180) except asyncio.TimeoutError: logger.warning(f"group_request_id: {group_request_id} kv move time out err, server is busy now.") raise ServerBusyError() diff --git a/lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_impl.py b/lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_impl.py index 481a3197d7..3ffa15b154 100644 --- a/lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_impl.py +++ b/lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_impl.py @@ -178,14 +178,17 @@ def _create_nixl_trans_task( ): # 确定传输设备 if req_obj.nixl_trans_device_id == -1: + if not hasattr(self, "nixl_iter_device_id"): + self.nixl_iter_device_id = 0 + req_obj.nixl_trans_device_id = self.nixl_iter_device_id # only self.is_master_in_dp will be used. - req_obj.nixl_trans_device_id = random.randint(0, self.node_world_size - 1) + self.nixl_iter_device_id = (self.nixl_iter_device_id + 1) % self.node_world_size trans_task = NIXLChunckedTransTask( request_id=req_obj.req_id, start_kv_index=kv_start_index, end_kv_index=kv_end_index, - time_out_secs=80, + time_out_secs=180, pd_master_node_id=req_obj.sampling_param.pd_master_node_id, prefill_dp_index=None, decode_dp_index=self.dp_rank_in_node, diff --git a/lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_trans_process.py b/lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_trans_process.py index 7913865406..49e40c284a 100644 --- a/lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_trans_process.py +++ b/lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_trans_process.py @@ -49,6 +49,29 @@ def _init_env( task_out_queue: mp.Queue, up_status_in_queue: Optional[mp.SimpleQueue], ): + import os + + # ------------------------------------------------------------------------- + # 问题背景(PD NIXL + 同卡多进程): + # decode 物理 GPU 上至少有两个独立 CUDA 进程:model_infer(解码推理)与 + # nixl_decode_trans(把 prefill 侧 KV page 拷入 decode KV cache)。 + # lm_eval batch=64 时会在短时间内并发大量 read_page;拷贝在 copy_cuda_stream + # 上排队,而推理在另一进程的 stream 上执行,彼此无法 cudaStreamWaitEvent + # 协调。日志里的 read_page_gpu_time(event 差值)会把「等 GPU 时间片 / + # 与推理争抢 SM」算进去,出现数十秒级毛刺,但并不代表单次 memcpy 真那么慢。 + # + # 解决思路:依赖 NVIDIA MPS(Multi-Process Service)在同一 GPU 上多进程 + # 共享上下文并做客户端级调度;在子进程 import torch / 创建 CUDA 上下文 + # **之前**设置下列环境变量(故必须放在本函数最前)。 + # + # CUDA_MPS_CLIENT_PRIORITY="0": + # MPS 下数值越小优先级越高。decode 侧 KV 拷贝处于 decode 关键路径(须先 + # 落盘 KV 才能出首 token),故给 trans 进程最高优先级,减轻被同卡推理 + # 饿死导致的排队放大。须集群已启动 nvidia-cuda-mps-control / mps-server, + # 否则该变量不生效。 启动 mps 的命令为 nvidia-cuda-mps-control -d + # ------------------------------------------------------------------------- + os.environ["CUDA_MPS_CLIENT_PRIORITY"] = "0" + torch.backends.cudnn.enabled = False setproctitle.setproctitle(f"lightllm::{get_unique_server_name()}::nixl_decode_trans:Device{device_id}") @@ -177,7 +200,7 @@ def dispatch_task_loop(self): self.waiting_dict[task.get_key()] = task else: task.start_trans_time = time.time() - self.success_queue.put((None, task)) + self.success_queue.put((None, None, task)) # up status task = trans_task_group.task_list[0] @@ -335,7 +358,10 @@ def read_page_to_mems_loop(self): while True: trans_task: NIXLChunckedTransTask = self.ready_page_task_queue.get() # 将数据写回 mem manger + copy_start_event = torch.cuda.Event(enable_timing=True) + copy_end_event = torch.cuda.Event(enable_timing=True) with torch.cuda.stream(stream=self.copy_cuda_stream): + copy_start_event.record(self.copy_cuda_stream) cur_mem = self.mem_managers[self.device_id] cur_mem.read_page_kv_move_buffer_to_mem( mem_indexes=trans_task.mem_indexes, @@ -344,22 +370,21 @@ def read_page_to_mems_loop(self): mem_managers=self.mem_managers, dp_world_size=self.dp_world_size, ) - sync_event = torch.cuda.Event() - sync_event.record() + copy_end_event.record(self.copy_cuda_stream) - self.success_queue.put((sync_event, trans_task)) + self.success_queue.put((copy_end_event, copy_start_event, trans_task)) return @log_exception def success_loop(self): torch.cuda.set_device(self.device_id) while True: - sync_event, trans_task = self.success_queue.get() + copy_end_event, copy_start_event, trans_task = self.success_queue.get() trans_task: NIXLChunckedTransTask = trans_task - sync_event: Optional[torch.cuda.Event] = sync_event - # 兼容传输kv 数量为0的时候, sync_event 为 None的情况。 - if sync_event is not None: - sync_event.synchronize() + read_page_gpu_time_ms = -1.0 + if copy_end_event is not None: + copy_end_event.synchronize() + read_page_gpu_time_ms = copy_start_event.elapsed_time(copy_end_event) if trans_task.nixl_dst_page_index is not None: self.page_index_queue.put(trans_task.nixl_dst_page_index) @@ -369,7 +394,13 @@ def success_loop(self): ret = trans_task.createRetObj() self.task_out_queue.put(ret) - logger.info(f"trans task ret success:{ret} cost time: {trans_task.transfer_time()} s") + if read_page_gpu_time_ms >= 0: + logger.info( + f"trans task ret success:{ret} cost time: {trans_task.transfer_time()} s " + f"read_page_gpu_time: {read_page_gpu_time_ms:.3f} ms" + ) + else: + logger.info(f"trans task ret success:{ret} cost time: {trans_task.transfer_time()} s") @log_exception def fail_loop(self): diff --git a/lightllm/server/router/model_infer/mode_backend/pd_nixl/nixl_kv_transporter.py b/lightllm/server/router/model_infer/mode_backend/pd_nixl/nixl_kv_transporter.py index 134fbd5027..dd4b1851bd 100644 --- a/lightllm/server/router/model_infer/mode_backend/pd_nixl/nixl_kv_transporter.py +++ b/lightllm/server/router/model_infer/mode_backend/pd_nixl/nixl_kv_transporter.py @@ -1,5 +1,6 @@ import pickle import copy +import time from dataclasses import dataclass from collections import defaultdict from typing import Dict, List, Any, Optional, Tuple @@ -63,6 +64,8 @@ def connect_add_remote_agent(self, remote_agent: NixlAgentMetadata): if remote_agent.agent_name in self.remote_agents: return + start_time = time.time() + peer_name = self.nixl_agent.add_remote_agent(remote_agent.agent_metadata) if isinstance(peer_name, bytes): peer_name = peer_name.decode() @@ -77,7 +80,9 @@ def connect_add_remote_agent(self, remote_agent: NixlAgentMetadata): ) remote_agent.page_xfer_handles = kv_page_xfer_handles - logger.info(f"Added remote agent {peer_name} with mem desc {page_mem_desc}") + logger.info( + f"Added remote agent {peer_name} with mem desc {page_mem_desc} cost time: {time.time() - start_time} s" + ) self.remote_agents[remote_agent.agent_name] = remote_agent return diff --git a/lightllm/server/router/model_infer/mode_backend/pd_nixl/prefill_node_impl/prefill_impl.py b/lightllm/server/router/model_infer/mode_backend/pd_nixl/prefill_node_impl/prefill_impl.py index 20c487d56e..b75c60b8ca 100644 --- a/lightllm/server/router/model_infer/mode_backend/pd_nixl/prefill_node_impl/prefill_impl.py +++ b/lightllm/server/router/model_infer/mode_backend/pd_nixl/prefill_node_impl/prefill_impl.py @@ -89,7 +89,10 @@ def _create_nixl_trans_task( ) -> NIXLChunckedTransTask: # 确定传输设备 if req_obj.nixl_trans_device_id == -1: - req_obj.nixl_trans_device_id = random.randint(0, self.node_world_size - 1) + if not hasattr(self, "nixl_iter_device_id"): + self.nixl_iter_device_id = 0 + req_obj.nixl_trans_device_id = self.nixl_iter_device_id + self.nixl_iter_device_id = (self.nixl_iter_device_id + 1) % self.node_world_size nixl_decode_node_info = req_obj.sampling_param.nixl_decode_node mem_indexes = ( @@ -102,7 +105,7 @@ def _create_nixl_trans_task( request_id=req_obj.req_id, start_kv_index=kv_start_index, end_kv_index=kv_end_index, - time_out_secs=82, + time_out_secs=182, pd_master_node_id=req_obj.sampling_param.pd_master_node_id, prefill_dp_index=self.dp_rank_in_node, decode_dp_index=None, diff --git a/lightllm/server/router/model_infer/mode_backend/pd_nixl/prefill_node_impl/prefill_trans_process.py b/lightllm/server/router/model_infer/mode_backend/pd_nixl/prefill_node_impl/prefill_trans_process.py index 7975a253f1..115077ebc7 100644 --- a/lightllm/server/router/model_infer/mode_backend/pd_nixl/prefill_node_impl/prefill_trans_process.py +++ b/lightllm/server/router/model_infer/mode_backend/pd_nixl/prefill_node_impl/prefill_trans_process.py @@ -42,6 +42,13 @@ def _init_env( task_in_queue: mp.Queue, task_out_queue: mp.Queue, ): + + import os + + # prefill 节点不一定需要 mps 来协调,所以优先级设置为 1. + # 本身并不产生严重的阻塞。 + os.environ["CUDA_MPS_CLIENT_PRIORITY"] = "1" + torch.backends.cudnn.enabled = False setproctitle.setproctitle(f"lightllm::{get_unique_server_name()}::nixl_prefill_trans:Device{device_id}") diff --git a/skills/test_model/qwen3-8b-pd-nixl/SKILL.md b/skills/test_model/qwen3-8b-pd-nixl/SKILL.md index cf62a3a4dd..cee20bc3d7 100644 --- a/skills/test_model/qwen3-8b-pd-nixl/SKILL.md +++ b/skills/test_model/qwen3-8b-pd-nixl/SKILL.md @@ -7,7 +7,8 @@ description: >- UCX_NET_DEVICES and TLS for RDMA per cluster. lm_eval hits pd_master URL. HOST vs PD_MASTER_IP when co-located. Before lm_eval, must POST one completion via curl to pd_master for warmup verification. Requires LOG_DIR, MODEL_DIR, proxy cleared, no_proxy, - summary.txt. Use for PD NIXL-style separation tests. + summary.txt. Same-GPU model_infer + nixl_*_trans need NVIDIA MPS for best KV copy perf; + record MPS on/off in summary. Use for PD NIXL-style separation tests. --- # Qwen3-8B **PD 分离(NIXL)**(`pd_master` + `nixl_prefill` + `nixl_decode`)本地 GSM8K 评测 @@ -24,7 +25,7 @@ description: >- - 每次评测先选定或新建**一个日志目录**(例如带时间戳或任务名),与其它测试轮次分开。 - **三个 `api_server` 的标准输出/错误**分别写入该目录,建议命名:**`pd_master.log`**、**`prefill.log`**、**`decode.log`**(文件名可沿用习惯,与 NCCL 测试一致便于对比)。 -- **`summary.txt` 固定放在该日志目录下**,汇总:三台进程的启动参数摘要、端口与就绪情况、**UCX 配置要点**、`lm_eval` 关键结果、失败原因与最终结论。 +- **`summary.txt` 固定放在该日志目录下**,汇总:三台进程的启动参数摘要、端口与就绪情况、**UCX 配置要点**、**MPS 是否开启**、**KV 传输指标**、`lm_eval` 关键结果、失败原因与最终结论。 - **`eval_gsm8k.log`**:`lm_eval` 终端输出;**`curl_warmup.log`**:测试前 **`curl`** 打 **`pd_master`** 的留档(建议);**`summary.txt`** 仍为**总览结论**。 ## 启动说明 @@ -38,6 +39,7 @@ description: >- 3. **网络 / IP**:**`HOST`** 与 **`PD_MASTER_IP`** 约定同 NCCL PD skill;单机三进程 **`export HOST="${PD_MASTER_IP}"`**。 4. **代理**:启动 **任一 server 前**将 **`http_proxy` / `https_proxy` 置空**;评测使用 **`no_proxy`**(见评测命令)。 5. **RDMA / UCX**:prefill 与 decode 进程在启动 Python 前须设置 **`UCX_NET_DEVICES`**(及可选 **`UCX_LOG_LEVEL`**、**`UCX_TLS`**),取值依赖本机 **`ibv_devinfo`** 与机房拓扑(见「UCX / RDMA」);**不要**默认照抄他机上的设备名或排除列表。 +6. **CUDA MPS(强烈建议,见下节)**:**要达到 NIXL PD 最优 KV 拷贝与 batch 评测性能,须在启动 `api_server` 之前在本机启用 NVIDIA MPS**。未开 MPS 时功能通常仍可用,但易出现 **`read_page_gpu_time` 数十秒级毛刺**、**`lm_eval` 单 batch 近百秒**;**`summary.txt` 须写明 MPS 是否已开启及验证方式**。 ### 启动服务的命令模板(可变项) @@ -80,6 +82,26 @@ export UCX_TLS=rc,cuda,gdr_copy - **`UCX_NET_DEVICES`**:须覆盖本进程要用的 **RDMA 设备**;是否排除某些 HCA(例如数据面网卡)由**本机拓扑**决定,在 **`summary.txt`** 中写明依据。 - **`UCX_TLS`**:常见 **`rc,cuda,gdr_copy`**;若环境不支持再按报错调整。 +#### 要达到最优性能:须开启 MPS + +如果用户没有特别说明要开启 mps,测试的时候可以不开启。 + +1. **在启动任意 `api_server` 之前**,按机房规范启动 MPS(示例,**以本集群文档为准**): + +```bash +# 确认无其它任务占用目标 GPU 后再执行;具体参数问运维 +export CUDA_VISIBLE_DEVICES="${PREFILL_CUDA_DEVICES},${DECODE_CUDA_DEVICES}" # 或整机 MPS,按规范 +nvidia-cuda-mps-control -d +# 验证:nvidia-smi 应出现 nvidia-cuda-mps-server,且各 GPU 有少量固定占用 +``` + +2. **验证 MPS 已生效**(写入 **`summary.txt`**): + +```bash +nvidia-smi --query-compute-apps=pid,process_name --format=csv | grep -i mps || true +pgrep -a mps-control || pgrep -a cuda-mps +``` + ### 1)启动 `pd_master`(须最先就绪监听) ```bash