diff --git a/fastdeploy/model_executor/layers/sample/meta_data.py b/fastdeploy/model_executor/layers/sample/meta_data.py index d871108e737..bfb0776be45 100644 --- a/fastdeploy/model_executor/layers/sample/meta_data.py +++ b/fastdeploy/model_executor/layers/sample/meta_data.py @@ -67,3 +67,5 @@ class SamplingMetadata: # Add for HPU post-processing seq_lens_encoder: Optional[paddle.Tensor] = None seq_lens_decoder: Optional[paddle.Tensor] = None + # Flag forwarded from _prepare_inputs; the sampler checks it to warm up the top_p != 1.0 path + is_dummy_or_profile_run: bool = False diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index 0c6c11265ef..70d6cc029bc 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -221,6 +221,8 @@ def _sample_from_probs(probs, sampling_metadata, top_p=None, top_k=None, topp_se if need_top_k_sampling: probs = dispatch_top_k_renorm_probs(probs, top_k) next_tokens = _random_sample(probs, topp_seed=topp_seed) + if sampling_metadata.is_dummy_or_profile_run: # dummy/profile run: warm up the top_p != 1.0 path + _, next_tokens = top_k_top_p_sampling(probs, top_p, top_k, top_k_list, topp_seed=topp_seed) else: _, next_tokens = top_k_top_p_sampling( probs, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 31c7d491035..0ada1d193eb 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1383,6 +1383,7 @@ def _prepare_inputs(self, cached_token_num=-1, cached_real_bsz=-1, is_dummy_or_p top_p_normalized_logprobs=self.share_inputs["top_p_normalized_logprobs"], logits_processors=self.share_inputs["logits_processors"], share_inputs=self.share_inputs, + is_dummy_or_profile_run=is_dummy_or_profile_run, ) return token_num, token_num_event