Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 14 additions & 19 deletions custom_ops/xpu_ops/src/ops/get_padding_offset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,29 +16,29 @@
#include "paddle/extension.h"
#include "xpu/plugin.h"

std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
const paddle::Tensor &cum_offsets,
const paddle::Tensor &token_num,
const paddle::Tensor &seq_len) {
std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor& input_ids,
const paddle::Tensor& seq_len,
const int64_t cpu_token_num) {
phi::XPUPlace place(phi::backends::xpu::GetXPUCurrentDeviceId());
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext *>(dev_ctx);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);

std::vector<int64_t> input_ids_shape = input_ids.shape();
const int bsz = seq_len.shape()[0];
const int seq_length = input_ids_shape[1];
auto cum_offsets_out = cum_offsets.copy_to(cum_offsets.place(), false);
auto cpu_token_num = token_num.copy_to(paddle::CPUPlace(), false);
const int token_num_data = static_cast<int>(cpu_token_num);

Comment on lines 26 to 30
Copy link

Copilot AI Mar 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cpu_token_num 直接 static_cast 会在超出 int 范围时截断,进而造成输出张量 shape 分配错误甚至越界访问。建议对 cpu_token_num 做范围校验(>=0、<=bsz*seq_length、<=INT_MAX),并尽量在后续逻辑里保持 int64_t 以避免隐式溢出。

Copilot uses AI. Check for mistakes.
const int token_num_data = cpu_token_num.data<int64_t>()[0];
auto x_remove_padding = paddle::full(
{token_num_data}, 0, paddle::DataType::INT64, input_ids.place());
auto batch_id_per_token = paddle::full(
{token_num_data}, 0, paddle::DataType::INT32, input_ids.place());
auto cum_offsets_out =
paddle::full({bsz}, 0, paddle::DataType::INT32, input_ids.place());
auto cu_seqlens_q =
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
auto cu_seqlens_k =
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());

if (token_num_data > 0) {
int r =
fastdeploy::plugin::get_padding_offset(xpu_ctx->x_context(),
Expand All @@ -48,7 +48,6 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
cu_seqlens_k.data<int>(),
x_remove_padding.data<int64_t>(),
input_ids.data<int64_t>(),
cum_offsets.data<int>(),
seq_len.data<int>(),
seq_length,
bsz,
Expand All @@ -64,20 +63,15 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
}

// InferShape for the get_padding_offset custom op.
// Inputs: input_ids_shape = [bsz, seq_len], seq_len_shape = [bsz].
// Returns shapes for {x_remove_padding, cum_offsets_out, batch_id_per_token,
// cu_seqlens_q, cu_seqlens_k}; the token-count-dependent dims are dynamic (-1)
// because the real token count is only known at kernel time.
std::vector<std::vector<int64_t>> GetPaddingOffsetInferShape(
    const std::vector<int64_t>& input_ids_shape,
    const std::vector<int64_t>& seq_len_shape) {
  const int64_t bsz = seq_len_shape[0];
  return {{-1}, {bsz}, {-1}, {bsz + 1}, {bsz + 1}};
}

std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
const paddle::DataType &input_ids_dtype,
const paddle::DataType &cum_offsets_dtype,
const paddle::DataType &token_num_dtype,
const paddle::DataType &seq_len_dtype) {
const paddle::DataType& input_ids_dtype,
const paddle::DataType& seq_len_dtype) {
return {input_ids_dtype,
seq_len_dtype,
seq_len_dtype,
Expand All @@ -86,12 +80,13 @@ std::vector<paddle::DataType> GetPaddingOffsetInferDtype(
}

// Op registration: after the refactor the op takes only input_ids/seq_len as
// tensors; the total unpadded token count is passed as an int64_t attribute
// (cpu_token_num) instead of the former cum_offsets/token_num tensor inputs.
PD_BUILD_OP(get_padding_offset)
    .Inputs({"input_ids", "seq_len"})
    .Outputs({"x_remove_padding",
              "cum_offsets_out",
              "batch_id_per_token",
              "cu_seqlens_q",
              "cu_seqlens_k"})
    .Attrs({"cpu_token_num: int64_t"})
    .SetKernelFn(PD_KERNEL(GetPaddingOffset))
    .SetInferShapeFn(PD_INFER_SHAPE(GetPaddingOffsetInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(GetPaddingOffsetInferDtype));
8 changes: 3 additions & 5 deletions custom_ops/xpu_ops/src/ops/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -456,9 +456,8 @@ void GetOutputEPDynamic(const paddle::Tensor& x,
int msg_queue_id);

// Forward declaration of the XPU get_padding_offset op kernel (implemented in
// ops/get_padding_offset.cc). cpu_token_num is the host-side total unpadded
// token count, replacing the former cum_offsets/token_num tensor inputs.
std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor& input_ids,
                                             const paddle::Tensor& seq_len,
                                             const int64_t cpu_token_num);

void GetStopFlagsMulti(const paddle::Tensor& topk_ids,
const paddle::Tensor& stop_flags,
Expand Down Expand Up @@ -975,9 +974,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
// Python binding: argument list mirrors the refactored C++ signature
// (input_ids, seq_len, cpu_token_num).
m.def("get_padding_offset",
      &GetPaddingOffset,
      py::arg("input_ids"),
      py::arg("seq_len"),
      py::arg("cpu_token_num"),
      "get padding offset function");

m.def("init_kv_signal_per_query",
Expand Down
3 changes: 1 addition & 2 deletions custom_ops/xpu_ops/src/plugin/include/xpu/plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,12 @@ DLL_EXPORT int token_penalty_multi_scores(api::Context* ctx,
const int64_t length_bad_words);

DLL_EXPORT int get_padding_offset(api::Context* ctx,
int* padding_offset,
int* batch_id_per_token,
int* cum_offsets_out,
int* cu_seqlens_q,
int* cu_seqlens_k,
int64_t* x_remove_padding,
const int64_t* input_ids,
const int* cum_offsets,
const int* seq_lens,
const int max_seq_len,
const int bs,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,50 +1,123 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_simd.h"

namespace fd_xpu3 {

__global__ void get_padding_offset(int *batch_id_per_token,
int *cum_offsets_out,
int *cu_seqlens_q,
int *cu_seqlens_k,
const int *cum_offsets,
const int *seq_lens,
#define MAX_BATCH_SIZE 1024

// Horizontal sum of the 16 int32 lanes of v0, returned as a scalar.
// NOTE(review): vsrlp_int32x16 presumably shifts the vector by a lane
// distance encoded in its immediate operand (1 << 8 down to 1 << 5),
// halving the active span each step in a log2 shuffle/add tree — confirm
// against the XPU intrinsic reference. v0 is clobbered in place.
static inline __device__ int v_reduce_sum_int32(int32x16_t& v0) {
auto v1 = vsrlp_int32x16(1 << 8, v0);
v0 = vvadd_int32x16(v0, v1);
v1 = vsrlp_int32x16(1 << 7, v0);
v0 = vvadd_int32x16(v0, v1);
v1 = vsrlp_int32x16(1 << 6, v0);
v0 = vvadd_int32x16(v0, v1);
v1 = vsrlp_int32x16(1 << 5, v0);
v0 = vvadd_int32x16(v0, v1);
// The final sum is extracted from lane index 1 — presumably where the
// reduction tree deposits it; verify vextract_int32x16 lane semantics.
return vextract_int32x16(v0, 1);
}

// Sums `len` int32 values located in shared memory at x.
// Processes 32 elements per iteration (two 16-lane vector registers),
// accumulating into a single vector that is then horizontally reduced.
inline __device__ int primitive_reduce_sum_sm(__shared_ptr__ const int* x,
int64_t len) {
int32x16_t x_l, x_h;
int32x16_t sum = vset_zero_int();
// Largest multiple of 32 not exceeding len; bulk loop covers this part.
const auto rounddown_len = rounddown32(len);

for (int64_t i = 0; i < rounddown_len; i += 32) {
vload2_sm(x + i, x_l, x_h);
sum = vvadd_int32x16(sum, x_l);
sum = vvadd_int32x16(sum, x_h);
}

// Tail of fewer than 32 elements: masked load zero-fills inactive lanes
// so they contribute nothing to the sum.
if (rounddown_len < len) {
// NOTE(review): `-1 << n` left-shifts a negative signed value, which is
// undefined behavior before C++20; ~(~0u << (len - rounddown_len)) would
// compute the same low-bit mask without UB — confirm and fix upstream.
const auto mask = ~(-1 << (len - rounddown_len));
vload2_sm_mz(x + rounddown_len, x_l, x_h, mask);
sum = vvadd_int32x16(sum, x_l);
sum = vvadd_int32x16(sum, x_h);
}
return v_reduce_sum_int32(sum);
}

// XPU kernel: compacts the padded [bs, max_seq_len] input_data into the
// contiguous ids_remove_padding buffer, writes each token's batch id into
// batch_id_per_token, and produces the cu_seqlens_q/cu_seqlens_k prefix-sum
// arrays plus per-batch cum_offsets_out (padding slots skipped before each
// row). One batch row (bi) is handled per cluster; cores within the cluster
// cooperate on partial sums and the data copy.
__global__ void get_padding_offset(int64_t* ids_remove_padding,
                                   int* batch_id_per_token,
                                   int* cum_offsets_out,
                                   int* cu_seqlens_q,
                                   int* cu_seqlens_k,
                                   const int64_t* input_data,
                                   const int* seq_lens,
                                   const int max_seq_len,
                                   const int bs) {
  int cid = core_id();
  int ncores = core_num();
  int clusterid = cluster_id();
  int nclusters = cluster_num();

  // sm_seq_lens below is a fixed-size shared buffer; bail out on batches
  // that would overflow it. Every core sees the same bs, so all take this
  // branch together and no barrier is skipped by a subset of threads.
  if (bs > MAX_BATCH_SIZE) {
    return;
  }

  __shared__ int sm_seq_lens[MAX_BATCH_SIZE];
  __shared__ int sm_cum_seq_len;
  __simd__ __shared__ int buffer_cu_seqlens[64];

  // Stage all per-batch sequence lengths into shared memory once.
  if (cid == 0) {
    GM2SM(seq_lens, sm_seq_lens, sizeof(int) * bs);
  }
  sync_all();

  // NOTE(review): sync_all() is called inside a clusterid-strided loop; if
  // sync_all is a cross-cluster barrier this deadlocks when clusters run
  // different iteration counts (bs % nclusters != 0) — confirm its scope.
  for (int bi = clusterid; bi < bs; bi += nclusters) {
    // Each core accumulates a strided slice of seq_lens[0..bi]; core 0 then
    // reduces the per-core partials into the inclusive prefix sum for bi.
    int cum_seq_len = 0;
    for (int i = cid; i <= bi; i += ncores) {
      cum_seq_len += sm_seq_lens[i];
    }
    buffer_cu_seqlens[cid] = cum_seq_len;
    mfence();
    sync_all();

    if (cid == 0) {
      cum_seq_len =
          primitive_reduce_sum_sm(buffer_cu_seqlens, min(bi + 1, ncores));

      LM2GM_ASYNC(&cum_seq_len, cu_seqlens_q + bi + 1, sizeof(int));
      LM2GM_ASYNC(&cum_seq_len, cu_seqlens_k + bi + 1, sizeof(int));

      // Number of padding slots that precede row bi's first valid token.
      int cum_offset = bi * max_seq_len - (cum_seq_len - sm_seq_lens[bi]);
      LM2GM(&cum_offset, cum_offsets_out + bi, sizeof(int));

      // Publish the prefix sum so all cores can compute tgt_offset below.
      sm_cum_seq_len = cum_seq_len;
    }
    mfence();
    sync_all();

    const int lm_seq_lens = sm_seq_lens[bi];
    const int tgt_offset = sm_cum_seq_len - lm_seq_lens;
    const int buf_len = 32;
    __simd__ int64_t input_lm[buf_len];
    __simd__ int batch_id_lm[buf_len];

    // Pre-fill a local tile of this row's batch id once per iteration.
    for (int k = 0; k < buf_len; k++) {
      batch_id_lm[k] = bi;
    }
    mfence_lm();

    // Copy row bi's valid tokens to their unpadded destination and tag each
    // token position with its batch id, buf_len tokens per core per step.
    for (int j = cid * buf_len; j < lm_seq_lens; j += ncores * buf_len) {
      int cur_len = min(lm_seq_lens - j, buf_len);
      GM2LM(input_data + bi * max_seq_len + j,
            input_lm,
            sizeof(int64_t) * cur_len);
      LM2GM(input_lm,
            ids_remove_padding + tgt_offset + j,
            sizeof(int64_t) * cur_len);
      LM2GM(batch_id_lm,
            batch_id_per_token + tgt_offset + j,
            sizeof(int) * cur_len);
    }
    mfence();
    sync_all();
  }

  // cu_seqlens arrays are exclusive prefixes of length bs + 1; element 0 is
  // always zero and is written once by a single thread.
  if (cid == 0 && clusterid == 0) {
    const int lm_zero = 0;
    LM2GM_ASYNC(&lm_zero, cu_seqlens_q, sizeof(int));
    LM2GM(&lm_zero, cu_seqlens_k, sizeof(int));
  }
}

Expand Down
Loading
Loading