5 changes: 3 additions & 2 deletions include/infiniop.h
@@ -15,6 +15,9 @@
#include "infiniop/ops/lp_norm.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/ones.h"
#include "infiniop/ops/paged_attention.h"
#include "infiniop/ops/paged_attention_prefill.h"
#include "infiniop/ops/paged_caching.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/relu.h"
@@ -31,7 +34,5 @@
#include "infiniop/ops/topksoftmax.h"
#include "infiniop/ops/zeros.h"
#include "infiniop/tensor_descriptor.h"
#include "infiniop/ops/paged_attention.h"
#include "infiniop/ops/paged_caching.h"

#endif // __INFINIOP_API_H__
83 changes: 83 additions & 0 deletions include/infiniop/ops/paged_attention_prefill.h
@@ -0,0 +1,83 @@
#ifndef __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
#define __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__

#include "../operator_descriptor.h"

// Define an opaque handle for the Paged Attention Prefill descriptor.
typedef struct InfiniopDescriptor *infiniopPagedAttentionPrefillDescriptor_t;

/**
* @brief Creates a descriptor for the Paged Attention Prefill operation.
* @param handle The handle to the InfiniOP library context.
* @param desc_ptr A pointer to store the created descriptor.
* @param out_desc Descriptor for the output tensor.
* @param q_desc Descriptor for the query tensor (packed/flattened).
* @param k_cache_desc Descriptor for the global physical key cache.
* @param v_cache_desc Descriptor for the global physical value cache.
* @param block_tables_desc Descriptor for the block tables mapping logical blocks to physical cache blocks.
* @param cache_lens_desc Descriptor for the total sequence lengths (history + current).
* @param seq_lens_desc Descriptor for the current prefill sequence lengths.
* @param offset_desc Descriptor for the start position of each sequence in the packed Q tensor.
* @param alibi_slopes_desc Optional descriptor for the ALiBi slopes tensor. Can be NULL.
* @param scale The attention scaling factor.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
infiniopHandle_t handle,
infiniopPagedAttentionPrefillDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
infiniopTensorDescriptor_t block_tables_desc,
infiniopTensorDescriptor_t cache_lens_desc,
infiniopTensorDescriptor_t seq_lens_desc,
infiniopTensorDescriptor_t offset_desc,
infiniopTensorDescriptor_t alibi_slopes_desc,
float scale);

/**
* @brief Retrieves the workspace size required for the Paged Attention Prefill operation.
*/
__C __export infiniStatus_t infiniopGetPagedAttentionPrefillWorkspaceSize(
infiniopPagedAttentionPrefillDescriptor_t desc, size_t *size);

/**
* @brief Executes the Paged Attention Prefill operation.
* @param desc The Paged Attention Prefill descriptor.
* @param workspace Pointer to the workspace memory.
* @param workspace_size The size of the workspace.
* @param out Pointer to the output tensor data.
* @param q Pointer to the query tensor data (packed).
* @param k_cache Pointer to the global key cache data.
* @param v_cache Pointer to the global value cache data.
* @param block_tables Pointer to the block tables data.
* @param cache_lens Pointer to the total sequence lengths data.
* @param seq_lens Pointer to the current prefill sequence lengths data.
* @param offset Pointer to the sequence start offsets data.
* @param alibi_slopes Pointer to the ALiBi slopes data. Can be NULL.
* @param stream The CUDA/device stream for the operation.
* @return infiniStatus_t Status code of the operation.
*/
__C __export infiniStatus_t infiniopPagedAttentionPrefill(
infiniopPagedAttentionPrefillDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *out,
const void *q,
const void *k_cache,
const void *v_cache,
const void *block_tables,
const void *cache_lens,
const void *seq_lens,
const void *offset,
const void *alibi_slopes,
void *stream);

/**
* @brief Destroys a Paged Attention Prefill descriptor.
*/
__C __export infiniStatus_t infiniopDestroyPagedAttentionPrefillDescriptor(
infiniopPagedAttentionPrefillDescriptor_t desc);

#endif // __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
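For orientation, a minimal host-side call sequence for this API might look as follows. This is an illustrative sketch, not part of the patch: the handle, tensor descriptors, and device buffers are assumed to have been created elsewhere, INFINI_STATUS_SUCCESS is assumed to be the library's success code, and workspace allocation is left as a placeholder for whatever runtime the handle targets.

// Hedged sketch: typical lifecycle of the Paged Attention Prefill operator.
// All descriptors and device pointers are assumed to exist already; error
// handling is reduced to early returns.
infiniStatus_t runPagedAttentionPrefillExample(
    infiniopHandle_t handle,
    infiniopTensorDescriptor_t out_desc, infiniopTensorDescriptor_t q_desc,
    infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc,
    infiniopTensorDescriptor_t block_tables_desc, infiniopTensorDescriptor_t cache_lens_desc,
    infiniopTensorDescriptor_t seq_lens_desc, infiniopTensorDescriptor_t offset_desc,
    void *d_out, const void *d_q, const void *d_k_cache, const void *d_v_cache,
    const void *d_block_tables, const void *d_cache_lens, const void *d_seq_lens,
    const void *d_offset, float scale, void *stream) {

    infiniopPagedAttentionPrefillDescriptor_t desc;
    infiniStatus_t st = infiniopCreatePagedAttentionPrefillDescriptor(
        handle, &desc, out_desc, q_desc, k_cache_desc, v_cache_desc,
        block_tables_desc, cache_lens_desc, seq_lens_desc, offset_desc,
        /*alibi_slopes_desc=*/nullptr, scale);
    if (st != INFINI_STATUS_SUCCESS) { return st; }

    size_t workspace_size = 0;
    st = infiniopGetPagedAttentionPrefillWorkspaceSize(desc, &workspace_size);
    if (st != INFINI_STATUS_SUCCESS) { return st; }

    void *workspace = nullptr;
    // Allocate `workspace_size` bytes on the target device here (placeholder).

    st = infiniopPagedAttentionPrefill(
        desc, workspace, workspace_size,
        d_out, d_q, d_k_cache, d_v_cache,
        d_block_tables, d_cache_lens, d_seq_lens, d_offset,
        /*alibi_slopes=*/nullptr, stream);

    infiniopDestroyPagedAttentionPrefillDescriptor(desc);
    return st;
}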
134 changes: 134 additions & 0 deletions src/infiniop/ops/paged_attention_prefill/cuda/kernel.cuh
@@ -0,0 +1,134 @@
#ifndef __PAGED_ATTENTION_PREFILL_KERNEL_CUH__
#define __PAGED_ATTENTION_PREFILL_KERNEL_CUH__

#include <cfloat>  // FLT_MAX
#include <cstddef> // size_t, ptrdiff_t
#include <cstdint> // int64_t

namespace op::paged_attention_prefill::cuda {

// Helper: binary search over the offset (prefix-start) array to find which sequence
// a flattened global_token_idx belongs to. offset is expected to hold num_seqs + 1
// entries (each sequence's start position plus the total token count), since
// offset[mid + 1] is read below.
__device__ __forceinline__ int find_seq_id(int token_idx, const int64_t *offset, int num_seqs) {
int low = 0, high = num_seqs - 1;
while (low <= high) {
int mid = (low + high) >> 1;
if (token_idx >= offset[mid] && token_idx < offset[mid + 1]) {
return mid;
} else if (token_idx < offset[mid]) {
high = mid - 1;
} else {
low = mid + 1;
}
}
return 0;
}
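
// Worked example (illustrative only): for three sequences with prefill lengths
// {4, 2, 3}, offset would be {0, 4, 6, 9}. Token index 5 satisfies
// offset[1] <= 5 < offset[2], so find_seq_id(5, offset, 3) returns 1, and the
// token's in-sequence position is 5 - offset[1] = 1.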

template <typename Tdata, typename Tcompute>
__global__ void pagedAttentionPrefillKernel(
Tdata *out_, const Tdata *q_, const Tdata *k_cache_, const Tdata *v_cache_,
const int64_t *block_tables_, const int64_t *cache_lens_, const int64_t *seq_lens_,
const float *alibi_slopes_,
const size_t num_heads, const size_t num_kv_heads, const float scale,
const size_t max_num_blocks_per_seq, const size_t block_size,
const ptrdiff_t kv_block_stride, const ptrdiff_t kv_head_stride,
const size_t head_size,
const int64_t *offset_,
const size_t num_seqs) {

// --- 2D grid coordinates ---
const int global_token_idx = blockIdx.x; // global token index in the flattened (packed) Q
const int head_idx = blockIdx.y; // head index
const int dim_idx = threadIdx.x; // dimension index within the head

if (dim_idx >= head_size) {
return;
}

// --- Binary-search the offset array to find the owning seq_idx ---
int seq_idx = find_seq_id(global_token_idx, offset_, num_seqs);

// --- Length of the current prefill chunk for this sequence
const int64_t cur_new_len = seq_lens_[seq_idx];

// --- Relative position of this token within its sequence
int q_token_idx = global_token_idx - offset_[seq_idx];

const Tdata *q_ptr_base = q_ + global_token_idx * num_heads * head_size + head_idx * head_size;
Tdata *out_ptr = out_ + global_token_idx * num_heads * head_size + head_idx * head_size;

// --- KV cache bookkeeping for this sequence
const int64_t total_seq_len = cache_lens_[seq_idx];
const int64_t history_len = total_seq_len - cur_new_len;
const int64_t causal_limit = history_len + q_token_idx;

const size_t num_queries_per_kv = num_heads / num_kv_heads;
const size_t kv_head_idx = head_idx / num_queries_per_kv;
const int64_t *block_table = block_tables_ + seq_idx * max_num_blocks_per_seq;

const float alibi_slope = (alibi_slopes_ == nullptr) ? 0.0f : alibi_slopes_[head_idx];

// Pass 1: compute attention scores and find the maximum
Tcompute max_score = -FLT_MAX;
for (int t = 0; t <= causal_limit; ++t) {
const int64_t b_idx = t / block_size;
const int64_t t_off = t % block_size;
const int64_t physical_block_id = block_table[b_idx];
const Tdata *k_vec = k_cache_ + physical_block_id * kv_block_stride + kv_head_idx * kv_head_stride + t_off * head_size;

Tcompute score = 0.0f;
for (int d = 0; d < head_size; ++d) {
score += static_cast<Tcompute>(q_ptr_base[d]) * static_cast<Tcompute>(k_vec[d]);
}
score *= static_cast<Tcompute>(scale);
if (alibi_slope != 0.0f) {
score += alibi_slope * static_cast<float>(t - causal_limit);
}
if (score > max_score) {
max_score = score;
}
}

// Pass 2: compute the sum of exponentials
Tcompute sum_exp = 0.0f;
for (int t = 0; t <= causal_limit; ++t) {
const int64_t b_idx = t / block_size;
const int64_t t_off = t % block_size;
const int64_t physical_block_id = block_table[b_idx];
const Tdata *k_vec = k_cache_ + physical_block_id * kv_block_stride + kv_head_idx * kv_head_stride + t_off * head_size;

Tcompute score = 0.0f;
for (int d = 0; d < head_size; ++d) {
score += static_cast<Tcompute>(q_ptr_base[d]) * static_cast<Tcompute>(k_vec[d]);
}
score *= static_cast<Tcompute>(scale);
if (alibi_slope != 0.0f) {
score += alibi_slope * static_cast<float>(t - causal_limit);
}
sum_exp += expf(static_cast<float>(score - max_score));
}

// Pass 3: weighted sum over V to produce the output
Tcompute acc = 0.0f;
Tcompute inv_sum = 1.0f / (sum_exp + 1e-6f);
for (int t = 0; t <= causal_limit; ++t) {
const int64_t b_idx = t / block_size;
const int64_t t_off = t % block_size;
const int64_t physical_block_id = block_table[b_idx];

const Tdata *k_vec = k_cache_ + physical_block_id * kv_block_stride + kv_head_idx * kv_head_stride + t_off * head_size;
Tcompute score = 0.0f;
for (int d = 0; d < head_size; ++d) {
score += static_cast<Tcompute>(q_ptr_base[d]) * static_cast<Tcompute>(k_vec[d]);
}
score *= static_cast<Tcompute>(scale);
if (alibi_slope != 0.0f) {
score += alibi_slope * static_cast<float>(t - causal_limit);
}
Tcompute prob = expf(static_cast<float>(score - max_score)) * inv_sum;

const Tdata *v_vec = v_cache_ + physical_block_id * kv_block_stride + kv_head_idx * kv_head_stride + t_off * head_size;
acc += prob * static_cast<Tcompute>(v_vec[dim_idx]);
}

out_ptr[dim_idx] = static_cast<Tdata>(acc);
}

} // namespace op::paged_attention_prefill::cuda

#endif // __PAGED_ATTENTION_PREFILL_KERNEL_CUH__
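The kernel above implies a specific launch shape: one thread block per (packed token, head) pair and one thread per element of the head dimension. A minimal, hypothetical launch sketch follows; the actual launcher in the operator's .cu file may differ. It assumes `info` is the PagedAttentionPrefillInfo built in info.h, the pointers are typed device pointers, `cuda_stream` is a cudaStream_t, and <cuda_fp16.h> is included for `half`.

// Assumed launch shape: grid = (total packed Q tokens, num_heads), block = head_size threads.
dim3 grid(static_cast<unsigned>(info.total_q_tokens), static_cast<unsigned>(info.num_heads));
dim3 block(static_cast<unsigned>(info.head_size)); // head_size == 128 per the Info check

op::paged_attention_prefill::cuda::pagedAttentionPrefillKernel<half, float>
    <<<grid, block, 0, cuda_stream>>>(
        out, q, k_cache, v_cache,
        block_tables, cache_lens, seq_lens, alibi_slopes,
        info.num_heads, info.num_kv_heads, info.scale,
        info.max_num_blocks_per_seq, info.block_size,
        info.kv_block_stride, info.kv_head_stride,
        info.head_size,
        offset, info.num_seqs);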
107 changes: 107 additions & 0 deletions src/infiniop/ops/paged_attention_prefill/info.h
@@ -0,0 +1,107 @@
#ifndef __PAGED_ATTENTION_PREFILL_INFO_H__
#define __PAGED_ATTENTION_PREFILL_INFO_H__

#include "../../../utils.h"
#include "../../tensor.h"
#include <iostream>
#include <optional>
#include <vector>

namespace op::paged_attention_prefill {

class PagedAttentionPrefillInfo {
PagedAttentionPrefillInfo() = default;

public:
infiniDtype_t dtype;
float scale;

size_t num_seqs;
size_t num_heads;
size_t num_kv_heads;
size_t head_size;
size_t block_size;
size_t max_num_blocks_per_seq;
size_t total_q_tokens;

ptrdiff_t q_stride;
ptrdiff_t kv_block_stride;
ptrdiff_t kv_head_stride;
ptrdiff_t o_stride;

static utils::Result<PagedAttentionPrefillInfo> create(
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t q_desc,
infiniopTensorDescriptor_t k_cache_desc,
infiniopTensorDescriptor_t v_cache_desc,
infiniopTensorDescriptor_t block_tables_desc,
infiniopTensorDescriptor_t cache_lens_desc,
infiniopTensorDescriptor_t seq_lens_desc,
infiniopTensorDescriptor_t offset_desc,
const std::optional<infiniopTensorDescriptor_t> &alibi_slopes_desc,
float scale) {

auto dtype = q_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);

if (out_desc->dtype() != dtype || k_cache_desc->dtype() != dtype || v_cache_desc->dtype() != dtype) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
if (offset_desc->dtype() != INFINI_DTYPE_I64 || seq_lens_desc->dtype() != INFINI_DTYPE_I64) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

if (alibi_slopes_desc.has_value() && alibi_slopes_desc.value() != nullptr) {
std::cerr << "[Error] PagedAttentionPrefill: ALiBi slopes are not supported yet." << std::endl;
return INFINI_STATUS_BAD_PARAM;
}

// Q shape: [total_tokens, heads, dim] (3D)
auto q_shape = q_desc->shape();
if (q_shape.size() < 3) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
size_t total_q_tokens = q_shape[0];

size_t num_heads = q_shape[q_shape.size() - 2];
size_t head_size = q_shape[q_shape.size() - 1];

if (head_size != 128) {
std::cerr << "[Error] PagedAttentionPrefill head_size = 128 supported, got " << head_size << std::endl;
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}

// Derive num_seqs from seq_lens
size_t num_seqs = seq_lens_desc->shape()[0];

auto k_cache_shape = k_cache_desc->shape();
size_t num_kv_heads = k_cache_shape[1];
size_t block_size = v_cache_desc->shape()[2];
size_t max_num_blocks_per_seq = block_tables_desc->shape()[1];

// Extract strides; the packed Q of multiple requests is expected to be contiguous
ptrdiff_t q_stride = q_desc->stride(0);
ptrdiff_t kv_block_stride = k_cache_desc->stride(0);
ptrdiff_t kv_head_stride = k_cache_desc->stride(1);
ptrdiff_t o_stride = out_desc->stride(0);

return utils::Result<PagedAttentionPrefillInfo>(PagedAttentionPrefillInfo{
dtype,
scale,
num_seqs,
num_heads,
num_kv_heads,
head_size,
block_size,
max_num_blocks_per_seq,
total_q_tokens,
q_stride,
kv_block_stride,
kv_head_stride,
o_stride});
}
};

} // namespace op::paged_attention_prefill

#endif // __PAGED_ATTENTION_PREFILL_INFO_H__
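For reference, the paged-KV addressing that this Info class and the kernel rely on can be summarized by the following illustrative helper (not part of the patch). It assumes the shapes inferred in create(): K/V cache [num_blocks, num_kv_heads, block_size, head_size], block_tables [num_seqs, max_num_blocks_per_seq], and Q/out [total_q_tokens, num_heads, head_size], and is written as if it lived in the same namespace.

// Hedged sketch: element offset of (token_pos, kv_head_idx, dim_idx) inside the paged K or V cache,
// mirroring the pointer arithmetic in the kernel.
inline ptrdiff_t kvElementOffset(
    const PagedAttentionPrefillInfo &info,
    const int64_t *block_table, // one row of block_tables (for a single sequence)
    int64_t token_pos,          // logical position in the sequence, 0 .. cache_len - 1
    size_t kv_head_idx,
    size_t dim_idx) {
    const int64_t logical_block = token_pos / static_cast<int64_t>(info.block_size);
    const int64_t in_block_off = token_pos % static_cast<int64_t>(info.block_size);
    const int64_t physical_block = block_table[logical_block];
    return static_cast<ptrdiff_t>(physical_block) * info.kv_block_stride
         + static_cast<ptrdiff_t>(kv_head_idx) * info.kv_head_stride
         + static_cast<ptrdiff_t>(in_block_off) * static_cast<ptrdiff_t>(info.head_size)
         + static_cast<ptrdiff_t>(dim_idx);
}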