InfiniTensor · spike-zhu · Dec 23, 2025 · Dec 24, 2025 · Dec 24, 2025 · Dec 26, 2025
diff --git a/include/infiniop.h b/include/infiniop.h
@@ -31,5 +31,9 @@
 #include "infiniop/ops/topksoftmax.h"
 #include "infiniop/ops/zeros.h"
 #include "infiniop/tensor_descriptor.h"
+#include "infiniop/ops/paged_attention.h"
+#include "infiniop/ops/paged_attention_prefill.h"
+#include "infiniop/ops/paged_caching.h"
+#include "infiniop/ops/paged_caching_prefill.h"
 
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/paged_attention.h b/include/infiniop/ops/paged_attention.h
@@ -0,0 +1,88 @@
+#ifndef __INFINIOP_PAGED_ATTENTION_API_H__
+#define __INFINIOP_PAGED_ATTENTION_API_H__
+
+#include "../operator_descriptor.h"
+
+// Opaque handle for the Paged Attention descriptor (a pointer to struct InfiniopDescriptor).
+typedef struct InfiniopDescriptor *infiniopPagedAttentionDescriptor_t;
+
+/**
+ * @brief Creates a descriptor for the Paged Attention v1 operation.
+ *
+ * This function initializes a descriptor that holds all the metadata needed
+ * for the paged attention computation.
+ *
+ * @param handle The handle to the InfiniOP library context.
+ * @param desc_ptr A pointer to store the created descriptor.
+ * @param out_desc Descriptor for the output tensor.
+ * @param q_desc Descriptor for the query tensor.
+ * @param k_cache_desc Descriptor for the key cache tensor.
+ * @param v_cache_desc Descriptor for the value cache tensor.
+ * @param block_tables_desc Descriptor for the block tables tensor.
+ * @param seq_lens_desc Descriptor for the sequence lengths tensor.
+ * @param alibi_slopes_desc Optional descriptor for the ALiBi slopes tensor. Can be NULL.
+ * @param scale The attention scaling factor.
+ * @note There is no max_num_blocks_per_seq parameter; presumably that bound comes from block_tables_desc — TODO confirm.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopCreatePagedAttentionDescriptor(
+    infiniopHandle_t handle,
+    infiniopPagedAttentionDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t q_desc,
+    infiniopTensorDescriptor_t k_cache_desc,
+    infiniopTensorDescriptor_t v_cache_desc,
+    infiniopTensorDescriptor_t block_tables_desc,
+    infiniopTensorDescriptor_t seq_lens_desc,
+    infiniopTensorDescriptor_t alibi_slopes_desc,
+    float scale);
+
+/**
+ * @brief Retrieves the workspace size required for the Paged Attention operation.
+ *
+ * @param desc The Paged Attention descriptor to query.
+ * @param size A pointer to store the required workspace size in bytes.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopGetPagedAttentionWorkspaceSize(
+    infiniopPagedAttentionDescriptor_t desc, size_t *size);
+
+/**
+ * @brief Executes the Paged Attention v1 operation.
+ *
+ * @param desc The Paged Attention descriptor.
+ * @param workspace Pointer to the workspace memory.
+ * @param workspace_size The size of the workspace; the required size is reported by infiniopGetPagedAttentionWorkspaceSize.
+ * @param out Pointer to the output tensor data.
+ * @param q Pointer to the query tensor data.
+ * @param k_cache Pointer to the key cache data.
+ * @param v_cache Pointer to the value cache data.
+ * @param block_tables Pointer to the block tables data.
+ * @param seq_lens Pointer to the sequence lengths data.
+ * @param alibi_slopes Pointer to the ALiBi slopes data. Can be NULL.
+ * @param stream The CUDA stream for the operation. Can be NULL.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopPagedAttention(
+    infiniopPagedAttentionDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *q,
+    const void *k_cache,
+    const void *v_cache,
+    const void *block_tables,
+    const void *seq_lens,
+    const void *alibi_slopes,
+    void *stream);
+
+/**
+ * @brief Destroys a Paged Attention descriptor.
+ *
+ * @param desc The Paged Attention descriptor to be destroyed.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopDestroyPagedAttentionDescriptor(
+    infiniopPagedAttentionDescriptor_t desc);
+
+#endif // __INFINIOP_PAGED_ATTENTION_API_H__
diff --git a/include/infiniop/ops/paged_attention_prefill.h b/include/infiniop/ops/paged_attention_prefill.h
@@ -0,0 +1,79 @@
+#ifndef __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
+#define __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
+
+#include "../operator_descriptor.h"
+
+// Opaque handle for the Paged Attention Prefill descriptor (a pointer to struct InfiniopDescriptor).
+typedef struct InfiniopDescriptor *infiniopPagedAttentionPrefillDescriptor_t;
+
+/**
+ * @brief Creates a descriptor for the Paged Attention Prefill operation.
+ * @param handle The handle to the InfiniOP library context.
+ * @param desc_ptr A pointer to store the created descriptor.
+ * @param out_desc Descriptor for the output tensor.
+ * @param q_desc Descriptor for the query tensor.
+ * @param k_cache_desc Descriptor for the global physical key cache.
+ * @param v_cache_desc Descriptor for the global physical value cache.
+ * @param block_tables_desc Descriptor for the block tables mapping logical blocks to physical blocks.
+ * @param seq_lens_desc Descriptor for the total sequence lengths (history + current).
+ * @param new_lens_desc Descriptor for the current prefill sequence lengths.
+ * @param alibi_slopes_desc Optional descriptor for the ALiBi slopes tensor. Can be NULL.
+ * @param scale The attention scaling factor.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopCreatePagedAttentionPrefillDescriptor(
+    infiniopHandle_t handle,
+    infiniopPagedAttentionPrefillDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t q_desc,
+    infiniopTensorDescriptor_t k_cache_desc,
+    infiniopTensorDescriptor_t v_cache_desc,
+    infiniopTensorDescriptor_t block_tables_desc,
+    infiniopTensorDescriptor_t seq_lens_desc,
+    infiniopTensorDescriptor_t new_lens_desc, // Added: corresponds to the 9th parameter in the implementation
+    infiniopTensorDescriptor_t alibi_slopes_desc,
+    float scale);
+
+/**
+ * @brief Retrieves, through the size out-pointer, the workspace size required for the Paged Attention Prefill operation.
+ */
+__C __export infiniStatus_t infiniopGetPagedAttentionPrefillWorkspaceSize(
+    infiniopPagedAttentionPrefillDescriptor_t desc, size_t *size);
+
+/**
+ * @brief Executes the Paged Attention Prefill operation.
+ * @param desc The Paged Attention Prefill descriptor.
+ * @param workspace Pointer to the workspace memory.
+ * @param workspace_size The size of the workspace.
+ * @param out Pointer to the output tensor data.
+ * @param q Pointer to the query tensor data.
+ * @param k_cache Pointer to the global key cache data.
+ * @param v_cache Pointer to the global value cache data.
+ * @param block_tables Pointer to the block tables data.
+ * @param seq_lens Pointer to the total sequence lengths data.
+ * @param new_lens Pointer to the current prefill sequence lengths data.
+ * @param alibi_slopes Pointer to the ALiBi slopes data. Can be NULL.
+ * @param stream The CUDA/device stream for the operation.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopPagedAttentionPrefill(
+    infiniopPagedAttentionPrefillDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *q,
+    const void *k_cache,
+    const void *v_cache,
+    const void *block_tables,
+    const void *seq_lens,
+    const void *new_lens, // Added: corresponds to the 10th parameter in the implementation
+    const void *alibi_slopes,
+    void *stream);
+
+/**
+ * @brief Destroys a Paged Attention Prefill descriptor and returns a status code.
+ */
+__C __export infiniStatus_t infiniopDestroyPagedAttentionPrefillDescriptor(
+    infiniopPagedAttentionPrefillDescriptor_t desc);
+
+#endif // __INFINIOP_PAGED_ATTENTION_PREFILL_API_H__
diff --git a/include/infiniop/ops/paged_caching.h b/include/infiniop/ops/paged_caching.h
@@ -0,0 +1,77 @@
+#ifndef __INFINIOP_PAGED_CACHING_API_H__
+#define __INFINIOP_PAGED_CACHING_API_H__
+
+#include "../operator_descriptor.h"
+
+// Opaque handle for the Paged Caching descriptor (a pointer to struct InfiniopDescriptor).
+typedef struct InfiniopDescriptor *infiniopPagedCachingDescriptor_t;
+
+/**
+ * @brief Creates a descriptor for the Paged Caching operation.
+ *
+ * This function initializes a descriptor that holds all the metadata needed
+ * to copy key/value vectors into their respective cache pools.
+ *
+ * @param handle The handle to the InfiniOP library context.
+ * @param desc_ptr A pointer to store the created descriptor.
+ * @param k_desc Descriptor for the source key tensor.
+ * @param v_desc Descriptor for the source value tensor.
+ * @param k_cache_desc Descriptor for the key cache pool tensor (copy destination).
+ * @param v_cache_desc Descriptor for the value cache pool tensor (copy destination).
+ * @param slot_mapping_desc Descriptor for the slot mapping tensor.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopCreatePagedCachingDescriptor(
+    infiniopHandle_t handle,
+    infiniopPagedCachingDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t k_desc,
+    infiniopTensorDescriptor_t v_desc,
+    infiniopTensorDescriptor_t k_cache_desc,
+    infiniopTensorDescriptor_t v_cache_desc,
+    infiniopTensorDescriptor_t slot_mapping_desc);
+
+/**
+ * @brief Retrieves the workspace size required for the Paged Caching operation.
+ *
+ * @param desc The Paged Caching descriptor.
+ * @param size A pointer to store the required workspace size in bytes (typically 0 — NOTE(review): not verifiable from this header; confirm per backend).
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopGetPagedCachingWorkspaceSize(
+    infiniopPagedCachingDescriptor_t desc, size_t *size);
+
+/**
+ * @brief Executes the Paged Caching operation.
+ *
+ * @param desc The Paged Caching descriptor.
+ * @param workspace Pointer to the workspace memory.
+ * @param workspace_size The size of the workspace; the required size is reported by infiniopGetPagedCachingWorkspaceSize.
+ * @param k Pointer to the source key tensor data.
+ * @param v Pointer to the source value tensor data.
+ * @param k_cache Pointer to the key cache pool data (destination; non-const).
+ * @param v_cache Pointer to the value cache pool data (destination; non-const).
+ * @param slot_mapping Pointer to the slot mapping data.
+ * @param stream The CUDA stream for the operation. Can be NULL.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopPagedCaching(
+    infiniopPagedCachingDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    const void *k,
+    const void *v,
+    void *k_cache,
+    void *v_cache,
+    const void *slot_mapping,
+    void *stream);
+
+/**
+ * @brief Destroys a Paged Caching descriptor.
+ *
+ * @param desc The Paged Caching descriptor to be destroyed.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopDestroyPagedCachingDescriptor(
+    infiniopPagedCachingDescriptor_t desc);
+
+#endif // __INFINIOP_PAGED_CACHING_API_H__
diff --git a/include/infiniop/ops/paged_caching_prefill.h b/include/infiniop/ops/paged_caching_prefill.h
@@ -0,0 +1,80 @@
+#ifndef __INFINIOP_PAGED_CACHING_PREFILL_API_H__
+#define __INFINIOP_PAGED_CACHING_PREFILL_API_H__
+
+#include "../operator_descriptor.h"
+
+// Opaque handle for the Paged Caching Prefill descriptor (a pointer to struct InfiniopDescriptor).
+typedef struct InfiniopDescriptor *infiniopPagedCachingPrefillDescriptor_t;
+
+/**
+ * @brief Creates a descriptor for the Paged Caching Prefill operation.
+ *
+ * This function initializes a descriptor that holds the metadata needed to copy
+ * key/value vectors from a prefill batch into their physical slots in the cache pool.
+ *
+ * @param handle The handle to the InfiniOP library context.
+ * @param desc_ptr A pointer to store the created descriptor.
+ * @param k_desc Descriptor for the source key tensor (new tokens).
+ * @param v_desc Descriptor for the source value tensor (new tokens).
+ * @param k_cache_desc Descriptor for the key cache pool tensor (global pool).
+ * @param v_cache_desc Descriptor for the value cache pool tensor (global pool).
+ * @param slot_mapping_desc Descriptor for the slot mapping tensor (physical indices).
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopCreatePagedCachingPrefillDescriptor(
+    infiniopHandle_t handle,
+    infiniopPagedCachingPrefillDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t k_desc,
+    infiniopTensorDescriptor_t v_desc,
+    infiniopTensorDescriptor_t k_cache_desc,
+    infiniopTensorDescriptor_t v_cache_desc,
+    infiniopTensorDescriptor_t slot_mapping_desc);
+
+/**
+ * @brief Retrieves the workspace size required for the Paged Caching Prefill operation.
+ *
+ * @param desc The Paged Caching Prefill descriptor to query.
+ * @param size A pointer to store the required workspace size in bytes.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopGetPagedCachingPrefillWorkspaceSize(
+    infiniopPagedCachingPrefillDescriptor_t desc, size_t *size);
+
+/**
+ * @brief Executes the Paged Caching Prefill operation.
+ *
+ * This operation writes the K/V data into the cache pool at locations
+ * specified by the slot_mapping.
+ *
+ * @param desc The Paged Caching Prefill descriptor.
+ * @param workspace Pointer to the workspace memory.
+ * @param workspace_size The size of the workspace; the required size is reported by infiniopGetPagedCachingPrefillWorkspaceSize.
+ * @param k Pointer to the source key tensor data.
+ * @param v Pointer to the source value tensor data.
+ * @param k_cache Pointer to the key cache pool data (destination; non-const).
+ * @param v_cache Pointer to the value cache pool data (destination; non-const).
+ * @param slot_mapping Pointer to the slot mapping data.
+ * @param stream The CUDA stream for the operation. Can be NULL.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopPagedCachingPrefill(
+    infiniopPagedCachingPrefillDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    const void *k,
+    const void *v,
+    void *k_cache,
+    void *v_cache,
+    const void *slot_mapping,
+    void *stream);
+
+/**
+ * @brief Destroys a Paged Caching Prefill descriptor.
+ *
+ * @param desc The Paged Caching Prefill descriptor to be destroyed.
+ * @return infiniStatus_t Status code of the operation.
+ */
+__C __export infiniStatus_t infiniopDestroyPagedCachingPrefillDescriptor(
+    infiniopPagedCachingPrefillDescriptor_t desc);
+
+#endif // __INFINIOP_PAGED_CACHING_PREFILL_API_H__