Skip to content

Commit ac19252

Browse files
committed
fix: n seqs and random length test pass
1 parent 96262fb commit ac19252

File tree

2 files changed

+35
-38
lines changed

2 files changed

+35
-38
lines changed

src/infiniop/ops/paged_attention_prefill/cuda/kernel.cuh

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -62,30 +62,32 @@ __global__ void pagedAttentionPrefillKernel(
6262

6363
const int32_t *block_table = block_tables_ + seq_idx * max_num_blocks_per_seq;
6464

65-
// Q ptr: [seq, new_len, head, dim]
65+
// 假设 q_stride 传入的是单个 Sequence 在内存中占据的 Tdata 数量 (即 max_new_len * num_heads * head_size)
6666
const Tdata *q_ptr_base = q_ + seq_idx * q_stride +
67-
(q_token_idx * num_heads + head_idx) * head_size_const;
67+
q_token_idx * (num_heads * head_size_const) +
68+
head_idx * head_size_const;
6869

69-
// Out ptr
70+
// --- 2. 修改 Out 的基地址计算 ---
7071
Tdata *out_ptr = out_ + seq_idx * o_stride +
71-
(q_token_idx * num_heads + head_idx) * head_size_const;
72+
q_token_idx * (num_heads * head_size_const) +
73+
head_idx * head_size_const;
7274

7375
const float alibi_slope = (alibi_slopes_ == nullptr) ? 0.0f : alibi_slopes_[head_idx];
7476

75-
// 只让第一个 Sequence, 第一个 Token, 第一个 Head 的第一个线程执行打印
76-
if (seq_idx == 0 && q_token_idx == 0 && head_idx == 0 && dim_idx == 0) {
77-
printf("DEBUG: Scale=%f, HeadSize=%zu, BlockSize=%zu\n", scale, head_size_const, block_size);
77+
// // 只让第一个 Sequence, 第一个 Token, 第一个 Head 的第一个线程执行打印
78+
// if (seq_idx == 0 && q_token_idx == 0 && head_idx == 0 && dim_idx == 0) {
79+
// printf("DEBUG: Scale=%f, HeadSize=%zu, BlockSize=%zu\n", scale, head_size_const, block_size);
7880

79-
// 检查 Q 的前 5 个元素
80-
for(int i=0; i<5; ++i) printf("Q[%d]=%f ", i, (float)q_ptr_base[i]);
81-
printf("\n");
82-
83-
// 检查第一个 KV Block 的前 5 个元素
84-
const int32_t first_physical_block = block_table[0];
85-
const Tdata *first_k = k_cache_ + first_physical_block * kv_block_stride;
86-
for(int i=0; i<5; ++i) printf("K_cache[0][%d]=%f ", i, (float)first_k[i]);
87-
printf("\n");
88-
}
81+
// // 检查 Q 的前 5 个元素
82+
// for(int i=0; i<5; ++i) printf("Q[%d]=%f ", i, (float)q_ptr_base[i]);
83+
// printf("\n");
84+
85+
// // 检查第一个 KV Block 的前 5 个元素
86+
// const int32_t first_physical_block = block_table[0];
87+
// const Tdata *first_k = k_cache_ + first_physical_block * kv_block_stride;
88+
// for(int i=0; i<5; ++i) printf("K_cache[0][%d]=%f ", i, (float)first_k[i]);
89+
// printf("\n");
90+
// }
8991

9092
// --- Pass 1: Find Global Max ---
9193
Tcompute max_score = -FLT_MAX;

test/infiniop/paged_attention_prefill.py

Lines changed: 16 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def test(
119119
print(f"--- Round {r+1} ---")
120120

121121
# 1. 模拟调度与物理写入
122-
new_lens_torch = torch.randint(max_step_len, max_step_len + 1, (num_seqs,), dtype=torch.int32)
122+
new_lens_torch = torch.randint(1, max_step_len + 1, (num_seqs,), dtype=torch.int32)
123123
total_lens_list = []
124124
all_block_tables = []
125125

@@ -138,9 +138,9 @@ def test(
138138
v_new = torch.randn(cur_new_len, num_kv_heads, head_size)
139139
q_val = torch.randn(cur_new_len, num_heads, head_size)
140140

141-
k_new = torch.ones_like(k_new)
142-
v_new = torch.ones_like(v_new)
143-
q_val = torch.ones_like(q_val)
141+
# k_new = torch.ones_like(k_new)
142+
# v_new = torch.ones_like(v_new)
143+
# q_val = torch.ones_like(q_val)
144144

145145
q_new_torch[i, :cur_new_len, :, :] = q_val
146146

@@ -155,9 +155,14 @@ def test(
155155
k_cache._data_tensor.copy_(k_cache._torch_tensor)
156156
v_cache._data_tensor.copy_(v_cache._torch_tensor)
157157

158-
# 2. 准备算子 Tensor
158+
# 2. 准备 Q Tensor
159159
q_new = TestTensor.from_torch(q_new_torch, dtype, device)
160+
161+
# 3. 准备 out Tensor,确保初始值为 0
160162
out = TestTensor((num_seqs, max_new_len, num_heads, head_size), None, dtype, device)
163+
out.torch_tensor().zero_()
164+
out._data_tensor.zero_()
165+
161166
seq_lens = TestTensor.from_torch(torch.tensor(total_lens_list, dtype=torch.int32), InfiniDtype.I32, device)
162167

163168
max_blocks = max(len(t) for t in all_block_tables)
@@ -224,19 +229,8 @@ def test(
224229
# 5. 验证
225230
# ======================================================================
226231

227-
print(f"[debug] ans: {ans[:, 0, 0, :5]}")
228-
print(f"[debug] out: {out.actual_tensor()[:, 0, 0, :5]}")
229-
230-
diff = ans - out.actual_tensor()
231-
print(f"[debug] diff-shape: {diff.shape}")
232-
print(f"[debug] diff: {diff}")
233-
234-
print(f"[debug] max ans: {torch.max(ans)}")
235-
print(f"[debug] min ans: {torch.min(ans)}")
236-
237-
print(f"[debug] max out.actual_tensor(): {torch.max(out.actual_tensor())}")
238-
print(f"[debug] min out.actual_tensor(): {torch.min(out.actual_tensor())}")
239-
232+
# print(f"[debug] ans: {ans[:, 0, 0, :5]}")
233+
# print(f"[debug] out: {out.actual_tensor()[:, 0, 0, :5]}")
240234

241235
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
242236
# compare out.actual_tensor() with reference result ans
@@ -251,9 +245,10 @@ def test(
251245
# ==============================================================================
252246
_TEST_CASES_ = [
253247
# (num_seqs, num_heads, num_kv_heads, head_size, block_size, max_step_len)
254-
# (2, 8, 8, 128, 16, 32),
255-
# (4, 16, 16, 64, 8, 64),
256-
(2, 1, 1, 128, 8, 16),
248+
(2, 8, 8, 128, 16, 32),
249+
(4, 16, 16, 128, 8, 64),
250+
(16, 1, 1, 128, 8, 16),
251+
(1, 1, 1, 128, 8, 16),
257252
]
258253

259254
_TENSOR_DTYPES = [InfiniDtype.F32]

0 commit comments

Comments (0)