InfiniTensor · spike-zhu · Dec 23, 2025
diff --git a/test/infiniop/paged_attention.py b/test/infiniop/paged_attention.py
@@ -93,10 +93,10 @@ def ref_single_query_cached_kv_attention(
 _TEST_CASES_ = [
     # (num_seqs, num_heads, num_kv_heads, head_size, block_size, max_seq_len, use_alibi)
     (1, 1, 1, 128, 16, 1024, False),
-    (1, 40, 40, 128, 16, 1024, False),
-    (1, 40, 40, 128, 16, 1024, False),
-    (1, 8, 8, 128, 16, 1024, False),
-    (1, 64, 8, 128, 16, 2048, False),
+    (4, 40, 40, 128, 16, 1024, False),
+    (6, 40, 40, 128, 16, 1024, False),
+    (3, 8, 8, 128, 16, 1024, False),
+    (8, 64, 8, 128, 16, 2048, False),
 ]
 
 # Data types for testing
@@ -139,8 +139,32 @@ def test(
     max_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
     num_blocks = num_seqs * max_blocks_per_seq  # A reasonable number for testing
 
+    # ==========================================================================
+    # STRIDE ADAPTATION FOR PACKED MHA LAYOUT
+    # ==========================================================================
+    # To match the kernel logic where o_stride is derived as (q_stride / 3),
+    # we simulate a Packed QKV memory layout typical for MHA (Multi-Head Attention).
+    # In this configuration, Q, K, and V have identical head counts and are
+    # stored contiguously in a single buffer (e.g., shape [num_seqs, 3, num_heads, head_size]).
+    #
+    # By setting the batch stride of Q to 3x the logical width, we satisfy the
+    # kernel's expectation that the input 'q' is a pointer into a 3-way interleaved
+    # tensor, ensuring the calculated output pointer remains correctly aligned.
+    # ==========================================================================
+
+    q_shape = (num_seqs, num_heads, head_size)
+
+    # Manually construct strides to simulate qkv[:, 0, :, :] behavior
+    q_strides = (
+        3
+        * num_heads
+        * head_size,  # Forced 3x stride to adapt to kernel's 'q_stride / 3' logic
+        head_size,  # Head stride
+        1,  # Element stride
+    )
+
     # Create input tensors
-    q = TestTensor((num_seqs, num_heads, head_size), None, dtype, device)
+    q = TestTensor(q_shape, q_strides, dtype, device)
     out = TestTensor((num_seqs, num_heads, head_size), None, dtype, device)
     k_cache = TestTensor(
         (num_blocks, num_kv_heads, block_size, head_size), None, dtype, device