hw-native-sys · zhusy54 · Apr 10, 2026
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
@@ -112,16 +112,19 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
         {
             uint32_t task_id = reg_val;  // Decode: register holds task_id directly
 
+            // Pick the dual-buffer slot from task_id's low bit, matching the slot the AICPU wrote
+            __gm__ PTO2DispatchPayload *exec_payload = payload + (task_id & 1u);
+
             // Invalidate payload buffer (AICPU updates its content each dispatch)
-            dcci(payload, ENTIRE_DATA_CACHE);
+            dcci(exec_payload, ENTIRE_DATA_CACHE);
 
             write_reg(RegId::COND, MAKE_ACK_VALUE(task_id));
 
             // Performance profiling: record start time
             uint64_t start_time = get_sys_cnt_aicore();
 
             // Execute the task
-            execute_task(payload);
+            execute_task(exec_payload);
 
             // Performance profiling: record task execution
             if (profiling_enabled) {

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp
@@ -112,16 +112,19 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
         {
             uint32_t task_id = reg_val;  // Decode: register holds task_id directly
 
+            // Pick the dual-buffer slot from task_id's low bit, matching the slot the AICPU wrote
+            __gm__ PTO2DispatchPayload *exec_payload = payload + (task_id & 1u);
+
             // Invalidate payload buffer (AICPU updates its content each dispatch)
-            dcci(payload, ENTIRE_DATA_CACHE);
+            dcci(exec_payload, ENTIRE_DATA_CACHE);
 
             write_reg(RegId::COND, MAKE_ACK_VALUE(task_id));
 
             // Performance profiling: record start time
             uint64_t start_time = get_sys_cnt_aicore();
 
             // Execute the task
-            execute_task(payload);
+            execute_task(exec_payload);
 
             // Performance profiling: record task execution
             if (profiling_enabled) {

diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_pv_matmul.cpp
@@ -102,6 +102,9 @@ static __aicore__ void pv_matmul_batch_impl(
             pipe_barrier(PIPE_ALL);
         }
     }
+
+    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {

diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aic/aic_qk_matmul.cpp
@@ -104,6 +104,9 @@ static __aicore__ void qk_matmul_batch_impl(
             pipe_barrier(PIPE_ALL);
         }
     }
+
+    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {

diff --git a/.../st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp b/.../st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_online_update.cpp
@@ -187,6 +187,9 @@ static __aicore__ void online_update_batch_impl(
             pipe_barrier(PIPE_ALL);
         }
     }
+
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {

diff --git a/...t/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/...t/a2a3/tensormap_and_ringbuffer/batch_paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
@@ -159,6 +159,9 @@ static __aicore__ void softmax_prepare_batch_impl(
             pipe_barrier(PIPE_ALL);
         }
     }
+
+    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
+    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
 }
 
 extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {