hw-native-sys · ChaoWao · May 31, 2026 · May 31, 2026
diff --git a/docs/dfx/l2-swimlane-profiling.md b/docs/dfx/l2-swimlane-profiling.md
@@ -264,11 +264,12 @@ L2SwimlaneAicpuTaskPool[num_cores]                    (per-core AICPU pool state
 └── mismatch_record_count     (legacy; no longer written)
 
 L2SwimlaneAicoreTaskPool[num_cores]              (per-core AICore pool state)
-├── rotation {current_buf_ptr, generation}      (AICPU writes, AICore reads
-│                                                — cache-line independent)
-├── free_queue {buffer_ptrs[SLOT_COUNT], head, tail}
-├── total_record_count / dropped_record_count
-└── current_buf_seq
+├── head {current_buf_ptr, current_buf_seq,     (single 64B cache line;
+│         total_record_count,                    AICPU writes, AICore dcci-
+│         dropped_record_count}                  polls per task; AICPU bumps
+│                                                current_buf_seq on rotation
+│                                                so AICore detects the change)
+└── free_queue {buffer_ptrs[SLOT_COUNT], head, tail}
 
 [L2SwimlaneAicoreTaskBuffer × PLATFORM_AICORE_BUFFERS_PER_CORE per core]
 └── L2SwimlaneAicoreTaskRecord records[PLATFORM_AICORE_BUFFER_SIZE]  (1024 records, 32B each)
@@ -300,14 +301,14 @@ log via `LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…"`.
 **Producer/consumer protocol on AICore (AICore-as-producer with rotation).**
 AICore writes a slim `L2SwimlaneAicoreTaskRecord` into its currently-active per-core
 `L2SwimlaneAicoreTaskBuffer` at `records[slot_within_buf++]`. The active buffer is
-published via a per-core `L2SwimlaneAicoreRotation` cache line (`current_buf_ptr` +
-`generation`); AICore `dcci`'s it per task — cheap relative to the
-baseline `dcci(payload, ENTIRE_DATA_CACHE)` it already pays per task.
-AICPU drives rotation: immediately before each `write_reg(DATA_MAIN_BASE)`
+published via a per-core `L2SwimlaneActiveHead` cache line (`current_buf_ptr` +
+`current_buf_seq` + counters); AICore `dcci`'s it per task — cheap relative
+to the baseline `dcci(payload, ENTIRE_DATA_CACHE)` it already pays per
+task. AICPU drives rotation: immediately before each `write_reg(DATA_MAIN_BASE)`
 for task `K`, if `K % PLATFORM_AICORE_BUFFER_SIZE == 0`, AICPU enqueues
-the current buffer to the per-thread ready queue (kind `is_phase=2`),
+the current buffer to the per-thread ready queue (kind `AicoreTask`),
 pops the next from `L2SwimlaneAicoreTaskPool::free_queue`, and bumps
-`L2SwimlaneAicoreRotation::generation`. AICore detects the bumped generation on
+`L2SwimlaneActiveHead::current_buf_seq`. AICore detects the bumped seq on
 its next task's `dcci`, refreshes its local cache, and resets its slot
 counter to 0.
 

diff --git a/src/a2a3/platform/include/aicore/aicore_profiling_state.h b/src/a2a3/platform/include/aicore/aicore_profiling_state.h
@@ -25,15 +25,17 @@
  *
  * Lifecycle:
  *   1. Host fills `KernelArgs::enable_profiling_flag` and
- *      `KernelArgs::l2_swimlane_aicore_rotation_table` (points to a per-core `L2SwimlaneAicoreRotation`
- *      device-address table). Host allocates the table bytes; AICPU populates
- *      the entries inside `l2_swimlane_aicpu_init`.
- *   2. AICore kernel entry stashes `&l2_swimlane_aicore_rotation_table[block_idx]` (the slot
- *      pointer — NOT the dereferenced rotation pointer yet) via
- *      `set_l2_swimlane_aicore_rotation_slot()`, and calls `set_aicore_profiling_flag()`,
+ *      `KernelArgs::l2_swimlane_aicore_rotation_table` (an array of per-core
+ *      slots, each holding a device address of an `L2SwimlaneActiveHead`).
+ *      Host allocates the table bytes; AICPU populates the slot entries
+ *      inside `l2_swimlane_aicpu_init` with `&pool.head` for each AicoreTask
+ *      pool.
+ *   2. AICore kernel entry stashes `&l2_swimlane_aicore_rotation_table[block_idx]`
+ *      (the slot pointer — NOT the dereferenced head pointer yet) via
+ *      `set_l2_swimlane_aicore_head_slot()`, and calls `set_aicore_profiling_flag()`,
  *      before invoking `aicore_execute`.
- *   3. `get_l2_swimlane_aicore_rotation()` lazily dereferences the slot the first time
- *      it is called. Callers must defer the call until AFTER AICPU has
+ *   3. `get_l2_swimlane_aicore_head()` lazily dereferences the slot the first
+ *      time it is called. Callers must defer the call until AFTER AICPU has
  *      dispatched the first task (so AICPU init has had a chance to populate
  *      the table). The executor handles this by calling it inside the main
  *      loop's first-task branch.
@@ -56,22 +58,24 @@ __aicore__ void set_aicore_profiling_flag(uint32_t flag);
 __aicore__ uint32_t get_aicore_profiling_flag();
 
 /**
- * Per-core AICore rotation channel.
+ * Per-core AICore head channel.
  *
- * `set_l2_swimlane_aicore_rotation_slot(slot)` stashes the address of THIS core's slot
- * in the rotation-address table — `&((uint64_t*)k_args->l2_swimlane_aicore_rotation_table)[block_idx]`.
- * No dereference happens here, because at kernel entry the AICPU side may
- * not yet have populated the table (the host launches both kernels and
- * AICPU's init runs concurrently with AICore's entry).
+ * `set_l2_swimlane_aicore_head_slot(slot)` stashes the address of THIS core's
+ * slot in the head-address table —
+ * `&((uint64_t*)k_args->l2_swimlane_aicore_rotation_table)[block_idx]`. No
+ * dereference happens here, because at kernel entry the AICPU side may not
+ * yet have populated the table (the host launches both kernels and AICPU's
+ * init runs concurrently with AICore's entry).
  *
- * `get_l2_swimlane_aicore_rotation()` lazily dereferences the stashed slot on first use,
- * caches the result, and returns it on subsequent calls. Callers MUST defer
- * the first call until after AICPU has dispatched the first task — by then
- * AICPU's init has completed and the slot holds a valid device address.
- * The executor's main loop honours this by reading the rotation only inside
- * the first-task branch of the dispatch poll.
+ * `get_l2_swimlane_aicore_head()` lazily dereferences the stashed slot on
+ * first use, caches the result, and returns it on subsequent calls. Callers
+ * MUST defer the first call until after AICPU has dispatched the first task —
+ * by then AICPU's init has completed and the slot holds a valid device
+ * address pointing at the AICore pool's `head` (an `L2SwimlaneActiveHead`).
+ * The executor's main loop honours this by reading the head only inside the
+ * first-task branch of the dispatch poll.
  */
-__aicore__ void set_l2_swimlane_aicore_rotation_slot(__gm__ uint64_t *slot_ptr);
-__aicore__ __gm__ L2SwimlaneAicoreRotation *get_l2_swimlane_aicore_rotation();
+__aicore__ void set_l2_swimlane_aicore_head_slot(__gm__ uint64_t *slot_ptr);
+__aicore__ __gm__ L2SwimlaneActiveHead *get_l2_swimlane_aicore_head();
 
 #endif  // PLATFORM_AICORE_AICORE_PROFILING_STATE_H_
diff --git a/src/a2a3/platform/include/aicore/l2_swimlane_collector_aicore.h b/src/a2a3/platform/include/aicore/l2_swimlane_collector_aicore.h
@@ -34,15 +34,16 @@
 /**
  * AICore-local rotation state. Tracks which buffer this core is currently
  * writing into and which slot is next. Reset by `l2_swimlane_aicore_record_task`
- * when it observes a generation bump on the shared `L2SwimlaneAicoreRotation` channel
- * (AICPU rotates by writing `current_buf_ptr` + bumping `generation`, so the
- * AICore-local state self-recovers without any AICore-side spin-wait).
+ * when it observes a `current_buf_seq` bump on the shared `L2SwimlaneActiveHead`
+ * cache line (AICPU rotates by writing `current_buf_ptr` + bumping
+ * `current_buf_seq`, so the AICore-local state self-recovers without any
+ * AICore-side spin-wait).
  */
 struct L2SwimlaneAicoreLocalState {
     __gm__ L2SwimlaneAicoreTaskBuffer *cached_buf = nullptr;
-    // Must start != AICPU's initial generation (1) so the first record_task
-    // call observes a generation mismatch and loads the buffer pointer.
-    uint32_t cached_generation = 0;
+    // Must differ from AICPU's initial head.current_buf_seq (0) so the first
+    // record_task call observes a mismatch and loads the buffer pointer.
+    uint32_t cached_buf_seq = UINT32_MAX;
     uint32_t slot_within_buf = 0;
 };
 
@@ -51,12 +52,12 @@ struct L2SwimlaneAicoreLocalState {
  *
  * AICore writes a slim L2SwimlaneAicoreTaskRecord into its currently-published
  * per-core L2SwimlaneAicoreTaskBuffer at `records[slot_within_buf++]`. The
- * publication channel is an L2SwimlaneAicoreRotation cache line addressed via
- * `KernelArgs::l2_swimlane_aicore_rotation_table[block_idx]` (now points to L2SwimlaneAicoreRotation,
- * not directly to a buffer). AICPU updates `rotation->current_buf_ptr` and
- * bumps `rotation->generation` at dispatch boundaries; AICore detects the
- * change by `dcci`-ing the rotation line per task and comparing generation
- * to its locally cached copy.
+ * publication channel is an L2SwimlaneActiveHead cache line addressed via
+ * `KernelArgs::l2_swimlane_aicore_rotation_table[block_idx]` (points to the
+ * AICore pool's `head`, not directly to a buffer). AICPU updates
+ * `head->current_buf_ptr` and bumps `head->current_buf_seq` at dispatch
+ * boundaries; AICore detects the change by `dcci`-ing the head line per task
+ * and comparing the sequence to its locally cached copy.
  *
  * AICPU and AICore never read each other's data on the hot path. The host
  * post-processor joins the AICore stream (multi-buffer per core, in order)
@@ -69,23 +70,29 @@ struct L2SwimlaneAicoreLocalState {
  * so AICore has already finished writing their records before AICPU enqueues
  * the old buffer to the ready queue.
  *
- * @param rotation Per-core L2SwimlaneAicoreRotation channel (cached at kernel entry
- *                 from KernelArgs::l2_swimlane_aicore_rotation_table[block_idx])
+ * @param head     Per-core L2SwimlaneActiveHead channel, lazily resolved on
+ *                 the executor's first-task branch via
+ *                 get_l2_swimlane_aicore_head(), which dereferences the slot
+ *                 that kernel entry stashed from
+ *                 KernelArgs::l2_swimlane_aicore_rotation_table[block_idx].
+ *                 (Kernel entry cannot dereference it directly: AICPU init
+ *                 runs concurrently with kernel entry, so the slot may not
+ *                 yet hold a valid address at that point.)
  * @param local    Per-core AICore-local state (caller-owned static)
  * @param task_id  Register dispatch id (DATA_MAIN_BASE), low 32 bits
  * @param start_time Start timestamp (get_sys_cnt)
  * @param end_time   End timestamp
  */
 __aicore__ __attribute__((always_inline)) static inline void l2_swimlane_aicore_record_task(
-    __gm__ L2SwimlaneAicoreRotation *rotation, L2SwimlaneAicoreLocalState *local, uint32_t task_id, uint64_t start_time,
+    __gm__ L2SwimlaneActiveHead *head, L2SwimlaneAicoreLocalState *local, uint32_t task_id, uint64_t start_time,
     uint64_t end_time
 ) {
-    // Re-fetch rotation channel each task; cheap relative to the
+    // Re-fetch head channel each task; cheap relative to the
     // baseline `dcci(payload, ENTIRE_DATA_CACHE)` we already pay per task.
-    dcci(rotation, SINGLE_CACHE_LINE);
-    if (rotation->generation != local->cached_generation) {
-        local->cached_generation = rotation->generation;
-        local->cached_buf = reinterpret_cast<__gm__ L2SwimlaneAicoreTaskBuffer *>(rotation->current_buf_ptr);
+    dcci(head, SINGLE_CACHE_LINE);
+    if (head->current_buf_seq != local->cached_buf_seq) {
+        local->cached_buf_seq = head->current_buf_seq;
+        local->cached_buf = reinterpret_cast<__gm__ L2SwimlaneAicoreTaskBuffer *>(head->current_buf_ptr);
         local->slot_within_buf = 0;
     }
     if (local->cached_buf == nullptr) {

diff --git a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
@@ -46,13 +46,14 @@ extern "C" uint64_t get_platform_l2_swimlane_base();
 extern "C" void set_l2_swimlane_enabled(bool enable);
 extern "C" bool is_l2_swimlane_enabled();
 
-// AICore rotation-table device pointer (= KernelArgs::l2_swimlane_aicore_rotation_table).
+// AICore head-table device pointer (= KernelArgs::l2_swimlane_aicore_rotation_table).
 // Published by the host before AICPU init runs; AICPU init fills the table
-// with the per-core `&L2SwimlaneAicoreTaskPool::rotation` device addresses so
-// AICore can index `l2_swimlane_aicore_rotation_table[block_idx]` to find its rotation channel.
-// Moved from host into AICPU so the host stays decoupled from the AICore-side
-// shared-memory layout (host previously did host-to-device address translation
-// + reached into get_aicore_buffer_state to fill this).
+// with the per-core `&L2SwimlaneAicoreTaskPool::head` device addresses so
+// AICore can index `l2_swimlane_aicore_rotation_table[block_idx]` to find its
+// active-head cache line. Moved from host into AICPU so the host stays
+// decoupled from the AICore-side shared-memory layout (host previously did
+// host-to-device address translation + reached into get_aicore_buffer_state
+// to fill this).
 extern "C" void set_platform_l2_swimlane_aicore_rotation_table(uint64_t table_addr);
 extern "C" uint64_t get_platform_l2_swimlane_aicore_rotation_table();
 
@@ -69,7 +70,7 @@ L2SwimlaneLevel get_l2_swimlane_level();
  *
  * Also primes the per-core AICore rotation channel: pops the initial
  * L2SwimlaneAicoreTaskBuffer from L2SwimlaneAicoreTaskPool::free_queue and writes its
- * address into the L2SwimlaneAicoreRotation channel that AICore polls per task.
+ * address into the L2SwimlaneActiveHead channel that AICore polls per task.
  *
  * @param worker_count  Number of AICore workers (cores) to initialize
  */