hw-native-sys · ChaoWao · May 31, 2026 · May 31, 2026
diff --git a/docs/dfx/l2-swimlane-profiling.md b/docs/dfx/l2-swimlane-profiling.md
@@ -274,10 +274,13 @@ L2SwimlaneAicoreTaskPool[num_cores]              (per-core AICore pool state)
 [L2SwimlaneAicoreTaskBuffer × PLATFORM_AICORE_BUFFERS_PER_CORE per core]
 └── L2SwimlaneAicoreTaskRecord records[PLATFORM_AICORE_BUFFER_SIZE]  (1024 records, 32B each)
 
-[L2SwimlaneAicpuPhaseHeader + L2SwimlaneAicpuPhasePool[num_threads]]  (optional)
-├── magic / num_sched_threads
-├── core_to_thread[]  (core_id → scheduler thread index)
+[L2SwimlaneAicpuPhasePool[num_phase_threads]]  (optional)
 └── per-thread phase buffers (L2SwimlaneAicpuPhasePool aliases L2SwimlaneAicpuTaskPool)
+
+(Phase metadata — num_phase_threads, num_phase_cores, core_to_thread[] —
+ now lives inside L2SwimlaneDataHeader, not a separate cache line. The
+ old L2SwimlaneAicpuPhaseHeader struct + L2_SWIMLANE_AICPU_PHASE_MAGIC
+ gate were removed; host gates on num_phase_threads > 0 instead.)
 ```
 
 The records themselves are identical across architectures:
@@ -455,8 +458,9 @@ pushes back only the fields host actually modified (advanced
 The bulk `mirror_shm_to_device` is deliberately **not** called from
 the mgmt loop: it would race with AICPU writes to device-only
 fields (`current_buf_ptr`, `total/dropped/mismatch` counters,
-`queue_tails`, `free_queue.head`, `L2SwimlaneAicpuPhaseHeader::magic`,
-`core_to_thread[]`) and roll them back to whatever the host shadow
+`queue_tails`, `free_queue.head`,
+`L2SwimlaneDataHeader::num_phase_threads`,
+`L2SwimlaneDataHeader::core_to_thread[]`) and roll them back to whatever the host shadow
 held at the start of the tick. Per-buffer
 payloads (`L2SwimlaneAicpuTaskBuffer` / `L2SwimlaneAicpuPhaseBuffer`) are pulled on demand
 inside `ProfilerAlgorithms::process_entry` after a popped
@@ -657,10 +661,12 @@ active L2 swimlane buffer at run end. Check the AICPU flush path runs
 for every thread that produced records.
 
 **Phase records empty.** Either the runtime did not emit phase
-data (only `tensormap_and_ringbuffer` does, and only when
-`L2SwimlaneAicpuPhaseHeader::magic == L2_SWIMLANE_AICPU_PHASE_MAGIC`), or the host's
-`L2SwimlaneAicpuPhaseHeader` was not initialized. Verify the runtime sets
-the magic in its scheduler init path.
+data (only `tensormap_and_ringbuffer` does, and only when phase init
+ran — gated on `L2SwimlaneDataHeader::num_phase_threads > 0`), or the
+host did not pre-zero the field. Verify that the runtime calls
+`l2_swimlane_aicpu_init_phase()` in its scheduler init path, and
+that the host's `L2SwimlaneCollector::initialize` zero-initializes
+`num_phase_threads` / `num_phase_cores` / `core_to_thread[]`.
 
 **`dispatch_time_us` < `finish_time_us` mismatch.** Verify the runtime
 overwrites `task_id` with the full encoding on FIN

diff --git a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
@@ -142,7 +142,8 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co
 /**
  * Initialize AICPU phase profiling
  *
- * Sets up L2SwimlaneAicpuPhaseHeader and clears per-thread phase record buffers.
+ * Writes phase metadata (num_phase_threads, num_phase_cores, core_to_thread[])
+ * into L2SwimlaneDataHeader and clears per-thread phase record buffers.
  * Must be called once from thread 0 after l2_swimlane_aicpu_init().
  *
  * @param worker_count       Number of AICore workers (cores) — used to resolve

diff --git a/src/a2a3/platform/include/common/l2_swimlane_profiling.h b/src/a2a3/platform/include/common/l2_swimlane_profiling.h
@@ -20,6 +20,7 @@
  * │ L2SwimlaneDataHeader (fixed header)                         │
  * │  - ReadyQueue (FIFO, capacity=PLATFORM_PROF_READYQUEUE_SIZE)│
  * │  - num_cores, l2_swimlane_level                             │
+ * │  - num_phase_threads, num_phase_cores, core_to_thread[]     │
  * ├─────────────────────────────────────────────────────────────┤
  * │ L2SwimlaneAicpuTaskPool[0..num_cores-1]                     │
  * │  - head:       active L2SwimlaneAicpuTaskBuffer + counters  │
@@ -31,12 +32,7 @@
  * │                boundaries by bumping current_buf_seq)       │
  * │  - free_queue: SPSC ring of recycled AICore buffers         │
  * ├─────────────────────────────────────────────────────────────┤
- * │ L2SwimlaneAicpuPhaseHeader (optional, present when phase    │
- * │                             profiling is enabled)           │
- * │  - magic, num_sched_threads, records_per_thread             │
- * │  - core_to_thread mapping                                   │
- * ├─────────────────────────────────────────────────────────────┤
- * │ L2SwimlaneAicpuPhasePool[0..num_threads-1]                  │
+ * │ L2SwimlaneAicpuPhasePool[0..num_phase_threads-1]            │
  * │  - head, free_queue (same shape as AicpuTaskPool)           │
  * └─────────────────────────────────────────────────────────────┘
  *
@@ -46,8 +42,7 @@
  *
  * Base size = sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool)
  * With phases = Base + num_cores * sizeof(L2SwimlaneAicoreTaskPool)
- *                    + sizeof(L2SwimlaneAicpuPhaseHeader)
- *                    + num_threads * sizeof(L2SwimlaneAicpuPhasePool)
+ *                    + num_phase_threads * sizeof(L2SwimlaneAicpuPhasePool)
  */
 
 #ifndef SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_
@@ -380,8 +375,32 @@ struct L2SwimlaneDataHeader {
     uint32_t l2_swimlane_level;  // 0=off, 1=AICore timing, 2=+dispatch/fanout,
                                  // 3=+sched phases, 4=+orch phases. Host writes
                                  // at init; AICPU reads in l2_swimlane_aicpu_init.
+
+    // Phase profiling metadata. AICPU writes it in l2_swimlane_aicpu_init_phase
+    // and the core-assignment helpers; host reads it at drain time.
+    // num_phase_threads == 0 means phase profiling was not initialized (no
+    // phase pools to drain). Written only when l2_swimlane_level >= SCHED_PHASES.
+    uint32_t num_phase_threads;                 // Number of phase pools the AICPU initialized
+    uint32_t num_phase_cores;                   // Number of valid entries in core_to_thread (0 = unset)
+    int8_t core_to_thread[PLATFORM_MAX_CORES];  // core_id → scheduler thread index (-1 = unassigned)
 } __attribute__((aligned(64)));
 
+// ABI lock for the merged header. The phase metadata fields and the
+// core_to_thread[] array are read by both host and AICPU .so's; silent
+// layout drift between them is undetectable at runtime (no magic gate
+// anymore). Mirrors the pool-layout asserts in #939.
+static_assert(
+    offsetof(L2SwimlaneDataHeader, num_phase_threads) ==
+        offsetof(L2SwimlaneDataHeader, l2_swimlane_level) + sizeof(uint32_t),
+    "L2SwimlaneDataHeader: num_phase_threads must follow l2_swimlane_level"
+);
+static_assert(
+    offsetof(L2SwimlaneDataHeader, core_to_thread) ==
+        offsetof(L2SwimlaneDataHeader, num_phase_cores) + sizeof(uint32_t),
+    "L2SwimlaneDataHeader: core_to_thread[] must follow num_phase_cores"
+);
+static_assert(sizeof(L2SwimlaneDataHeader) % 64 == 0, "L2SwimlaneDataHeader size must be a multiple of 64 bytes");
+
 // =============================================================================
 // AICPU Phase Profiling - Scheduler and Orchestrator Records
 // =============================================================================
@@ -451,27 +470,12 @@ struct L2SwimlaneAicpuPhaseRecord {
 };
 static_assert(sizeof(L2SwimlaneAicpuPhaseRecord) == 40, "L2SwimlaneAicpuPhaseRecord layout drift");
 
-constexpr uint32_t L2_SWIMLANE_AICPU_PHASE_MAGIC = 0x41435048;  // "ACPH"
-constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 16384;        // ~512KB per thread
+constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 16384;  // ~512KB per thread
 
 // Fixed-size phase record buffer. Same TypedBuffer template as L2SwimlaneAicpuTaskBuffer
 // and L2SwimlaneAicoreTaskBuffer — keeps the drain machinery uniform.
 using L2SwimlaneAicpuPhaseBuffer = TypedBuffer<L2SwimlaneAicpuPhaseRecord, PLATFORM_PHASE_RECORDS_PER_THREAD>;
 
-/**
- * AICPU phase profiling header
- *
- * Located after the L2SwimlaneAicpuTaskPool array in shared memory.
- * Contains metadata and per-thread tracking.
- */
-struct L2SwimlaneAicpuPhaseHeader {
-    uint32_t magic;                             // Validation magic (L2_SWIMLANE_AICPU_PHASE_MAGIC)
-    uint32_t num_sched_threads;                 // Number of scheduler threads
-    uint32_t records_per_thread;                // Max records per L2SwimlaneAicpuPhaseBuffer
-    uint32_t num_cores;                         // Total number of cores with valid assignments
-    int8_t core_to_thread[PLATFORM_MAX_CORES];  // core_id → scheduler thread index (-1 = unassigned)
-} __attribute__((aligned(64)));
-
 // =============================================================================
 // Helper Functions - Memory Layout
 // =============================================================================
@@ -530,24 +534,24 @@ inline L2SwimlaneAicpuTaskPool *get_perf_buffer_state(void *base_ptr, int core_i
  * Calculate total memory size including AICore states and phase profiling
  * region (buffer states only, not the record payloads themselves).
  *
- * Layout (after the fixed L2SwimlaneDataHeader):
+ * Layout (after the fixed L2SwimlaneDataHeader, which now carries the
+ * formerly-standalone phase metadata fields):
  *   [L2SwimlaneAicpuTaskPool × num_cores]
  *   [L2SwimlaneAicoreTaskPool × num_cores]
- *   [L2SwimlaneAicpuPhaseHeader]
- *   [L2SwimlaneAicpuPhasePool × num_sched_threads]
+ *   [L2SwimlaneAicpuPhasePool × num_phase_threads]
  *
- * @param num_cores Number of AICore instances
- * @param num_sched_threads Number of phase profiling threads (scheduler + orchestrator)
+ * @param num_cores         Number of AICore instances
+ * @param num_phase_threads Number of phase profiling threads (scheduler + orchestrator)
  * @return Total bytes needed for header + all buffer states
  */
-inline size_t calc_perf_data_size_with_phases(int num_cores, int num_sched_threads) {
+inline size_t calc_perf_data_size_with_phases(int num_cores, int num_phase_threads) {
     return calc_perf_data_size(num_cores) + num_cores * sizeof(L2SwimlaneAicoreTaskPool) +
-           sizeof(L2SwimlaneAicpuPhaseHeader) + num_sched_threads * sizeof(L2SwimlaneAicpuPhasePool);
+           num_phase_threads * sizeof(L2SwimlaneAicpuPhasePool);
 }
 
 /**
  * Get L2SwimlaneAicoreTaskPool array start address (located immediately
- * after the L2SwimlaneAicpuTaskPool array, before the L2SwimlaneAicpuPhaseHeader).
+ * after the L2SwimlaneAicpuTaskPool array).
  */
 inline L2SwimlaneAicoreTaskPool *get_aicore_buffer_states(void *base_ptr, int num_cores) {
     return reinterpret_cast<L2SwimlaneAicoreTaskPool *>(
@@ -560,21 +564,14 @@ inline L2SwimlaneAicoreTaskPool *get_aicore_buffer_state(void *base_ptr, int num
 }
 
 /**
- * Get L2SwimlaneAicpuPhaseHeader pointer (located after the L2SwimlaneAicoreTaskPool array).
- */
-inline L2SwimlaneAicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) {
-    return reinterpret_cast<L2SwimlaneAicpuPhaseHeader *>(
-        reinterpret_cast<char *>(base_ptr) + calc_perf_data_size(num_cores) +
-        num_cores * sizeof(L2SwimlaneAicoreTaskPool)
-    );
-}
-
-/**
- * Get L2SwimlaneAicpuPhasePool array start address (located after L2SwimlaneAicpuPhaseHeader)
+ * Get L2SwimlaneAicpuPhasePool array start address (located immediately
+ * after the L2SwimlaneAicoreTaskPool array — the standalone phase header
+ * was merged into L2SwimlaneDataHeader).
  */
 inline L2SwimlaneAicpuPhasePool *get_phase_buffer_states(void *base_ptr, int num_cores) {
     return reinterpret_cast<L2SwimlaneAicpuPhasePool *>(
-        reinterpret_cast<char *>(get_phase_header(base_ptr, num_cores)) + sizeof(L2SwimlaneAicpuPhaseHeader)
+        reinterpret_cast<char *>(base_ptr) + calc_perf_data_size(num_cores) +
+        num_cores * sizeof(L2SwimlaneAicoreTaskPool)
     );
 }
 

diff --git a/src/a2a3/platform/include/host/l2_swimlane_collector.h b/src/a2a3/platform/include/host/l2_swimlane_collector.h
@@ -179,11 +179,17 @@ struct L2SwimlaneModule {
             cb(/*kind=*/2, &ac_state->free_queue, sizeof(L2SwimlaneAicoreTaskBuffer));
         }
 
-        // Per-thread phase states (kind 1) — gated on L2SwimlaneAicpuPhaseHeader being
-        // initialized (runtimes that don't emit phase records leave it zero).
-        L2SwimlaneAicpuPhaseHeader *ph = get_phase_header(shm, num_cores);
-        const int num_phase_threads =
-            (ph->magic == L2_SWIMLANE_AICPU_PHASE_MAGIC) ? static_cast<int>(ph->num_sched_threads) : 0;
+        // Per-thread phase states (kind 1) — gated on num_phase_threads in
+        // the root header (runtimes that don't emit phase records leave it
+        // zero; AICPU sets it inside l2_swimlane_aicpu_init_phase). A value
+        // above PLATFORM_MAX_AICPU_THREADS is treated as 0 (not clamped to
+        // the maximum), in addition to the host-side zero-init, so a corrupted
+        // device-shared value can't walk off the pool array. Mirrors the
+        // same check in read_phase_header_metadata.
+        int num_phase_threads = static_cast<int>(header->num_phase_threads);
+        if (num_phase_threads > PLATFORM_MAX_AICPU_THREADS) {
+            num_phase_threads = 0;
+        }
         for (int t = 0; t < num_phase_threads; t++) {
             L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm, num_cores, t);
             cb(/*kind=*/1, &state->free_queue, sizeof(L2SwimlaneAicpuPhaseBuffer));
@@ -219,7 +225,7 @@ using L2SwimlaneFreeCallback = profiling_common::ProfFreeCallback;
  *                                    (mgmt first so its final-drain entries
  *                                    have a consumer).
  *   5. read_phase_header_metadata() — single-shot read of the core→thread
- *                                    mapping from L2SwimlaneAicpuPhaseHeader.
+ *                                    mapping from L2SwimlaneDataHeader.
  *   6. reconcile_counters()        — device-side three-bucket accounting for
  *                                    both PERF and PHASE pools (total /
  *                                    collected / dropped).
@@ -327,7 +333,7 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase<L2SwimlaneColl
     void *get_aicore_ring_addr_table_device_ptr() const { return aicore_ring_addr_table_dev_; }
 
     /**
-     * Read AICPU phase metadata that lives in L2SwimlaneAicpuPhaseHeader (not on the
+     * Read AICPU phase metadata that lives in L2SwimlaneDataHeader (not on the
      * buffer pipeline): the core→thread mapping plus a has-data signal
      * derived from accumulated per-event records. Single-shot — must be
      * called after stop() so the shm region has settled.

diff --git a/src/a2a3/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp
@@ -28,9 +28,13 @@
 #include "common/platform_config.h"
 #include "common/unified_log.h"
 
-// Cached pointers for hot-path access (set during init)
-static L2SwimlaneAicpuPhaseHeader *s_l2_swimlane_aicpu_phase_header = nullptr;
+// Cached pointers for hot-path access (set during init). Phase metadata
+// (num_phase_threads, num_phase_cores, core_to_thread[]) lives inside
+// L2SwimlaneDataHeader after the phase-header merge; we keep a separate
+// bool so phase-gated paths can check init-ran without re-reading the
+// device-shared header.
 static L2SwimlaneDataHeader *s_l2_swimlane_header = nullptr;
+static bool s_phase_initialized = false;
 
 // Per-core L2SwimlaneAicpuTaskPool cache
 static L2SwimlaneAicpuTaskPool *s_aicpu_task_pools[PLATFORM_MAX_CORES] = {};
@@ -127,6 +131,15 @@ static int enqueue_ready_buffer(
 }
 
 void l2_swimlane_aicpu_init(int worker_count) {
+    // Reset cross-launch state up front. AICPU statics persist across launches
+    // on the same loaded .so; without this reset, an enabled→disabled launch
+    // sequence would leave s_phase_initialized=true from the prior run, and
+    // any subsequent record_phase call would dereference the prior launch's
+    // (now-freed) s_aicpu_phase_pools pointers. Same shape as the
+    // [[block_local]] reset in onboard/aicore/kernel.cpp for the AICore-side
+    // rotation slot (fixed in #936).
+    s_phase_initialized = false;
+
     void *l2_swimlane_base = reinterpret_cast<void *>(g_platform_l2_swimlane_base);
     if (l2_swimlane_base == nullptr) {
         LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize profiling");
@@ -593,17 +606,12 @@ void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads) {
         return;
     }
 
-    s_l2_swimlane_aicpu_phase_header = get_phase_header(l2_swimlane_base, worker_count);
     s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base);
 
-    s_l2_swimlane_aicpu_phase_header->magic = L2_SWIMLANE_AICPU_PHASE_MAGIC;
-    s_l2_swimlane_aicpu_phase_header->num_sched_threads = num_sched_threads;
-    s_l2_swimlane_aicpu_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD;
-    s_l2_swimlane_aicpu_phase_header->num_cores = 0;
-
-    memset(
-        s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread)
-    );
+    s_l2_swimlane_header->num_phase_threads = num_sched_threads;
+    s_l2_swimlane_header->num_phase_cores = 0;
+    memset(s_l2_swimlane_header->core_to_thread, -1, sizeof(s_l2_swimlane_header->core_to_thread));
+    s_phase_initialized = true;
 
     // Cache per-thread record pointers and clear buffers
     // Include all threads: scheduler + orchestrator (orchestrators may become schedulers)
@@ -719,7 +727,7 @@ void l2_swimlane_aicpu_record_phase(
     int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
     uint64_t tasks_processed, uint32_t extra1, uint32_t extra2
 ) {
-    if (s_l2_swimlane_aicpu_phase_header == nullptr) {
+    if (!s_phase_initialized) {
         return;
     }
 
@@ -794,12 +802,12 @@ void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx =
 void l2_swimlane_aicpu_record_orch_phase(
     L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
 ) {
-    if (s_orch_thread_idx < 0 || s_l2_swimlane_aicpu_phase_header == nullptr) return;
+    if (s_orch_thread_idx < 0 || !s_phase_initialized) return;
     l2_swimlane_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id);
 }
 
 void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) {
-    if (s_l2_swimlane_aicpu_phase_header == nullptr || s_l2_swimlane_header == nullptr) {
+    if (!s_phase_initialized || s_l2_swimlane_header == nullptr) {
         return;
     }
 
@@ -835,25 +843,23 @@ void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) {
 }
 
 void l2_swimlane_aicpu_init_core_assignments(int total_cores) {
-    if (s_l2_swimlane_aicpu_phase_header == nullptr) {
+    if (!s_phase_initialized) {
         return;
     }
-    memset(
-        s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread)
-    );
-    s_l2_swimlane_aicpu_phase_header->num_cores = static_cast<uint32_t>(total_cores);
+    memset(s_l2_swimlane_header->core_to_thread, -1, sizeof(s_l2_swimlane_header->core_to_thread));
+    s_l2_swimlane_header->num_phase_cores = static_cast<uint32_t>(total_cores);
     wmb();
     LOG_INFO_V0("Core-to-thread mapping init: %d cores", total_cores);
 }
 
 void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) {
-    if (s_l2_swimlane_aicpu_phase_header == nullptr) {
+    if (!s_phase_initialized) {
         return;
     }
     for (int i = 0; i < core_num; i++) {
         int core_id = core_ids[i];
         if (core_id >= 0 && core_id < PLATFORM_MAX_CORES) {
-            s_l2_swimlane_aicpu_phase_header->core_to_thread[core_id] = static_cast<int8_t>(thread_idx);
+            s_l2_swimlane_header->core_to_thread[core_id] = static_cast<int8_t>(thread_idx);
         }
     }
     wmb();