Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 15 additions & 9 deletions docs/dfx/l2-swimlane-profiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -274,10 +274,13 @@ L2SwimlaneAicoreTaskPool[num_cores] (per-core AICore pool state)
[L2SwimlaneAicoreTaskBuffer × PLATFORM_AICORE_BUFFERS_PER_CORE per core]
└── L2SwimlaneAicoreTaskRecord records[PLATFORM_AICORE_BUFFER_SIZE] (1024 records, 32B each)

[L2SwimlaneAicpuPhaseHeader + L2SwimlaneAicpuPhasePool[num_threads]] (optional)
├── magic / num_sched_threads
├── core_to_thread[] (core_id → scheduler thread index)
[L2SwimlaneAicpuPhasePool[num_phase_threads]] (optional)
└── per-thread phase buffers (L2SwimlaneAicpuPhasePool aliases L2SwimlaneAicpuTaskPool)

(Phase metadata — num_phase_threads, num_phase_cores, core_to_thread[] —
now lives inside L2SwimlaneDataHeader, not a separate cache line. The
old L2SwimlaneAicpuPhaseHeader struct + L2_SWIMLANE_AICPU_PHASE_MAGIC
gate were removed; host gates on num_phase_threads > 0 instead.)
```

The records themselves are identical across architectures:
Expand Down Expand Up @@ -455,8 +458,9 @@ pushes back only the fields host actually modified (advanced
The bulk `mirror_shm_to_device` is deliberately **not** called from
the mgmt loop: it would race with AICPU writes to device-only
fields (`current_buf_ptr`, `total/dropped/mismatch` counters,
`queue_tails`, `free_queue.head`, `L2SwimlaneAicpuPhaseHeader::magic`,
`core_to_thread[]`) and roll them back to whatever the host shadow
`queue_tails`, `free_queue.head`,
`L2SwimlaneDataHeader::num_phase_threads`,
`L2SwimlaneDataHeader::core_to_thread[]`) and roll them back to whatever the host shadow
held at the start of the tick. Per-buffer
payloads (`L2SwimlaneAicpuTaskBuffer` / `L2SwimlaneAicpuPhaseBuffer`) are pulled on demand
inside `ProfilerAlgorithms::process_entry` after a popped
Expand Down Expand Up @@ -657,10 +661,12 @@ active L2 swimlane buffer at run end. Check the AICPU flush path runs
for every thread that produced records.

**Phase records empty.** Either the runtime did not emit phase
data (only `tensormap_and_ringbuffer` does, and only when
`L2SwimlaneAicpuPhaseHeader::magic == L2_SWIMLANE_AICPU_PHASE_MAGIC`), or the host's
`L2SwimlaneAicpuPhaseHeader` was not initialized. Verify the runtime sets
the magic in its scheduler init path.
data (only `tensormap_and_ringbuffer` does, and only when phase init
ran — gated on `L2SwimlaneDataHeader::num_phase_threads > 0`), or the
host did not pre-zero the field. Verify the runtime calls
`l2_swimlane_aicpu_init_phase()` in its scheduler init path; check
the host's `L2SwimlaneCollector::initialize` zero-inits
`num_phase_threads` / `num_phase_cores` / `core_to_thread[]`.

**`dispatch_time_us` < `finish_time_us` mismatch.** Verify the runtime
overwrites `task_id` with the full encoding on FIN
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ void l2_swimlane_aicpu_flush(int thread_idx, const int *cur_thread_cores, int co
/**
* Initialize AICPU phase profiling
*
* Sets up L2SwimlaneAicpuPhaseHeader and clears per-thread phase record buffers.
* Writes phase metadata (num_phase_threads, num_phase_cores, core_to_thread[])
* into L2SwimlaneDataHeader and clears per-thread phase record buffers.
* Must be called once from thread 0 after l2_swimlane_aicpu_init().
*
* @param worker_count Number of AICore workers (cores) — used to resolve
Expand Down
85 changes: 41 additions & 44 deletions src/a2a3/platform/include/common/l2_swimlane_profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
* │ L2SwimlaneDataHeader (fixed header) │
* │ - ReadyQueue (FIFO, capacity=PLATFORM_PROF_READYQUEUE_SIZE)│
* │ - num_cores, l2_swimlane_level │
* │ - num_phase_threads, num_phase_cores, core_to_thread[] │
* ├─────────────────────────────────────────────────────────────┤
* │ L2SwimlaneAicpuTaskPool[0..num_cores-1] │
* │ - head: active L2SwimlaneAicpuTaskBuffer + counters │
Expand All @@ -31,12 +32,7 @@
* │ boundaries by bumping current_buf_seq) │
* │ - free_queue: SPSC ring of recycled AICore buffers │
* ├─────────────────────────────────────────────────────────────┤
* │ L2SwimlaneAicpuPhaseHeader (optional, present when phase │
* │ profiling is enabled) │
* │ - magic, num_sched_threads, records_per_thread │
* │ - core_to_thread mapping │
* ├─────────────────────────────────────────────────────────────┤
* │ L2SwimlaneAicpuPhasePool[0..num_threads-1] │
* │ L2SwimlaneAicpuPhasePool[0..num_phase_threads-1] │
* │ - head, free_queue (same shape as AicpuTaskPool) │
* └─────────────────────────────────────────────────────────────┘
*
Expand All @@ -46,8 +42,7 @@
*
* Base size = sizeof(L2SwimlaneDataHeader) + num_cores * sizeof(L2SwimlaneAicpuTaskPool)
* With phases = Base + num_cores * sizeof(L2SwimlaneAicoreTaskPool)
* + sizeof(L2SwimlaneAicpuPhaseHeader)
* + num_threads * sizeof(L2SwimlaneAicpuPhasePool)
* + num_phase_threads * sizeof(L2SwimlaneAicpuPhasePool)
*/

#ifndef SRC_A2A3_PLATFORM_INCLUDE_COMMON_L2_SWIMLANE_PROFILING_H_
Expand Down Expand Up @@ -380,8 +375,32 @@ struct L2SwimlaneDataHeader {
uint32_t l2_swimlane_level; // 0=off, 1=AICore timing, 2=+dispatch/fanout,
// 3=+sched phases, 4=+orch phases. Host writes
// at init; AICPU reads in l2_swimlane_aicpu_init.

// Phase profiling metadata (AICPU writes in l2_swimlane_aicpu_init_phase;
// Host reads at drain time). num_phase_threads == 0 means phase profiling
// was not initialized (no phase pools to drain). Gated by
// l2_swimlane_level >= SCHED_PHASES at write time.
uint32_t num_phase_threads; // Number of phase pools the AICPU initialized
uint32_t num_phase_cores; // Number of valid entries in core_to_thread (0 = unset)
int8_t core_to_thread[PLATFORM_MAX_CORES]; // core_id → scheduler thread index (-1 = unassigned)
} __attribute__((aligned(64)));

// ABI lock for the merged header. The phase metadata fields and the
// core_to_thread[] array are read by both host and AICPU .so's; silent
// layout drift between them is undetectable at runtime (no magic gate
// anymore). Mirrors the pool-layout asserts in #939.
//
// The three offset asserts form an unbroken chain
//   l2_swimlane_level -> num_phase_threads -> num_phase_cores -> core_to_thread[]
// so inserting or reordering a field anywhere inside the phase metadata
// block fails the build instead of silently shifting the layout.
static_assert(
    offsetof(L2SwimlaneDataHeader, num_phase_threads) ==
        offsetof(L2SwimlaneDataHeader, l2_swimlane_level) + sizeof(uint32_t),
    "L2SwimlaneDataHeader: num_phase_threads must follow l2_swimlane_level"
);
static_assert(
    offsetof(L2SwimlaneDataHeader, num_phase_cores) ==
        offsetof(L2SwimlaneDataHeader, num_phase_threads) + sizeof(uint32_t),
    "L2SwimlaneDataHeader: num_phase_cores must follow num_phase_threads"
);
static_assert(
    offsetof(L2SwimlaneDataHeader, core_to_thread) ==
        offsetof(L2SwimlaneDataHeader, num_phase_cores) + sizeof(uint32_t),
    "L2SwimlaneDataHeader: core_to_thread[] must follow num_phase_cores"
);
// This checks the struct's size, not the alignment of any particular
// instance, so the message says exactly that.
static_assert(
    sizeof(L2SwimlaneDataHeader) % 64 == 0, "L2SwimlaneDataHeader size must be a multiple of 64 bytes"
);

// =============================================================================
// AICPU Phase Profiling - Scheduler and Orchestrator Records
// =============================================================================
Expand Down Expand Up @@ -451,27 +470,12 @@ struct L2SwimlaneAicpuPhaseRecord {
};
static_assert(sizeof(L2SwimlaneAicpuPhaseRecord) == 40, "L2SwimlaneAicpuPhaseRecord layout drift");

constexpr uint32_t L2_SWIMLANE_AICPU_PHASE_MAGIC = 0x41435048; // "ACPH"
constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 16384; // ~512KB per thread
constexpr int PLATFORM_PHASE_RECORDS_PER_THREAD = 16384; // ~512KB per thread

// Fixed-size phase record buffer. Same TypedBuffer template as L2SwimlaneAicpuTaskBuffer
// and L2SwimlaneAicoreTaskBuffer — keeps the drain machinery uniform.
using L2SwimlaneAicpuPhaseBuffer = TypedBuffer<L2SwimlaneAicpuPhaseRecord, PLATFORM_PHASE_RECORDS_PER_THREAD>;

/**
 * AICPU phase profiling header
 *
 * Located after the L2SwimlaneAicoreTaskPool array in shared memory (see
 * get_phase_header) and immediately followed by the L2SwimlaneAicpuPhasePool
 * array (see get_phase_buffer_states).
 *
 * Written by the AICPU in l2_swimlane_aicpu_init_phase; the host treats the
 * header as valid only when `magic` equals L2_SWIMLANE_AICPU_PHASE_MAGIC.
 * The struct is 64-byte aligned via the attribute below.
 */
struct L2SwimlaneAicpuPhaseHeader {
uint32_t magic; // Validation magic (L2_SWIMLANE_AICPU_PHASE_MAGIC); host reads it to decide whether phase pools exist
uint32_t num_sched_threads; // Number of phase pools/threads, as passed to l2_swimlane_aicpu_init_phase
uint32_t records_per_thread; // Max records per L2SwimlaneAicpuPhaseBuffer (set to PLATFORM_PHASE_RECORDS_PER_THREAD)
uint32_t num_cores; // Total number of cores with valid assignments; 0 until l2_swimlane_aicpu_init_core_assignments runs
int8_t core_to_thread[PLATFORM_MAX_CORES]; // core_id → scheduler thread index (-1 = unassigned; filled with -1 at init)
} __attribute__((aligned(64)));

// =============================================================================
// Helper Functions - Memory Layout
// =============================================================================
Expand Down Expand Up @@ -530,24 +534,24 @@ inline L2SwimlaneAicpuTaskPool *get_perf_buffer_state(void *base_ptr, int core_i
* Calculate total memory size including AICore states and phase profiling
* region (buffer states only, not the record payloads themselves).
*
* Layout (after the fixed L2SwimlaneDataHeader):
* Layout (after the fixed L2SwimlaneDataHeader, which now carries the
* formerly-standalone phase metadata fields):
* [L2SwimlaneAicpuTaskPool × num_cores]
* [L2SwimlaneAicoreTaskPool × num_cores]
* [L2SwimlaneAicpuPhaseHeader]
* [L2SwimlaneAicpuPhasePool × num_sched_threads]
* [L2SwimlaneAicpuPhasePool × num_phase_threads]
*
* @param num_cores Number of AICore instances
* @param num_sched_threads Number of phase profiling threads (scheduler + orchestrator)
* @param num_cores Number of AICore instances
* @param num_phase_threads Number of phase profiling threads (scheduler + orchestrator)
* @return Total bytes needed for header + all buffer states
*/
inline size_t calc_perf_data_size_with_phases(int num_cores, int num_sched_threads) {
inline size_t calc_perf_data_size_with_phases(int num_cores, int num_phase_threads) {
return calc_perf_data_size(num_cores) + num_cores * sizeof(L2SwimlaneAicoreTaskPool) +
sizeof(L2SwimlaneAicpuPhaseHeader) + num_sched_threads * sizeof(L2SwimlaneAicpuPhasePool);
num_phase_threads * sizeof(L2SwimlaneAicpuPhasePool);
}

/**
* Get L2SwimlaneAicoreTaskPool array start address (located immediately
* after the L2SwimlaneAicpuTaskPool array, before the L2SwimlaneAicpuPhaseHeader).
* after the L2SwimlaneAicpuTaskPool array).
*/
inline L2SwimlaneAicoreTaskPool *get_aicore_buffer_states(void *base_ptr, int num_cores) {
return reinterpret_cast<L2SwimlaneAicoreTaskPool *>(
Expand All @@ -560,21 +564,14 @@ inline L2SwimlaneAicoreTaskPool *get_aicore_buffer_state(void *base_ptr, int num
}

/**
 * Locate the L2SwimlaneAicpuPhaseHeader inside the shared-memory region.
 *
 * The header sits directly behind the L2SwimlaneAicoreTaskPool array, i.e.
 * calc_perf_data_size(num_cores) bytes plus
 * num_cores * sizeof(L2SwimlaneAicoreTaskPool) bytes past the region start.
 *
 * @param base_ptr  Start of the shared-memory region
 * @param num_cores Core count used to size the two per-core pool arrays
 * @return Pointer to the phase header
 */
inline L2SwimlaneAicpuPhaseHeader *get_phase_header(void *base_ptr, int num_cores) {
    char *const region_start = reinterpret_cast<char *>(base_ptr);
    // Skip the fixed header and the per-core AICPU task pools.
    char *const aicore_pools_begin = region_start + calc_perf_data_size(num_cores);
    // Skip the per-core AICore task pools; the phase header starts here.
    char *const aicore_pools_end = aicore_pools_begin + num_cores * sizeof(L2SwimlaneAicoreTaskPool);
    return reinterpret_cast<L2SwimlaneAicpuPhaseHeader *>(aicore_pools_end);
}

/**
* Get L2SwimlaneAicpuPhasePool array start address (located after L2SwimlaneAicpuPhaseHeader)
* Get L2SwimlaneAicpuPhasePool array start address (located immediately
* after the L2SwimlaneAicoreTaskPool array — the standalone phase header
* was merged into L2SwimlaneDataHeader).
*/
inline L2SwimlaneAicpuPhasePool *get_phase_buffer_states(void *base_ptr, int num_cores) {
return reinterpret_cast<L2SwimlaneAicpuPhasePool *>(
reinterpret_cast<char *>(get_phase_header(base_ptr, num_cores)) + sizeof(L2SwimlaneAicpuPhaseHeader)
reinterpret_cast<char *>(base_ptr) + calc_perf_data_size(num_cores) +
num_cores * sizeof(L2SwimlaneAicoreTaskPool)
);
}

Expand Down
20 changes: 13 additions & 7 deletions src/a2a3/platform/include/host/l2_swimlane_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,17 @@ struct L2SwimlaneModule {
cb(/*kind=*/2, &ac_state->free_queue, sizeof(L2SwimlaneAicoreTaskBuffer));
}

// Per-thread phase states (kind 1) — gated on L2SwimlaneAicpuPhaseHeader being
// initialized (runtimes that don't emit phase records leave it zero).
L2SwimlaneAicpuPhaseHeader *ph = get_phase_header(shm, num_cores);
const int num_phase_threads =
(ph->magic == L2_SWIMLANE_AICPU_PHASE_MAGIC) ? static_cast<int>(ph->num_sched_threads) : 0;
// Per-thread phase states (kind 1) — gated on num_phase_threads in
// the root header (runtimes that don't emit phase records leave it
// zero; AICPU sets it inside l2_swimlane_aicpu_init_phase). In
// addition to the host-side zero-init, range-check the value against
// PLATFORM_MAX_AICPU_THREADS: an out-of-range count is treated as
// "no phase pools" (reset to 0, not clamped to the maximum), so a
// corrupted device-shared value can't walk off the pool array.
// Mirrors the same check in read_phase_header_metadata.
int num_phase_threads = static_cast<int>(header->num_phase_threads);
if (num_phase_threads > PLATFORM_MAX_AICPU_THREADS) {
num_phase_threads = 0;
}
for (int t = 0; t < num_phase_threads; t++) {
L2SwimlaneAicpuPhasePool *state = get_phase_buffer_state(shm, num_cores, t);
cb(/*kind=*/1, &state->free_queue, sizeof(L2SwimlaneAicpuPhaseBuffer));
Expand Down Expand Up @@ -219,7 +225,7 @@ using L2SwimlaneFreeCallback = profiling_common::ProfFreeCallback;
* (mgmt first so its final-drain entries
* have a consumer).
* 5. read_phase_header_metadata() — single-shot read of the core→thread
* mapping from L2SwimlaneAicpuPhaseHeader.
* mapping from L2SwimlaneDataHeader.
* 6. reconcile_counters() — device-side three-bucket accounting for
* both PERF and PHASE pools (total /
* collected / dropped).
Expand Down Expand Up @@ -327,7 +333,7 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase<L2SwimlaneColl
void *get_aicore_ring_addr_table_device_ptr() const { return aicore_ring_addr_table_dev_; }

/**
* Read AICPU phase metadata that lives in L2SwimlaneAicpuPhaseHeader (not on the
* Read AICPU phase metadata that lives in L2SwimlaneDataHeader (not on the
* buffer pipeline): the core→thread mapping plus a has-data signal
* derived from accumulated per-event records. Single-shot — must be
* called after stop() so the shm region has settled.
Expand Down
48 changes: 27 additions & 21 deletions src/a2a3/platform/src/aicpu/l2_swimlane_collector_aicpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,13 @@
#include "common/platform_config.h"
#include "common/unified_log.h"

// Cached pointers for hot-path access (set during init)
static L2SwimlaneAicpuPhaseHeader *s_l2_swimlane_aicpu_phase_header = nullptr;
// Cached pointers for hot-path access (set during init). Phase metadata
// (num_phase_threads, num_phase_cores, core_to_thread[]) lives inside
// L2SwimlaneDataHeader after the phase-header merge; we keep a separate
// bool so phase-gated paths can check init-ran without re-reading the
// device-shared header.
static L2SwimlaneDataHeader *s_l2_swimlane_header = nullptr;
static bool s_phase_initialized = false;

// Per-core L2SwimlaneAicpuTaskPool cache
static L2SwimlaneAicpuTaskPool *s_aicpu_task_pools[PLATFORM_MAX_CORES] = {};
Expand Down Expand Up @@ -127,6 +131,15 @@ static int enqueue_ready_buffer(
}

void l2_swimlane_aicpu_init(int worker_count) {
// Reset cross-launch state up front. AICPU statics persist across launches
// on the same loaded .so; without this reset, an enabled→disabled launch
// sequence would leave s_phase_initialized=true from the prior run, and
// any subsequent record_phase call would dereference the prior launch's
// (now-freed) s_aicpu_phase_pools pointers. Same shape as the
// [[block_local]] reset in onboard/aicore/kernel.cpp for the AICore-side
// rotation slot (fixed in #936).
s_phase_initialized = false;

void *l2_swimlane_base = reinterpret_cast<void *>(g_platform_l2_swimlane_base);
if (l2_swimlane_base == nullptr) {
LOG_ERROR("l2_swimlane_data_base is NULL, cannot initialize profiling");
Expand Down Expand Up @@ -593,17 +606,12 @@ void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_threads) {
return;
}

s_l2_swimlane_aicpu_phase_header = get_phase_header(l2_swimlane_base, worker_count);
s_l2_swimlane_header = get_l2_swimlane_header(l2_swimlane_base);

s_l2_swimlane_aicpu_phase_header->magic = L2_SWIMLANE_AICPU_PHASE_MAGIC;
s_l2_swimlane_aicpu_phase_header->num_sched_threads = num_sched_threads;
s_l2_swimlane_aicpu_phase_header->records_per_thread = PLATFORM_PHASE_RECORDS_PER_THREAD;
s_l2_swimlane_aicpu_phase_header->num_cores = 0;

memset(
s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread)
);
s_l2_swimlane_header->num_phase_threads = num_sched_threads;
s_l2_swimlane_header->num_phase_cores = 0;
memset(s_l2_swimlane_header->core_to_thread, -1, sizeof(s_l2_swimlane_header->core_to_thread));
s_phase_initialized = true;
Comment thread
hw-native-sys-bot marked this conversation as resolved.

// Cache per-thread record pointers and clear buffers
// Include all threads: scheduler + orchestrator (orchestrators may become schedulers)
Expand Down Expand Up @@ -719,7 +727,7 @@ void l2_swimlane_aicpu_record_phase(
int thread_idx, L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t loop_iter,
uint64_t tasks_processed, uint32_t extra1, uint32_t extra2
) {
if (s_l2_swimlane_aicpu_phase_header == nullptr) {
if (!s_phase_initialized) {
return;
}

Expand Down Expand Up @@ -794,12 +802,12 @@ void l2_swimlane_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx =
void l2_swimlane_aicpu_record_orch_phase(
L2SwimlaneAicpuPhaseId phase_id, uint64_t start_time, uint64_t end_time, uint32_t submit_idx, uint64_t task_id
) {
if (s_orch_thread_idx < 0 || s_l2_swimlane_aicpu_phase_header == nullptr) return;
if (s_orch_thread_idx < 0 || !s_phase_initialized) return;
l2_swimlane_aicpu_record_phase(s_orch_thread_idx, phase_id, start_time, end_time, submit_idx, task_id);
}

void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) {
if (s_l2_swimlane_aicpu_phase_header == nullptr || s_l2_swimlane_header == nullptr) {
if (!s_phase_initialized || s_l2_swimlane_header == nullptr) {
return;
}

Expand Down Expand Up @@ -835,25 +843,23 @@ void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) {
}

void l2_swimlane_aicpu_init_core_assignments(int total_cores) {
if (s_l2_swimlane_aicpu_phase_header == nullptr) {
if (!s_phase_initialized) {
return;
}
memset(
s_l2_swimlane_aicpu_phase_header->core_to_thread, -1, sizeof(s_l2_swimlane_aicpu_phase_header->core_to_thread)
);
s_l2_swimlane_aicpu_phase_header->num_cores = static_cast<uint32_t>(total_cores);
memset(s_l2_swimlane_header->core_to_thread, -1, sizeof(s_l2_swimlane_header->core_to_thread));
s_l2_swimlane_header->num_phase_cores = static_cast<uint32_t>(total_cores);
wmb();
LOG_INFO_V0("Core-to-thread mapping init: %d cores", total_cores);
}

void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num) {
if (s_l2_swimlane_aicpu_phase_header == nullptr) {
if (!s_phase_initialized) {
return;
}
for (int i = 0; i < core_num; i++) {
int core_id = core_ids[i];
if (core_id >= 0 && core_id < PLATFORM_MAX_CORES) {
s_l2_swimlane_aicpu_phase_header->core_to_thread[core_id] = static_cast<int8_t>(thread_idx);
s_l2_swimlane_header->core_to_thread[core_id] = static_cast<int8_t>(thread_idx);
}
}
wmb();
Expand Down
Loading
Loading