Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/a2a3/platform/include/host/l2_swimlane_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,14 @@ struct L2SwimlaneModule {
const int num_cores = static_cast<int>(header->num_cores);
const L2SwimlaneBufferKind kind = entry.kind;

// Validate kind first — without this, an out-of-range value silently
// falls into the AicoreTask branch below and reads a wrong-typed pool.
if (kind != L2SwimlaneBufferKind::AicpuTask && kind != L2SwimlaneBufferKind::AicpuPhase &&
kind != L2SwimlaneBufferKind::AicoreTask) {
LOG_ERROR("L2SwimlaneModule: invalid entry kind=%u", static_cast<uint32_t>(kind));
return std::nullopt;
}

if (kind == L2SwimlaneBufferKind::AicpuPhase) {
if (entry.core_index >= static_cast<uint32_t>(PLATFORM_MAX_AICPU_THREADS)) {
LOG_ERROR("L2SwimlaneModule: invalid phase entry: thread=%u", entry.core_index);
Expand Down
20 changes: 18 additions & 2 deletions src/a2a3/platform/onboard/aicore/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,21 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ KernelA
// executor runs. AICore reads via get_aicore_profiling_flag() /
// get_l2_swimlane_aicore_rotation() — never touches Handshake for profiling.
set_aicore_profiling_flag(k_args->enable_profiling_flag);
if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)) {
// Always publish the rotation slot (nullptr when this launch is disabled
// or has no rotation table). [[block_local]] storage persists across
// launches on the same loaded kernel binary, so without an explicit
// nullptr publication a sequence like enabled(valid)→enabled(NULL table)
// or enabled→disabled would leave `get_l2_swimlane_aicore_rotation()`
// returning the prior launch's freed pointer. AICore call sites are
// additionally flag-gated on this-launch `l2_swimlane_enabled`, so the
// disabled-launch read path is currently unreachable through the
// executors; the unconditional reset is defensive against future call
// sites that don't carry that gate. Mirrors the publish-nullptr branch
// in sim/aicore/kernel.cpp (sim keys only on the table pointer; onboard
// additionally AND-gates on PROFILING_FLAG_L2_SWIMLANE — intentional,
// since the onboard table is shared across collectors).
if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE) &&
k_args->l2_swimlane_aicore_rotation_table != 0) {
// Stash only the slot pointer. The slot CONTENTS are written by
// AICPU's `l2_swimlane_aicpu_init` which runs concurrently with this
// entry; dereferencing here would race with AICPU's write. The
Expand All @@ -113,7 +127,9 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ KernelA
// done and the slot is populated.
__gm__ uint64_t *rotation_table =
reinterpret_cast<__gm__ uint64_t *>(k_args->l2_swimlane_aicore_rotation_table);
set_l2_swimlane_aicore_rotation_slot(rotation_table != nullptr ? &rotation_table[block_idx] : nullptr);
set_l2_swimlane_aicore_rotation_slot(&rotation_table[block_idx]);
} else {
set_l2_swimlane_aicore_rotation_slot(nullptr);
}

aicore_execute(k_args->runtime_args, block_idx, core_type);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -841,13 +841,21 @@ int32_t SchedulerContext::init(
// l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory
// header — must be called BEFORE caching the level, otherwise the cached
// value would still be 0 (only the binary enable bit has been seeded by
// kernel.cpp at this point).
// kernel.cpp at this point). Reset the cached level on disabled runs so a
// prior enabled launch's level can't leak into the phase-record gates in
// scheduler_dispatch.
if (is_l2_swimlane_enabled()) {
l2_swimlane_aicpu_init(runtime->worker_count);
l2_swimlane_level_ = get_l2_swimlane_level();
if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_thread_num_);
// When orchestrator phases merge into scheduler threads, phase
// records flow through aicpu_thread_num_ pools — matches the same
// branch in dump_tensor_init (scheduler_dispatch.cpp).
const int phase_threads = orch_to_sched_ ? aicpu_thread_num_ : sched_thread_num_;
l2_swimlane_aicpu_init_phase(runtime->worker_count, phase_threads);
}
} else {
l2_swimlane_level_ = L2SwimlaneLevel::DISABLED;
}
#endif

Expand Down
Loading