Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 27 additions & 11 deletions src/a2a3/platform/include/aicore/aicore_profiling_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,18 @@
*
* Lifecycle:
* 1. Host fills `KernelArgs::enable_profiling_flag` and
* `KernelArgs::aicore_ring_addr` (now points to a per-core
* `AicoreRotation` table).
* 2. AICore kernel entry indexes `aicore_ring_addr[block_idx]` for this
* core's `AicoreRotation*` and calls `set_aicore_profiling_flag()` +
* `set_aicore_rotation()` before invoking `aicore_execute`.
* 3. `aicore_execute` and downstream profiling helpers read via getters.
* `KernelArgs::aicore_ring_addr` (points to a per-core `AicoreRotation`
* device-address table). Host allocates the table bytes; AICPU populates
* the entries inside `l2_perf_aicpu_init`.
* 2. AICore kernel entry stashes `&aicore_ring_addr[block_idx]` (the slot
* pointer — NOT the dereferenced rotation pointer yet) via
* `set_aicore_rotation_slot()`, and calls `set_aicore_profiling_flag()`,
* before invoking `aicore_execute`.
* 3. `get_aicore_rotation()` lazily dereferences the slot the first time
* it is called. Callers must defer the call until AFTER AICPU has
* dispatched the first task (so AICPU init has had a chance to populate
* the table). The executor handles this by calling it inside the main
* loop's first-task branch.
*/

#ifndef PLATFORM_AICORE_AICORE_PROFILING_STATE_H_
Expand All @@ -50,12 +56,22 @@ __aicore__ void set_aicore_profiling_flag(uint32_t flag);
__aicore__ uint32_t get_aicore_profiling_flag();

/**
* Per-core AICore rotation channel. Set once at kernel entry from
* `((uint64_t*)k_args->aicore_ring_addr)[block_idx]`; nullptr when the L2
* swimlane bit is off or the address table itself is null. AICore reads
* this cache line per task to pick up the current L2PerfAicoreBuffer.
* Per-core AICore rotation channel.
*
* `set_aicore_rotation_slot(slot)` stashes the address of THIS core's slot
* in the rotation-address table — `&((uint64_t*)k_args->aicore_ring_addr)[block_idx]`.
* No dereference happens here, because at kernel entry the AICPU side may
* not yet have populated the table (the host launches both kernels and
* AICPU's init runs concurrently with AICore's entry).
*
* `get_aicore_rotation()` lazily dereferences the stashed slot on first use,
* caches the result, and returns it on subsequent calls. Callers MUST defer
* the first call until after AICPU has dispatched the first task — by then
* AICPU's init has completed and the slot holds a valid device address.
* The executor's main loop honours this by reading the rotation only inside
* the first-task branch of the dispatch poll.
*/
__aicore__ void set_aicore_rotation(__gm__ AicoreRotation *rotation);
__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr);
__aicore__ __gm__ AicoreRotation *get_aicore_rotation();

#endif // PLATFORM_AICORE_AICORE_PROFILING_STATE_H_
10 changes: 10 additions & 0 deletions src/a2a3/platform/include/aicpu/l2_perf_collector_aicpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,16 @@ extern "C" uint64_t get_platform_l2_perf_base();
extern "C" void set_l2_swimlane_enabled(bool enable);
extern "C" bool is_l2_swimlane_enabled();

// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr).
// Published by the host before AICPU init runs; AICPU init fills the table
// with the per-core `&L2PerfAicoreBufferState::rotation` device addresses so
// AICore can index `aicore_ring_addr[block_idx]` to find its rotation channel.
// Moved from host into AICPU so the host stays decoupled from the AICore-side
// shared-memory layout (host previously did host-to-device address translation
// + reached into get_aicore_buffer_state to fill this).
extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr);
extern "C" uint64_t get_platform_aicore_rotation_table();

// Typed getter for the granular perf_level (promoted from the shared-memory
// header inside l2_perf_aicpu_init). Gate sites should use this so the
// comparison RHS is a named L2PerfLevel constant.
Expand Down
27 changes: 23 additions & 4 deletions src/a2a3/platform/onboard/aicore/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,28 @@
// linker dedup the otherwise-duplicate symbol definitions across the two
// compilation units.
[[block_local]] static uint32_t s_aicore_profiling_flag;
// Slot pointer (NOT the dereferenced rotation address) — see
// aicore_profiling_state.h for the lazy-deref contract.
[[block_local]] static __gm__ uint64_t *s_aicore_rotation_slot;
[[block_local]] static __gm__ AicoreRotation *s_aicore_rotation;

__attribute__((weak)) __aicore__ void set_aicore_profiling_flag(uint32_t flag) { s_aicore_profiling_flag = flag; }
__attribute__((weak)) __aicore__ uint32_t get_aicore_profiling_flag() { return s_aicore_profiling_flag; }

__attribute__((weak)) __aicore__ void set_aicore_rotation(__gm__ AicoreRotation *rotation) {
s_aicore_rotation = rotation;
__attribute__((weak)) __aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) {
s_aicore_rotation_slot = slot_ptr;
s_aicore_rotation = nullptr; // force lazy resolution on next get
}
__attribute__((weak)) __aicore__ __gm__ AicoreRotation *get_aicore_rotation() {
// Lazy first-call resolve: AICPU init populates `*s_aicore_rotation_slot`
// before dispatching the first task, so by the time the executor reaches
// for the rotation (inside the first-task branch of the dispatch poll)
// the slot holds a valid device address.
if (s_aicore_rotation == nullptr && s_aicore_rotation_slot != nullptr) {
s_aicore_rotation = reinterpret_cast<__gm__ AicoreRotation *>(*s_aicore_rotation_slot);
}
return s_aicore_rotation;
}
__attribute__((weak)) __aicore__ __gm__ AicoreRotation *get_aicore_rotation() { return s_aicore_rotation; }

extern __aicore__ void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type);

Expand Down Expand Up @@ -91,8 +104,14 @@ extern "C" __global__ __aicore__ void KERNEL_ENTRY(aicore_kernel)(__gm__ KernelA
// get_aicore_rotation() — never touches Handshake for profiling.
set_aicore_profiling_flag(k_args->enable_profiling_flag);
if (GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE)) {
// Stash only the slot pointer. The slot CONTENTS are written by
// AICPU's `l2_perf_aicpu_init` which runs concurrently with this
// entry; dereferencing here would race with AICPU's write. The
// executor defers the deref via `get_aicore_rotation()` until inside
// the first-task branch — by then AICPU has dispatched, so init is
// done and the slot is populated.
__gm__ uint64_t *rotation_table = reinterpret_cast<__gm__ uint64_t *>(k_args->aicore_ring_addr);
set_aicore_rotation(reinterpret_cast<__gm__ AicoreRotation *>(rotation_table[block_idx]));
set_aicore_rotation_slot(rotation_table != nullptr ? &rotation_table[block_idx] : nullptr);
}

aicore_execute(k_args->runtime_args, block_idx, core_type);
Expand Down
1 change: 1 addition & 0 deletions src/a2a3/platform/onboard/aicpu/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ extern "C" __attribute__((visibility("default"))) int simpler_aicpu_exec(void *a
set_platform_dump_base(k_args->dump_data_base);
set_dump_tensor_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR));
set_platform_l2_perf_base(k_args->l2_perf_data_base);
set_platform_aicore_rotation_table(k_args->aicore_ring_addr);
set_l2_swimlane_enabled(GET_PROFILING_FLAG(k_args->enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE));
set_platform_pmu_base(k_args->pmu_data_base);
set_platform_pmu_reg_addrs(k_args->pmu_reg_addrs);
Expand Down
25 changes: 20 additions & 5 deletions src/a2a3/platform/sim/aicore/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,17 @@
static pthread_key_t g_reg_base_key;
static pthread_key_t g_core_id_key;
static pthread_key_t g_aicore_profiling_flag_key;
// Slot pointer (NOT the dereferenced rotation address) — see
// aicore_profiling_state.h for the lazy-deref contract.
static pthread_key_t g_aicore_rotation_slot_key;
static pthread_key_t g_aicore_rotation_key;
static pthread_once_t g_tls_once = PTHREAD_ONCE_INIT;

static void create_tls_keys() {
pthread_key_create(&g_reg_base_key, nullptr);
pthread_key_create(&g_core_id_key, nullptr);
pthread_key_create(&g_aicore_profiling_flag_key, nullptr);
pthread_key_create(&g_aicore_rotation_slot_key, nullptr);
pthread_key_create(&g_aicore_rotation_key, nullptr);
}

Expand All @@ -61,11 +65,19 @@ __aicore__ uint32_t get_aicore_profiling_flag() {
return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(pthread_getspecific(g_aicore_profiling_flag_key)));
}

__aicore__ void set_aicore_rotation(__gm__ AicoreRotation *rotation) {
pthread_setspecific(g_aicore_rotation_key, reinterpret_cast<void *>(rotation));
__aicore__ void set_aicore_rotation_slot(__gm__ uint64_t *slot_ptr) {
pthread_setspecific(g_aicore_rotation_slot_key, reinterpret_cast<void *>(slot_ptr));
pthread_setspecific(g_aicore_rotation_key, nullptr); // force lazy resolve on next get
}
__aicore__ __gm__ AicoreRotation *get_aicore_rotation() {
return reinterpret_cast<__gm__ AicoreRotation *>(pthread_getspecific(g_aicore_rotation_key));
auto *cached = reinterpret_cast<__gm__ AicoreRotation *>(pthread_getspecific(g_aicore_rotation_key));
if (cached != nullptr) return cached;
auto *slot = reinterpret_cast<__gm__ uint64_t *>(pthread_getspecific(g_aicore_rotation_slot_key));
if (slot == nullptr) return nullptr;
// Lazy first-call resolve — see aicore_profiling_state.h.
cached = reinterpret_cast<__gm__ AicoreRotation *>(*slot);
pthread_setspecific(g_aicore_rotation_key, reinterpret_cast<void *>(cached));
return cached;
}

// Core identity setter function pointers — set by DeviceRunner after dlopen.
Expand Down Expand Up @@ -107,10 +119,13 @@ extern "C" void aicore_execute_wrapper(
// Publish per-core profiling state before the executor runs.
set_aicore_profiling_flag(enable_profiling_flag);
if (aicore_ring_addr != 0) {
// Stash only the slot pointer; deref happens lazily inside
// get_aicore_rotation() once AICPU has populated the table. See
// aicore_profiling_state.h.
uint64_t *rotation_table = reinterpret_cast<uint64_t *>(aicore_ring_addr);
set_aicore_rotation(reinterpret_cast<__gm__ AicoreRotation *>(rotation_table[block_idx]));
set_aicore_rotation_slot(reinterpret_cast<__gm__ uint64_t *>(&rotation_table[block_idx]));
} else {
set_aicore_rotation(nullptr);
set_aicore_rotation_slot(nullptr);
}

// Set core identity for pto-isa TPUSH/TPOP simulation.
Expand Down
9 changes: 9 additions & 0 deletions src/a2a3/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,13 @@ int DeviceRunner::ensure_binaries_loaded() {
return -1;
}

set_platform_aicore_rotation_table_func_ =
reinterpret_cast<void (*)(uint64_t)>(dlsym(aicpu_so_handle_, "set_platform_aicore_rotation_table"));
if (set_platform_aicore_rotation_table_func_ == nullptr) {
LOG_ERROR("dlsym failed for set_platform_aicore_rotation_table: %s", dlerror());
return -1;
}

set_l2_swimlane_enabled_func_ =
reinterpret_cast<void (*)(bool)>(dlsym(aicpu_so_handle_, "set_l2_swimlane_enabled"));
if (set_l2_swimlane_enabled_func_ == nullptr) {
Expand Down Expand Up @@ -652,6 +659,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
set_platform_dump_base_func_(kernel_args_.dump_data_base);
set_dump_tensor_enabled_func_(enable_dump_tensor_);
set_platform_l2_perf_base_func_(kernel_args_.l2_perf_data_base);
set_platform_aicore_rotation_table_func_(kernel_args_.aicore_ring_addr);
set_l2_swimlane_enabled_func_(enable_l2_swimlane_);
set_platform_pmu_base_func_(kernel_args_.pmu_data_base);
set_platform_pmu_reg_addrs_func_(kernel_args_.pmu_reg_addrs);
Expand Down Expand Up @@ -844,6 +852,7 @@ void DeviceRunner::unload_executor_binaries() {
set_platform_dump_base_func_ = nullptr;
set_dump_tensor_enabled_func_ = nullptr;
set_platform_l2_perf_base_func_ = nullptr;
set_platform_aicore_rotation_table_func_ = nullptr;
set_l2_swimlane_enabled_func_ = nullptr;
set_platform_pmu_base_func_ = nullptr;
set_platform_pmu_reg_addrs_func_ = nullptr;
Expand Down
1 change: 1 addition & 0 deletions src/a2a3/platform/sim/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ class DeviceRunner {
void (*set_platform_dump_base_func_)(uint64_t){nullptr};
void (*set_dump_tensor_enabled_func_)(bool){nullptr};
void (*set_platform_l2_perf_base_func_)(uint64_t){nullptr};
void (*set_platform_aicore_rotation_table_func_)(uint64_t){nullptr};
void (*set_l2_swimlane_enabled_func_)(bool){nullptr};
void (*set_platform_pmu_base_func_)(uint64_t){nullptr};
void (*set_platform_pmu_reg_addrs_func_)(uint64_t){nullptr};
Expand Down
23 changes: 23 additions & 0 deletions src/a2a3/platform/src/aicpu/l2_perf_collector_aicpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,19 @@ static uint64_t g_platform_l2_perf_base = 0;
static bool g_enable_l2_swimlane = false;
static L2PerfLevel g_l2_perf_level = L2PerfLevel::DISABLED;

// AICore rotation-table device pointer (= KernelArgs::aicore_ring_addr).
// Published by the host (sim: dlsym'd setter; onboard: from k_args via the
// kernel entry); AICPU init walks it to fill per-core &rotation addresses.
static uint64_t g_platform_aicore_rotation_table = 0;

extern "C" void set_platform_l2_perf_base(uint64_t l2_perf_data_base) { g_platform_l2_perf_base = l2_perf_data_base; }
extern "C" uint64_t get_platform_l2_perf_base() { return g_platform_l2_perf_base; }
extern "C" void set_l2_swimlane_enabled(bool enable) { g_enable_l2_swimlane = enable; }
extern "C" bool is_l2_swimlane_enabled() { return g_enable_l2_swimlane; }
extern "C" void set_platform_aicore_rotation_table(uint64_t table_addr) {
g_platform_aicore_rotation_table = table_addr;
}
extern "C" uint64_t get_platform_aicore_rotation_table() { return g_platform_aicore_rotation_table; }
L2PerfLevel get_l2_perf_level() { return g_l2_perf_level; }

/**
Expand Down Expand Up @@ -132,6 +141,16 @@ void l2_perf_aicpu_init(int worker_count) {
static_cast<uint32_t>(g_l2_perf_level)
);

// Populate the per-core AicoreRotation device-address table. AICore reads
// `aicore_ring_addr[block_idx]` from KernelArgs to find its rotation
// channel; the table itself is host-allocated, but the entries are
// device-internal addresses (`&ac_state->rotation`) that the host would
// otherwise have to translate from host-mapped to device-mapped. AICPU
// already runs on the device, so it can write the addresses directly
// without any translation — that keeps the host side decoupled from the
// AICore shared-memory layout.
uint64_t *rotation_table = reinterpret_cast<uint64_t *>(g_platform_aicore_rotation_table);
Comment thread
hw-native-sys-bot marked this conversation as resolved.

// Pop first buffer from free_queue for each core
for (int i = 0; i < worker_count; i++) {
L2PerfBufferState *state = get_perf_buffer_state(l2_perf_base, i);
Expand All @@ -140,6 +159,10 @@ void l2_perf_aicpu_init(int worker_count) {
s_perf_buffer_states[i] = state;
s_aicore_buffer_states[i] = ac_state;

if (rotation_table != nullptr) {
rotation_table[i] = reinterpret_cast<uint64_t>(&ac_state->rotation);
}

// Pop first buffer from free_queue
rmb();
uint32_t head = state->free_queue.head;
Expand Down
25 changes: 7 additions & 18 deletions src/a2a3/platform/src/host/l2_perf_collector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,13 @@ int L2PerfCollector::initialize(
PLATFORM_PROF_BUFFERS_PER_CORE, PLATFORM_AICORE_BUFFERS_PER_CORE
);

// Step 5c: Standalone uint64_t[num_aicore] table holding per-core
// AicoreRotation device addresses (= &ac_state->rotation). AICore reads
// rotation_table[block_idx] via KernelArgs::aicore_ring_addr and feeds it
// into the platform's set_aicore_rotation().
// Step 5c: Standalone uint64_t[num_aicore] table that will hold per-core
// AicoreRotation device addresses. Host only allocates the bytes and
// hands the device pointer to AICPU via KernelArgs::aicore_ring_addr;
// AICPU itself fills the entries inside `l2_perf_aicpu_init` (it has
// direct access to `&ac_state->rotation` device addresses, no
// host-to-device translation needed). AICore reads
// rotation_table[block_idx] at kernel entry.
{
size_t table_bytes = static_cast<size_t>(num_aicore) * sizeof(uint64_t);
void *rotation_table_host = nullptr;
Expand All @@ -252,20 +255,6 @@ int L2PerfCollector::initialize(
LOG_ERROR("Failed to allocate aicore_ring_addr (rotation) table (%zu bytes)", table_bytes);
return -1;
}
uint64_t *rotation_table = reinterpret_cast<uint64_t *>(rotation_table_host);

// Compute the per-core device address of &state->rotation. We have
// the host-mapped shm region; the device equivalent is at the same
// offset from perf_dev_ptr.
auto host_to_dev = [&](void *host_addr) -> uint64_t {
uintptr_t offset = reinterpret_cast<uintptr_t>(host_addr) - reinterpret_cast<uintptr_t>(perf_host_ptr);
return reinterpret_cast<uint64_t>(perf_dev_ptr) + offset;
};

for (int i = 0; i < num_aicore; i++) {
L2PerfAicoreBufferState *ac_state = get_aicore_buffer_state(perf_host_ptr, num_aicore, i);
rotation_table[i] = host_to_dev(&ac_state->rotation);
}
aicore_ring_addr_table_dev_ = rotation_table_dev;
}

Expand Down
9 changes: 8 additions & 1 deletion src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU);

// Per-core AicoreRotation channel; see tensormap_and_ringbuffer/.../aicore_executor.cpp.
__gm__ AicoreRotation *l2_perf_rotation = l2_perf_enabled ? get_aicore_rotation() : nullptr;
// Deferred until first task so AICPU's init has populated the rotation
// table (the dispatch itself proves init is done).
__gm__ AicoreRotation *l2_perf_rotation = nullptr;
AicoreLocalState l2_perf_local = {nullptr, 0, 0};

volatile uint32_t task_id = AICPU_IDLE_TASK_ID;
Expand All @@ -83,6 +85,11 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
uint32_t actual_task_id = task_id;
write_reg(RegId::COND, MAKE_ACK_VALUE(actual_task_id));

// First-task lazy resolve of the rotation channel.
if (l2_perf_enabled && l2_perf_rotation == nullptr) {
l2_perf_rotation = get_aicore_rotation();
}

__gm__ Task *task_ptr = &(runtime->tasks[actual_task_id]);
uint64_t start_time = get_sys_cnt_aicore();

Expand Down
Loading
Loading