Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 11 additions & 65 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,21 +310,8 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
return rc;
}

// Initialize runtime args
rc = kernel_args_.init_runtime_args(runtime, mem_alloc_);
if (rc != 0) {
LOG_ERROR("init_runtime_args failed: %d", rc);
return rc;
}

// Publish log config to AICPU via KernelArgs (severity floor + INFO verbosity).
// HostLogger is the single source of truth for log config (seeded by
// libsimpler_log.so via simpler_log_init before host_runtime.so was even
// dlopen'd). Read it directly when populating KernelArgs.
kernel_args_.args.log_level = static_cast<uint32_t>(HostLogger::get_instance().level());
kernel_args_.args.log_info_v = static_cast<uint32_t>(HostLogger::get_instance().info_v());
// Device ordinal for the AICPU executor's per-device orchestration-SO name.
kernel_args_.args.device_id = static_cast<uint32_t>(device_id_);
rc = init_runtime_args_with_metadata(runtime);
if (rc != 0) return rc;

rc = kernel_args_init_ffts_base_addr(kernel_args_);
if (rc != 0) {
Expand All @@ -339,27 +326,14 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
return rc;
}

// Start collector mgmt + poll threads now, just before kernels launch.
// Starting earlier wastes CPU on empty queues and risks tripping
// ProfilerBase's poll-loop idle-timeout if device-side init is slow.
auto thread_factory = [this](std::function<void()> fn) {
return create_thread(std::move(fn));
};
if (enable_l2_swimlane_) {
l2_perf_collector_.start(thread_factory);
}
if (enable_dump_tensor_) {
dump_collector_.start(thread_factory);
}
if (enable_pmu_) {
pmu_collector_.start(thread_factory);
}
start_shared_collectors_for_run();
// a2a3-only dep_gen collector — share the same thread_factory shape as base.
if (enable_dep_gen_) {
auto thread_factory = [this](std::function<void()> fn) {
return create_thread(std::move(fn));
};
dep_gen_collector_.start(thread_factory);
}
if (enable_scope_stats_) {
scope_stats_collector_.start(thread_factory);
}

LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::InitName);
rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, host::KernelNames::InitName, 1);
Expand Down Expand Up @@ -392,26 +366,9 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {

// Tear down collectors. stop() joins mgmt then collector in the only safe
// order (mgmt's final-drain pass into L2 has poll as its consumer).
// Diagnostic exports use the per-task `output_prefix_` directory the user
// set on CallConfig (CallConfig::validate() enforces non-empty upstream).
if (enable_l2_swimlane_) {
l2_perf_collector_.stop();
l2_perf_collector_.read_phase_header_metadata();
l2_perf_collector_.reconcile_counters();
l2_perf_collector_.export_swimlane_json();
}

if (enable_dump_tensor_) {
dump_collector_.stop();
dump_collector_.reconcile_counters();
dump_collector_.export_dump_files();
}

if (enable_pmu_) {
pmu_collector_.stop();
pmu_collector_.reconcile_counters();
}
teardown_shared_collectors_after_run();

// a2a3-only dep_gen teardown: stop + reconcile + replay emit.
if (enable_dep_gen_) {
dep_gen_collector_.stop();
if (dep_gen_collector_.reconcile_counters()) {
Expand All @@ -424,12 +381,6 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
}
}

if (enable_scope_stats_) {
scope_stats_collector_.stop();
scope_stats_collector_.reconcile_counters();
scope_stats_collector_.write_jsonl(output_prefix_);
}

// Print handshake results (reads from device memory, must be before free)
print_handshake_results();

Expand Down Expand Up @@ -458,14 +409,9 @@ int DeviceRunner::finalize() {
finalize_collectors();

// Shared cleanup body — streams, kernel_args, callable/orch maps,
// chip-callable buffer pool, the three arenas, device_wall, and
// mem_alloc_.finalize(). Device-wall free order is normalized to
// "before mem_alloc_.finalize" inside finalize_common(); the a2a3-
// specific cached arena sizes still need clearing here.
// chip-callable buffer pool, the three arenas, device_wall,
// mem_alloc_.finalize(), and cached arena sizes.
rc = finalize_common();
cached_gm_heap_size_ = 0;
cached_gm_sm_size_ = 0;
cached_runtime_arena_size_ = 0;

// Reset device AFTER all device memory is freed. Two paths:
//
Expand Down
72 changes: 6 additions & 66 deletions src/a5/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,39 +153,10 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
LOG_ERROR("prepare_orch_so failed: %d", rc);
return rc;
}
rc = kernel_args_.init_runtime_args(runtime, mem_alloc_);
if (rc != 0) {
LOG_ERROR("init_runtime_args failed: %d", rc);
return rc;
}
rc = init_runtime_args_with_metadata(runtime);
if (rc != 0) return rc;

// Publish log config to AICPU via KernelArgs (severity floor + INFO verbosity).
// HostLogger is the single source of truth for log config (seeded by
// libsimpler_log.so via simpler_log_init before host_runtime.so was even
// dlopen'd). Read it directly when populating KernelArgs.
kernel_args_.args.log_level = static_cast<uint32_t>(HostLogger::get_instance().level());
kernel_args_.args.log_info_v = static_cast<uint32_t>(HostLogger::get_instance().info_v());
// Device ordinal for the AICPU executor's per-device orchestration-SO name.
kernel_args_.args.device_id = static_cast<uint32_t>(device_id_);

// Start collector mgmt + poll threads now, just before kernels launch.
// Starting earlier wastes CPU on empty queues and risks tripping
// ProfilerBase's poll-loop idle-timeout if device-side init is slow.
auto thread_factory = [this](std::function<void()> fn) {
return create_thread(std::move(fn));
};
if (enable_l2_swimlane_) {
l2_perf_collector_.start(thread_factory);
}
if (enable_dump_tensor_) {
dump_collector_.start(thread_factory);
}
if (enable_pmu_) {
pmu_collector_.start(thread_factory);
}
if (enable_scope_stats_) {
scope_stats_collector_.start(thread_factory);
}
start_shared_collectors_for_run();

LOG_INFO_V0("=== launch_aicpu_kernel %s ===", host::KernelNames::InitName);
rc = launch_aicpu_kernel(stream_aicpu_, &kernel_args_.args, host::KernelNames::InitName, 1);
Expand Down Expand Up @@ -220,34 +191,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {

read_device_wall_ns();

// Tear down collectors. stop() joins mgmt then collector in the only
// safe order (mgmt's final-drain pass into L2 has poll as its
// consumer). Diagnostic exports use the per-task `output_prefix_`
// directory the user set on CallConfig (validate() enforces non-empty
// upstream).
if (enable_l2_swimlane_) {
l2_perf_collector_.stop();
l2_perf_collector_.read_phase_header_metadata();
l2_perf_collector_.reconcile_counters();
l2_perf_collector_.export_swimlane_json();
}

if (enable_dump_tensor_) {
dump_collector_.stop();
dump_collector_.reconcile_counters();
dump_collector_.export_dump_files();
}

if (enable_pmu_) {
pmu_collector_.stop();
pmu_collector_.reconcile_counters();
}

if (enable_scope_stats_) {
scope_stats_collector_.stop();
scope_stats_collector_.reconcile_counters();
scope_stats_collector_.write_jsonl(output_prefix_);
}
teardown_shared_collectors_after_run();

// Print handshake results (reads from device memory, must be before free)
print_handshake_results();
Expand Down Expand Up @@ -290,13 +234,9 @@ int DeviceRunner::finalize() {
}

// Shared cleanup body — streams, kernel_args, callable/orch maps,
// chip-callable buffer pool, the three arenas, device_wall, and
// mem_alloc_.finalize(). The a5-specific cached arena sizes still need
// clearing here.
// chip-callable buffer pool, the three arenas, device_wall,
// mem_alloc_.finalize(), and cached arena sizes.
rc = finalize_common();
cached_gm_heap_size_ = 0;
cached_gm_sm_size_ = 0;
cached_runtime_arena_size_ = 0;

int reset_rc = rtDeviceReset(device_id_);
if (reset_rc != 0) {
Expand Down
71 changes: 71 additions & 0 deletions src/common/platform/onboard/host/device_runner_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -730,6 +730,9 @@ int DeviceRunnerBase::finalize_common() {
block_dim_ = 0;
worker_count_ = 0;
aicore_kernel_binary_.clear();
cached_gm_heap_size_ = 0;
cached_gm_sm_size_ = 0;
cached_runtime_arena_size_ = 0;
return rc;
}

Expand Down Expand Up @@ -910,3 +913,71 @@ void DeviceRunnerBase::read_device_wall_ns() {
}
}
}

int DeviceRunnerBase::init_runtime_args_with_metadata(Runtime &runtime) {
    // Hand the Runtime to KernelArgs first. The metadata fields below are only
    // filled in once this step has reported success.
    const int init_status = kernel_args_.init_runtime_args(runtime, mem_alloc_);
    if (init_status != 0) {
        LOG_ERROR("init_runtime_args failed: %d", init_status);
        return init_status;
    }

    auto &args = kernel_args_.args;

    // Log configuration for the AICPU side: severity floor plus INFO verbosity.
    // HostLogger is the single source of truth for these values (seeded by
    // libsimpler_log.so via simpler_log_init before host_runtime.so was even
    // dlopen'd), so they are read straight from it.
    auto &&host_logger = HostLogger::get_instance();
    args.log_level = static_cast<uint32_t>(host_logger.level());
    args.log_info_v = static_cast<uint32_t>(host_logger.info_v());

    // Device ordinal, used by the AICPU executor for the per-device
    // orchestration-SO name.
    args.device_id = static_cast<uint32_t>(device_id_);

    return 0;
}

void DeviceRunnerBase::start_shared_collectors_for_run() {
    // Collector mgmt + poll threads are brought up here, right before the
    // kernels launch. Doing it any earlier wastes CPU on empty queues and
    // risks tripping ProfilerBase's poll-loop idle-timeout if device-side
    // init is slow.

    // Every collector spawns its threads through create_thread().
    auto spawn_thread = [this](std::function<void()> body) {
        return create_thread(std::move(body));
    };

    // Start order: l2_perf, dump, pmu, scope_stats. Each start is gated on
    // its own enable flag.
    if (enable_l2_swimlane_) l2_perf_collector_.start(spawn_thread);
    if (enable_dump_tensor_) dump_collector_.start(spawn_thread);
    if (enable_pmu_) pmu_collector_.start(spawn_thread);
    if (enable_scope_stats_) scope_stats_collector_.start(spawn_thread);
}

void DeviceRunnerBase::teardown_shared_collectors_after_run() {
    // Collector teardown. stop() joins mgmt then collector in the only safe
    // order (mgmt's final-drain pass into L2 has poll as its consumer).
    // Diagnostic exports go to the per-task `output_prefix_` directory the
    // user set on CallConfig (CallConfig::validate() enforces non-empty
    // upstream).

    // L2 swimlane: stop, read phase-header metadata, reconcile, export JSON.
    if (enable_l2_swimlane_) {
        auto &l2_perf = l2_perf_collector_;
        l2_perf.stop();
        l2_perf.read_phase_header_metadata();
        l2_perf.reconcile_counters();
        l2_perf.export_swimlane_json();
    }

    // Tensor dump: stop, reconcile, export dump files.
    if (enable_dump_tensor_) {
        auto &dump = dump_collector_;
        dump.stop();
        dump.reconcile_counters();
        dump.export_dump_files();
    }

    // PMU: stop and reconcile only; no export call here.
    if (enable_pmu_) {
        auto &pmu = pmu_collector_;
        pmu.stop();
        pmu.reconcile_counters();
    }

    // Scope stats: stop, reconcile, write JSONL using output_prefix_.
    if (enable_scope_stats_) {
        auto &scope_stats = scope_stats_collector_;
        scope_stats.stop();
        scope_stats.reconcile_counters();
        scope_stats.write_jsonl(output_prefix_);
    }
}
42 changes: 42 additions & 0 deletions src/common/platform/onboard/host/device_runner_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,48 @@ class DeviceRunnerBase {
*/
void read_device_wall_ns();

/**
 * H2D the Runtime struct via `kernel_args_.init_runtime_args` and
 * publish log config + device ordinal into KernelArgs. AICPU reads
 * these at launch — log_level / log_info_v are sourced from
 * `HostLogger::get_instance()` (the single source of truth seeded
 * by `simpler_log_init` before host_runtime.so loaded); device_id
 * is the per-device suffix the AICPU executor uses for the
 * per-device orchestration-SO name.
 *
 * The log config and device ordinal are written only after
 * `init_runtime_args` has returned 0; on failure the error is logged
 * and those KernelArgs fields are left untouched by this call.
 *
 * @param runtime Runtime forwarded to `kernel_args_.init_runtime_args`
 *                together with `mem_alloc_`.
 * @return 0 on success, the underlying init_runtime_args rc on failure.
 */
int init_runtime_args_with_metadata(Runtime &runtime);

/**
 * Start collector mgmt + poll threads for the four shared
 * diagnostics collectors (`l2_perf_collector_`, `dump_collector_`,
 * `pmu_collector_`, `scope_stats_collector_`) that are enabled.
 * Each `start()` is gated on the corresponding `enable_*_` flag;
 * disabled collectors are not started. Enabled collectors are
 * started in the order listed above.
 *
 * Each spawned thread is bound to `device_id_` via `create_thread`.
 *
 * Subclasses with arch-specific collectors (a2a3's
 * `dep_gen_collector_`) call this helper and then start their own.
 */
void start_shared_collectors_for_run();

/**
 * Tear down the four shared diagnostics collectors after the launched
 * kernels have synced. Each block is gated on the corresponding
 * `enable_*_` flag and runs, in this order:
 *   - `l2_perf`: stop() → read_phase_header_metadata() →
 *     reconcile_counters() → export_swimlane_json()
 *   - `dump`: stop() → reconcile_counters() → export_dump_files()
 *   - `pmu`: stop() → reconcile_counters() (no export step)
 *   - `scope_stats`: stop() → reconcile_counters() →
 *     write_jsonl(output_prefix_)
 *
 * Note that for `l2_perf` the phase-header metadata is read before
 * counters are reconciled, not after.
 *
 * Subclasses with arch-specific collectors (a2a3's
 * `dep_gen_collector_` + its `dep_gen_replay_emit_deps_json` export)
 * inline their own teardown after calling this helper.
 */
void teardown_shared_collectors_after_run();

/**
* Shared body of `finalize()`. Each arch subclass's `finalize()`
* handles: (a) the early-return + thread attach prologue, (b) any
Expand Down
Loading