Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 4 additions & 96 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,55 +114,9 @@ int kernel_args_init_ffts_base_addr(KernelArgsHelper &helper) {

DeviceRunner::~DeviceRunner() { finalize(); }

int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
// Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
// runtime arena. Split out from a single large allocation because the
// combined size can exceed the device allocator's largest contiguous
// block. Each arena commits exactly one region, so its base() is the
// pooled pointer the caller wants.
//
// Idempotent for the production case (sizes do not change across a
// worker's lifetime). If a caller asks for a larger layout on any
// region, redo just that region — already-committed peers stay alive
// so their callers don't have to re-acquire.
auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
if (requested_size == 0) {
// hbg's runtime_arena path: caller passed 0 and never reserved
// a region. Leave the arena uncommitted; acquire_pooled_* will
// return nullptr.
if (arena.is_committed() && cached_size != 0) {
arena.release();
cached_size = 0;
}
return 0;
}
if (arena.is_committed() && requested_size <= cached_size) {
return 0;
}
arena.release();
cached_size = 0;
arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
// commit() failure leaves committed_=false, so the next entry's
// is_committed() guard skips the release branch. release() is
// idempotent on a never-committed arena (zeroes cursor_).
arena.release();
return -1;
}
cached_size = requested_size;
return 0;
};
// Failure of a later region leaves earlier peers committed on purpose:
// pooled pointers previously returned to callers must stay valid even if
// this resize attempt aborts.
if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1;
if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1;
return 0;
}

// `create_thread`, `attach_current_thread`, `configure_aicore_op_timeout`,
// and `ensure_device_initialized` live on `DeviceRunnerBase` — see
// `setup_static_arena`, `create_thread`, `attach_current_thread`,
// `configure_aicore_op_timeout`, and `ensure_device_initialized` live on
// `DeviceRunnerBase` — see
// `src/common/platform/onboard/host/device_runner_base.cpp`.

int DeviceRunner::ensure_acl_ready(int device_id) {
Expand Down Expand Up @@ -668,53 +622,7 @@ int DeviceRunner::finalize() {
return rc;
}

// `launch_aicpu_kernel` lives on `DeviceRunnerBase`.

int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) {
    // The AICore binary is registered lazily on the first launch and the
    // resulting handle is reused afterwards. CANN has no public
    // rtUnregisterAllKernel, so re-registering every run would pin another
    // device-side copy of the ELF and quickly exhaust HBM — surfaced on a5
    // onboard CI as 207001 at rtKernelLaunchWithHandleV2 with a 507899
    // cascade at rtStreamCreate.
    //
    // Returns -1 when no AICore binary is available, otherwise the runtime
    // status of the registration or launch call.
    const bool needs_registration = (aicore_bin_handle_ == nullptr);
    if (needs_registration) {
        if (aicore_kernel_binary_.empty()) {
            LOG_ERROR("AICore kernel binary is empty");
            return -1;
        }

        rtDevBinary_t elf_desc;
        std::memset(&elf_desc, 0, sizeof(elf_desc));
        elf_desc.magic = RT_DEV_BINARY_MAGIC_ELF;
        elf_desc.version = 0;
        elf_desc.data = aicore_kernel_binary_.data();
        elf_desc.length = aicore_kernel_binary_.size();

        const int register_status = rtRegisterAllKernel(&elf_desc, &aicore_bin_handle_);
        if (register_status != RT_ERROR_NONE) {
            LOG_ERROR("rtRegisterAllKernel failed: %d", register_status);
            // Reset the handle so the next call retries the registration.
            aicore_bin_handle_ = nullptr;
            return register_status;
        }
    }

    // The kernel receives a single argument: the KernelArgs pointer.
    struct LaunchPayload {
        KernelArgs *k_args;
    };
    LaunchPayload payload = {k_args};

    rtArgsEx_t launch_args;
    std::memset(&launch_args, 0, sizeof(launch_args));
    launch_args.args = &payload;
    launch_args.argsSize = sizeof(payload);

    rtTaskCfgInfo_t task_cfg = {};
    task_cfg.schemMode = RT_SCHEM_MODE_BATCH;

    const int launch_status =
        rtKernelLaunchWithHandleV2(aicore_bin_handle_, 0, block_dim_, &launch_args, nullptr, stream, &task_cfg);
    if (launch_status != RT_ERROR_NONE) {
        LOG_ERROR("rtKernelLaunchWithHandleV2 failed: %d", launch_status);
    }
    return launch_status;
}
// `launch_aicpu_kernel` and `launch_aicore_kernel` live on `DeviceRunnerBase`.

int DeviceRunner::init_l2_perf(int num_aicore, int device_id) {
auto alloc_cb = [this](size_t size) -> void * {
Expand Down
50 changes: 10 additions & 40 deletions src/a2a3/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,22 +83,13 @@ class DeviceRunner : public DeviceRunnerBase {
DeviceRunner() = default;
~DeviceRunner();

/**
* Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
* memory, trb prebuilt runtime arena) as three independent device
* allocations. Must be called before any acquire_pooled_*. Idempotent
* on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
* prebuilt runtime arena) — the corresponding arena stays uncommitted.
* Returns 0 on success, -1 on failure.
*
* `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device`,
* `acquire_pooled_{gm_heap,gm_sm,runtime_arena}`, `create_thread`,
* `attach_current_thread`, `ensure_device_initialized`,
* `print_handshake_results`, `set_executors`, `set_dispatcher_binary`,
* `device_id`, and `last_device_wall_ns` are inherited from
* `DeviceRunnerBase`.
*/
int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
// `setup_static_arena`, `allocate_tensor`, `free_tensor`,
// `copy_to_device`, `copy_from_device`,
// `acquire_pooled_{gm_heap,gm_sm,runtime_arena}`, `create_thread`,
// `attach_current_thread`, `ensure_device_initialized`,
// `print_handshake_results`, `set_executors`, `set_dispatcher_binary`,
// `device_id`, `last_device_wall_ns`, `launch_aicpu_kernel`, and
// `launch_aicore_kernel` are inherited from `DeviceRunnerBase`.

/**
* Execute a runtime
Expand Down Expand Up @@ -143,20 +134,6 @@ class DeviceRunner : public DeviceRunnerBase {
*/
int finalize();

// `launch_aicpu_kernel` lives on `DeviceRunnerBase`.

/**
* Launch an AICore kernel
*
* Internal method used by run(). Can be called directly for custom
* workflows.
*
* @param stream AICore stream
* @param k_args Pointer to kernel arguments (includes runtime, ffts_base_addr, etc.)
* @return 0 on success, error code on failure
*/
int launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args);

// `upload_chip_callable_buffer` is inherited from `DeviceRunnerBase`.

/**
Expand Down Expand Up @@ -203,16 +180,9 @@ class DeviceRunner : public DeviceRunnerBase {
private:
// Most lifecycle state (device_id_, block_dim_, cores_per_blockdim_,
// worker_count_, executor + dispatcher bytes, aicore_bin_handle_,
// load_aicpu_op_, mem_alloc_, the three DeviceArenas, persistent
// AICPU/AICore streams, kernel_args_, device_wall_*, device_args_,
// binaries_loaded_) is inherited from `DeviceRunnerBase`.
//
// Arena cached sizes for setup_static_arena's "fits" check — avoids
// re-allocating the same buffer when a later worker init asks for an
// equal-or-smaller layout on an already-committed arena.
size_t cached_gm_heap_size_{0};
size_t cached_gm_sm_size_{0};
size_t cached_runtime_arena_size_{0};
// load_aicpu_op_, mem_alloc_, the three DeviceArenas + their cached
// sizes, persistent AICPU/AICore streams, kernel_args_, device_wall_*,
// device_args_, binaries_loaded_) is inherited from `DeviceRunnerBase`.

// Group D state (`chip_callable_buffers_`, `callables_`,
// `orch_so_dedup_`, `aicpu_seen_callable_ids_`, `aicpu_dlopen_total_`,
Expand Down
110 changes: 5 additions & 105 deletions src/a5/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,63 +56,10 @@ static int prof_free_cb(void *dev_ptr) { return rtFree(dev_ptr); }

DeviceRunner::~DeviceRunner() { finalize(); }

int DeviceRunner::setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) {
    // Three independent device_malloc'd buffers: GM heap, PTO2 SM, prebuilt
    // runtime arena. Split out from a single large allocation because the
    // combined size can exceed the device allocator's largest contiguous
    // block. Each arena commits exactly one region, so its base() is the
    // pooled pointer the caller wants.
    //
    // Idempotent for the production case (sizes do not change across a
    // worker's lifetime). If a caller asks for a larger layout on any
    // region, redo just that region — already-committed peers stay alive
    // so their callers don't have to re-acquire.
    //
    // Returns 0 when every region is in the requested state, -1 as soon as
    // one region fails to commit.
    auto commit_region = [](DeviceArena &arena, size_t &cached_size, size_t requested_size) -> int {
        if (requested_size == 0) {
            // hbg's runtime_arena path: caller passed 0 and never reserved
            // a region. Leave the arena uncommitted; acquire_pooled_* will
            // return nullptr.
            if (arena.is_committed() && cached_size != 0) {
                arena.release();
                cached_size = 0;
            }
            return 0;
        }
        // An existing commitment that already covers the request is reused.
        if (arena.is_committed() && requested_size <= cached_size) {
            return 0;
        }
        arena.release();
        cached_size = 0;
        arena.reserve(requested_size, DeviceArena::kDefaultBaseAlign);
        if (arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
            // commit() failure leaves committed_=false, so the next entry's
            // is_committed() guard skips the release branch. release() is
            // idempotent on a never-committed arena (zeroes cursor_).
            arena.release();
            return -1;
        }
        cached_size = requested_size;
        return 0;
    };
    // Failure of a later region leaves earlier peers committed on purpose:
    // a peer may have been committed by an earlier successful call, and the
    // pooled pointers handed out from it must stay valid even if this resize
    // attempt aborts. Releasing peers here would leave those callers holding
    // pointers into released arenas. This matches the a2a3 implementation.
    if (commit_region(gm_heap_arena_, cached_gm_heap_size_, gm_heap_size) != 0) return -1;
    if (commit_region(gm_sm_arena_, cached_gm_sm_size_, gm_sm_size) != 0) return -1;
    if (commit_region(runtime_arena_pool_, cached_runtime_arena_size_, runtime_arena_size) != 0) return -1;
    return 0;
}

// `create_thread`, `attach_current_thread`, `configure_aicore_op_timeout`,
// `ensure_device_initialized`, `ensure_binaries_loaded`, `query_max_block_dim`,
// and `validate_block_dim` live on `DeviceRunnerBase` — see
// `setup_static_arena`, `create_thread`, `attach_current_thread`,
// `configure_aicore_op_timeout`, `ensure_device_initialized`,
// `ensure_binaries_loaded`, `query_max_block_dim`, and `validate_block_dim`
// live on `DeviceRunnerBase` — see
// `src/common/platform/onboard/host/device_runner_base.cpp`.

int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
Expand Down Expand Up @@ -468,54 +415,7 @@ int DeviceRunner::finalize() {
return rc;
}

// `launch_aicpu_kernel` lives on `DeviceRunnerBase`.

int DeviceRunner::launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args) {
    // The AICore binary is registered lazily on the first launch and the
    // resulting handle is reused afterwards. CANN has no public
    // rtUnregisterAllKernel, so re-registering every run would pin another
    // device-side copy of the ELF (~365KB on a5) and quickly exhaust HBM —
    // surfaced in CI as 207001 at rtKernelLaunchWithHandleV2 with a 507899
    // cascade at rtStreamCreate.
    //
    // Returns -1 when no AICore binary is available, otherwise the runtime
    // status of the registration or launch call.
    const bool needs_registration = (aicore_bin_handle_ == nullptr);
    if (needs_registration) {
        if (aicore_kernel_binary_.empty()) {
            LOG_ERROR("AICore kernel binary is empty");
            return -1;
        }

        rtDevBinary_t elf_desc;
        std::memset(&elf_desc, 0, sizeof(elf_desc));
        elf_desc.magic = RT_DEV_BINARY_MAGIC_ELF;
        elf_desc.version = 0;
        elf_desc.data = aicore_kernel_binary_.data();
        elf_desc.length = aicore_kernel_binary_.size();

        const int register_status = rtRegisterAllKernel(&elf_desc, &aicore_bin_handle_);
        if (register_status != RT_ERROR_NONE) {
            LOG_ERROR("rtRegisterAllKernel failed: %d", register_status);
            // Reset the handle so the next call retries the registration.
            aicore_bin_handle_ = nullptr;
            return register_status;
        }
    }

    // Pass device address of KernelArgs to AICore (KERNEL_ENTRY signature).
    struct LaunchPayload {
        KernelArgs *k_args;
    };
    LaunchPayload payload = {k_args};

    rtArgsEx_t launch_args;
    std::memset(&launch_args, 0, sizeof(launch_args));
    launch_args.args = &payload;
    launch_args.argsSize = sizeof(payload);

    rtTaskCfgInfo_t task_cfg = {};
    task_cfg.schemMode = RT_SCHEM_MODE_BATCH;

    const int launch_status =
        rtKernelLaunchWithHandleV2(aicore_bin_handle_, 0, block_dim_, &launch_args, nullptr, stream, &task_cfg);
    if (launch_status != RT_ERROR_NONE) {
        LOG_ERROR("rtKernelLaunchWithHandleV2 failed: %d", launch_status);
    }
    return launch_status;
}
// `launch_aicpu_kernel` and `launch_aicore_kernel` live on `DeviceRunnerBase`.

void DeviceRunner::finalize_collectors() {
if (l2_perf_collector_.is_initialized()) {
Expand Down
52 changes: 10 additions & 42 deletions src/a5/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,22 +75,13 @@ class DeviceRunner : public DeviceRunnerBase {
DeviceRunner() = default;
~DeviceRunner();

/**
* Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
* memory, trb prebuilt runtime arena) as three independent device
* allocations. Must be called before any acquire_pooled_*. Idempotent
* on identical sizes. `runtime_arena_size` is 0 for the hbg path (no
* prebuilt runtime arena) — the corresponding arena stays uncommitted.
* Returns 0 on success, -1 on failure.
*
* `allocate_tensor`, `free_tensor`, `copy_to_device`, `copy_from_device`,
* `acquire_pooled_{gm_heap,gm_sm,runtime_arena}`, `create_thread`,
* `attach_current_thread`, `ensure_device_initialized`,
* `print_handshake_results`, `set_executors`, `set_dispatcher_binary`,
* `device_id`, and `last_device_wall_ns` are inherited from
* `DeviceRunnerBase`.
*/
int setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
// `setup_static_arena`, `allocate_tensor`, `free_tensor`,
// `copy_to_device`, `copy_from_device`,
// `acquire_pooled_{gm_heap,gm_sm,runtime_arena}`, `create_thread`,
// `attach_current_thread`, `ensure_device_initialized`,
// `print_handshake_results`, `set_executors`, `set_dispatcher_binary`,
// `device_id`, `last_device_wall_ns`, `launch_aicpu_kernel`, and
// `launch_aicore_kernel` are inherited from `DeviceRunnerBase`.

/**
* Execute a runtime
Expand Down Expand Up @@ -132,22 +123,6 @@ class DeviceRunner : public DeviceRunnerBase {
*/
int finalize();

// `launch_aicpu_kernel` lives on `DeviceRunnerBase`.

/**
* Launch an AICore kernel
*
* Internal method used by run(). Can be called directly for custom
* workflows. Receives the device-resident KernelArgs pointer, which the
* AICore KERNEL_ENTRY uses to forward profiling state into platform
* slots before calling aicore_execute(runtime_args, ...).
*
* @param stream AICore stream
* @param k_args Device pointer to the populated KernelArgs
* @return 0 on success, error code on failure
*/
int launch_aicore_kernel(rtStream_t stream, KernelArgs *k_args);

// `upload_chip_callable_buffer`, `register_callable`,
// `register_callable_host_orch`, `unregister_callable`, `has_callable`,
// `bind_callable_to_runtime`, `aicpu_dlopen_count`, and
Expand All @@ -156,16 +131,9 @@ class DeviceRunner : public DeviceRunnerBase {
private:
// Most lifecycle state (device_id_, block_dim_, cores_per_blockdim_,
// worker_count_, executor + dispatcher bytes, aicore_bin_handle_,
// load_aicpu_op_, mem_alloc_, the three DeviceArenas, persistent
// AICPU/AICore streams, kernel_args_, device_wall_*, device_args_,
// binaries_loaded_) is inherited from `DeviceRunnerBase`.
//
// Arena cached sizes for setup_static_arena's "fits" check — avoids
// re-allocating a buffer when a later worker init asks for an
// equal-or-smaller layout.
size_t cached_gm_heap_size_{0};
size_t cached_gm_sm_size_{0};
size_t cached_runtime_arena_size_{0};
// load_aicpu_op_, mem_alloc_, the three DeviceArenas + their cached
// sizes, persistent AICPU/AICore streams, kernel_args_, device_wall_*,
// device_args_, binaries_loaded_) is inherited from `DeviceRunnerBase`.

// Group D state (`chip_callable_buffers_`, `callables_`,
// `orch_so_dedup_`, `aicpu_seen_callable_ids_`, `aicpu_dlopen_total_`,
Expand Down
Loading
Loading