Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 31 additions & 22 deletions src/a2a3/platform/onboard/host/host_regs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
#include "common/platform_config.h"
#include "runtime/rt.h"
#include "ascend_hal.h" // CANN HAL API definitions (MODULE_TYPE_AICORE, INFO_TYPE_OCCUPY, etc.)
#include <chrono>
#include <dlfcn.h>
#include <iostream>
#include <thread>

static int kind_to_addr_type(AicoreRegKind kind) {
switch (kind) {
Expand Down Expand Up @@ -103,10 +105,29 @@ get_aicore_reg_info(std::vector<int64_t> &aic, std::vector<int64_t> &aiv, const
in_map_para.devid = device_id;
in_map_para.addr_type = addr_type;

auto ret = halFunc(
0, reinterpret_cast<void *>(&in_map_para), sizeof(struct AddrMapInPara),
reinterpret_cast<void *>(&out_map_para), nullptr
);
// Retry rc=13 (EACCES): concurrent chip_process bring-up across paired dies
// can lose a narrow driver-side serialization window for halMemCtl. The
// failure consistently lands on dev=11 (last die of last chip in the
// 8-11 range); a short backoff lets the prior holder release before the
// next attempt. Other return codes are not retried — they indicate a
// permanent failure mode (missing capability, invalid devid, etc.).
constexpr int kHalMemCtlEacces = 13;
constexpr int kHalMemCtlMaxRetries = 3;
constexpr int kHalMemCtlRetryDelayMs = 50;
int ret = 0;
for (int attempt = 0; attempt <= kHalMemCtlMaxRetries; ++attempt) {
ret = halFunc(
0, reinterpret_cast<void *>(&in_map_para), sizeof(struct AddrMapInPara),
reinterpret_cast<void *>(&out_map_para), nullptr
);
if (ret != kHalMemCtlEacces) break;
if (attempt == kHalMemCtlMaxRetries) break;
LOG_WARN(
"halMemCtl rc=13 (EACCES) on devid=%lld attempt %d/%d, retrying after %d ms", (long long)device_id,
attempt + 1, kHalMemCtlMaxRetries, kHalMemCtlRetryDelayMs
);
std::this_thread::sleep_for(std::chrono::milliseconds(kHalMemCtlRetryDelayMs));
}

if (ret != 0) {
LOG_ERROR("halMemCtl failed with rc=%d", ret);
Expand Down Expand Up @@ -135,31 +156,19 @@ get_aicore_reg_info(std::vector<int64_t> &aic, std::vector<int64_t> &aiv, const

/**
* Get one flat AIC-then-AIV address array for the requested register kind.
* For Ctrl kind, falls back to placeholder addresses on HAL failure to
* preserve historical behavior on hardware where halMemCtl rejects
* ADDR_MAP_TYPE_REG_AIC_CTRL queries (the dispatch path does not actually
* dereference these addresses). For Pmu kind, propagates the HAL error so
* the caller can disable PMU collection cleanly.
* Propagates HAL failure unconditionally: the AICPU init/deinit handshake
* dereferences these addresses via write_reg/read_reg (FAST_PATH_ENABLE,
* DATA_MAIN_BASE, COND), so any placeholder fill would deadlock the next
* task on a stream-sync timeout instead of failing the prepare cleanly.
*/
static int get_aicore_regs(std::vector<int64_t> &regs, uint64_t device_id, AicoreRegKind kind) {
std::vector<int64_t> aic;
std::vector<int64_t> aiv;

int rc = get_aicore_reg_info(aic, aiv, kind_to_addr_type(kind), device_id);
if (rc != 0) {
if (kind == AicoreRegKind::Ctrl) {
LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d, using placeholder addresses", kind_to_name(kind), rc);
aic.clear();
aiv.clear();
for (uint32_t i = 0; i < DAV_2201::PLATFORM_MAX_PHYSICAL_CORES; i++) {
aic.push_back(0xDEADBEEF00000000ULL + (i * 0x800000));
aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x100000);
aiv.push_back(0xDEADBEEF00000000ULL + (i * 0x800000) + 0x200000);
}
} else {
LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d", kind_to_name(kind), rc);
return rc;
}
LOG_ERROR("get_aicore_regs(%s): halMemCtl failed: %d", kind_to_name(kind), rc);
return rc;
}

// AIC cores first, then AIV cores
Expand Down
38 changes: 13 additions & 25 deletions src/a5/platform/onboard/host/host_regs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,30 +85,19 @@ static int get_aicore_reg_info(std::vector<int64_t> &regs, int64_t device_id) {
return 0;
}

static void get_aicore_regs(std::vector<int64_t> &regs, uint64_t device_id) {
/**
* Propagates HAL failure: the AICPU init/deinit handshake dereferences these
* addresses via write_reg/read_reg, so any placeholder fill would deadlock
* the next task on a stream-sync timeout instead of failing prepare cleanly.
*/
static int get_aicore_regs(std::vector<int64_t> &regs, uint64_t device_id) {
int rt = get_aicore_reg_info(regs, device_id);

if (rt != 0) {
LOG_ERROR("get_aicore_reg_info failed, using placeholder addresses");
// Fallback: generate placeholder addresses
constexpr size_t MAX_INDEX = DAV_3510::PLATFORM_MAX_PHYSICAL_CORES * PLATFORM_CORES_PER_BLOCKDIM;
regs.clear();
regs.resize(MAX_INDEX);

for (uint32_t core_idx = 0; core_idx < DAV_3510::PLATFORM_MAX_PHYSICAL_CORES; core_idx++) {
uint32_t die_idx = core_idx / PLATFORM_AICORE_PER_DIE;
uint32_t local_idx = core_idx % PLATFORM_AICORE_PER_DIE;
uint32_t die_base = die_idx * (PLATFORM_AICORE_PER_DIE * PLATFORM_CORES_PER_BLOCKDIM);

uint64_t base_addr = 0xDEADBEEF00000000ULL + (core_idx * 0x800000);

regs[die_base + local_idx] = base_addr;
regs[die_base + PLATFORM_AICORE_PER_DIE + local_idx * 2] = base_addr + REG_AIV_FIRST_OFFSET;
regs[die_base + PLATFORM_AICORE_PER_DIE + local_idx * 2 + 1] = base_addr + REG_AIV_SECOND_OFFSET;
}
LOG_ERROR("get_aicore_reg_info failed: %d", rt);
return rt;
}

LOG_INFO_V0("get_aicore_regs: Retrieved %zu register addresses", regs.size());
return 0;
}

int init_aicore_register_addresses(uint64_t *runtime_regs_ptr, uint64_t device_id, MemoryAllocator &allocator) {
Expand All @@ -121,11 +110,10 @@ int init_aicore_register_addresses(uint64_t *runtime_regs_ptr, uint64_t device_i

// Step 1: Get register addresses from HAL
std::vector<int64_t> host_regs;
get_aicore_regs(host_regs, device_id);

if (host_regs.empty()) {
LOG_ERROR("Failed to get AICore register addresses");
return -1;
int rc = get_aicore_regs(host_regs, device_id);
if (rc != 0) {
LOG_ERROR("Failed to get AICore register addresses: %d", rc);
return rc;
}

// Step 2: Allocate device memory for register address array
Expand Down
Loading