Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,24 @@ int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
return -1;
}

// Reports whether the scheduler thread `thread_idx` owns at least one core
// whose execution state has a non-null running_slot_state.
//
// Walks the core-id list exposed by this thread's core tracker and stops at
// the first core with a running slot. Returns false when the thread owns no
// cores or none of them has a running slot.
//
// NOTE(review): thread_idx is used as an index without a range check —
// callers are presumably expected to pass a valid scheduler thread index.
bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
    auto &&tracker = core_trackers_[thread_idx];
    const int32_t *owned_cores = tracker.core_ids();
    const int32_t owned_count = tracker.core_num();

    int32_t pos = 0;
    while (pos < owned_count) {
        const int32_t core_id = owned_cores[pos];
        if (core_exec_states_[core_id].running_slot_state != nullptr) {
            return true;
        }
        ++pos;
    }
    return false;
}

// Returns true when no scheduler thread in [0, aicpu_thread_num_) owns a core
// with a running slot, i.e. self_owns_running_task() is false for every
// thread. The scan stops at the first thread that does own one.
bool SchedulerContext::no_thread_owns_running_task() const {
    bool owner_found = false;
    for (int32_t thread = 0; !owner_found && thread < aicpu_thread_num_; ++thread) {
        owner_found = self_owns_running_task(thread);
    }
    return !owner_found;
}

void SchedulerContext::log_stall_diagnostics(
int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,21 @@ class SchedulerContext {
// the cold diagnostic path.
int32_t find_core_owner_thread(int32_t core_id) const;

// Does this thread own any core with a RUNNING task (running_slot_state set)?
// Gates the scheduler timeout fatal latch: a thread without an owned
// RUNNING task has no first-hand evidence of a stuck dispatch and must
// not declare global fatal on its own idle observation. The thread that
// does own the stuck task will reach the budget on its own polls and
// latch with valid evidence (or recover when the COND register flips).
bool self_owns_running_task(int32_t thread_idx) const;

// Returns true when *no* scheduler thread owns a RUNNING task. Used as the
// second fatal-latch condition: if the wall-clock budget elapsed AND no thread
// owns RUNNING work AND tasks remain incomplete, the system is in a
// pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
// ownerless idle threads are the only observers — let one of them latch.
bool no_thread_owns_running_task() const;

__attribute__((noinline, cold)) int32_t handle_timeout_exit(
int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
int32_t last_progress_count
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,13 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
#endif

// Wall-clock timestamp of the last progress made on this thread.
// Updated on made_progress, and also refreshed when the wall-clock budget
// elapses without a fatal latch (a sibling thread still owns a RUNNING
// task); consulted to decide whether the wall-clock budget for declaring a
// scheduler hang has elapsed. Initialized to "now" so the first budget
// cycle starts when this thread does, not at an undefined value.
uint64_t last_progress_ts = get_sys_cnt_aicpu();

while (true) {
if (completed_.load(std::memory_order_acquire)) {
break;
Expand Down Expand Up @@ -737,6 +744,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_

if (made_progress) {
idle_iterations = 0;
last_progress_ts = get_sys_cnt_aicpu();
} else {
while (deferred_release_count > 0) {
#if PTO2_SCHED_PROFILING
Expand All @@ -755,17 +763,39 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
if (idle_iterations % STALL_LOG_INTERVAL == 0) {
log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
}
if (idle_iterations >= MAX_IDLE_ITERATIONS) {
return handle_timeout_exit(
thread_idx, header, runtime, idle_iterations, last_progress_count
// Wall-clock budget gate, with two fatal-latch branches:
//
// 1. Self owns a RUNNING task — first-hand evidence the
// dispatch is stuck. Latch.
// 2. No thread anywhere owns a RUNNING task AND tasks remain
// unfinished — the system is in a pre-dispatch / WAIT-only
// deadlock (e.g. dependency cycle). Ownerless idle threads
// are the only observers; let this one latch on the global
// evidence (`completed_tasks_ < total_tasks_` and
// `no_thread_owns_running_task()`).
//
// Otherwise: a sibling thread owns a RUNNING task but hasn't
// hit its own budget yet (typical distributed startup-skew
// case) — refresh last_progress_ts and keep spinning. The
// STALL diagnostic above still fires periodically so
// observability is preserved.
if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
bool self_owns = self_owns_running_task(thread_idx);
bool global_stuck = !self_owns && total_tasks_ > 0 &&
completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
no_thread_owns_running_task();
if (self_owns || global_stuck) {
return handle_timeout_exit(
thread_idx, header, runtime, idle_iterations, last_progress_count
#if PTO2_PROFILING
,
l2_swimlane.sched_start_ts
,
l2_swimlane.sched_start_ts
#endif
);
} else {
SPIN_WAIT_HINT();
);
}
last_progress_ts = get_sys_cnt_aicpu();
}
SPIN_WAIT_HINT();
#if PTO2_PROFILING
CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
// Idle iterations no longer emit a phase record. Host tooling
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,21 @@ constexpr int32_t MAX_IDLE_ITERATIONS = PLATFORM_MAX_IDLE_ITERATIONS; // platfo
constexpr int32_t STALL_LOG_INTERVAL =
MAX_IDLE_ITERATIONS * 6 / 10; // derived: ~one stall diagnostic halfway to timeout
constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024; // Check orchestrator error every N idle iters

// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
// the per-thread iteration-count cap for the fatal-latch decision; the
// iteration cap still drives the STALL diagnostic cadence (which is per-thread
// observability and benefits from running at the thread's own pace).
//
// Using wall-clock here is load-bearing for distributed runs: with per-thread
// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
// same iteration count. The fast spinner racing ahead and latching fatal
// kills the slower-but-correct poller mid-poll — see the distributed
// startup-skew scenario in issue #897.
constexpr int32_t SCHEDULER_TIMEOUT_MS = 5000; // 5 s; > worst observed distributed-init skew + HCCL wait
constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =
static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
Comment thread
hw-native-sys-bot marked this conversation as resolved.
constexpr int32_t STALL_DUMP_READY_MAX = 8;
constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
constexpr int32_t STALL_DUMP_CORE_MAX = 8;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,24 @@ int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
return -1;
}

// Reports whether the scheduler thread `thread_idx` owns at least one core
// whose execution state has a non-null running_slot_state.
//
// Walks the core-id list exposed by this thread's core tracker and stops at
// the first core with a running slot. Returns false when the thread owns no
// cores or none of them has a running slot.
//
// NOTE(review): thread_idx is used as an index without a range check —
// callers are presumably expected to pass a valid scheduler thread index.
bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
    auto &&tracker = core_trackers_[thread_idx];
    const int32_t *owned_cores = tracker.core_ids();
    const int32_t owned_count = tracker.core_num();

    int32_t pos = 0;
    while (pos < owned_count) {
        const int32_t core_id = owned_cores[pos];
        if (core_exec_states_[core_id].running_slot_state != nullptr) {
            return true;
        }
        ++pos;
    }
    return false;
}

// Returns true when no scheduler thread in [0, aicpu_thread_num_) owns a core
// with a running slot, i.e. self_owns_running_task() is false for every
// thread. The scan stops at the first thread that does own one.
bool SchedulerContext::no_thread_owns_running_task() const {
    bool owner_found = false;
    for (int32_t thread = 0; !owner_found && thread < aicpu_thread_num_; ++thread) {
        owner_found = self_owns_running_task(thread);
    }
    return !owner_found;
}

void SchedulerContext::log_stall_diagnostics(
int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,21 @@ class SchedulerContext {
// the cold diagnostic path.
int32_t find_core_owner_thread(int32_t core_id) const;

// Does this thread own any core with a RUNNING task (running_slot_state set)?
// Gates the scheduler timeout fatal latch: a thread without an owned
// RUNNING task has no first-hand evidence of a stuck dispatch and must
// not declare global fatal on its own idle observation. The thread that
// does own the stuck task will reach the budget on its own polls and
// latch with valid evidence (or recover when the COND register flips).
bool self_owns_running_task(int32_t thread_idx) const;

// Returns true when *no* scheduler thread owns a RUNNING task. Used as the
// second fatal-latch condition: if the wall-clock budget elapsed AND no thread
// owns RUNNING work AND tasks remain incomplete, the system is in a
// pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
// ownerless idle threads are the only observers — let one of them latch.
bool no_thread_owns_running_task() const;

__attribute__((noinline, cold)) int32_t handle_timeout_exit(
int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
int32_t last_progress_count
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,13 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
#endif

// Wall-clock timestamp of the last progress made on this thread.
// Updated on made_progress, and also refreshed when the wall-clock budget
// elapses without a fatal latch (a sibling thread still owns a RUNNING
// task); consulted to decide whether the wall-clock budget for declaring a
// scheduler hang has elapsed. Initialized to "now" so the first budget
// cycle starts when this thread does, not at an undefined value.
uint64_t last_progress_ts = get_sys_cnt_aicpu();

while (true) {
if (completed_.load(std::memory_order_acquire)) {
break;
Expand Down Expand Up @@ -733,6 +740,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_

if (made_progress) {
idle_iterations = 0;
last_progress_ts = get_sys_cnt_aicpu();
} else {
while (deferred_release_count > 0) {
#if PTO2_SCHED_PROFILING
Expand All @@ -751,17 +759,39 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
if (idle_iterations % STALL_LOG_INTERVAL == 0) {
log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
}
if (idle_iterations >= MAX_IDLE_ITERATIONS) {
return handle_timeout_exit(
thread_idx, header, runtime, idle_iterations, last_progress_count
// Wall-clock budget gate, with two fatal-latch branches:
//
// 1. Self owns a RUNNING task — first-hand evidence the
// dispatch is stuck. Latch.
// 2. No thread anywhere owns a RUNNING task AND tasks remain
// unfinished — the system is in a pre-dispatch / WAIT-only
// deadlock (e.g. dependency cycle). Ownerless idle threads
// are the only observers; let this one latch on the global
// evidence (`completed_tasks_ < total_tasks_` and
// `no_thread_owns_running_task()`).
//
// Otherwise: a sibling thread owns a RUNNING task but hasn't
// hit its own budget yet (typical distributed startup-skew
// case) — refresh last_progress_ts and keep spinning. The
// STALL diagnostic above still fires periodically so
// observability is preserved.
if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
bool self_owns = self_owns_running_task(thread_idx);
bool global_stuck = !self_owns && total_tasks_ > 0 &&
completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
no_thread_owns_running_task();
if (self_owns || global_stuck) {
return handle_timeout_exit(
thread_idx, header, runtime, idle_iterations, last_progress_count
#if PTO2_PROFILING
,
l2_swimlane.sched_start_ts
,
l2_swimlane.sched_start_ts
#endif
);
} else {
SPIN_WAIT_HINT();
);
}
last_progress_ts = get_sys_cnt_aicpu();
}
SPIN_WAIT_HINT();
#if PTO2_PROFILING
CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,21 @@ constexpr int32_t MAX_IDLE_ITERATIONS = PLATFORM_MAX_IDLE_ITERATIONS; // platfo
constexpr int32_t STALL_LOG_INTERVAL =
MAX_IDLE_ITERATIONS * 6 / 10; // derived: ~one stall diagnostic halfway to timeout
constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024; // Check orchestrator error every N idle iters

// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
// the per-thread iteration-count cap for the fatal-latch decision; the
// iteration cap still drives the STALL diagnostic cadence (which is per-thread
// observability and benefits from running at the thread's own pace).
//
// Using wall-clock here is load-bearing for distributed runs: with per-thread
// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
// same iteration count. The fast spinner racing ahead and latching fatal
// kills the slower-but-correct poller mid-poll — see the distributed
// startup-skew scenario in issue #897.
constexpr int32_t SCHEDULER_TIMEOUT_MS = 5000; // 5 s; > worst observed distributed-init skew + HCCL wait
constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =
static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
Comment thread
hw-native-sys-bot marked this conversation as resolved.
constexpr int32_t STALL_DUMP_READY_MAX = 8;
constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
constexpr int32_t STALL_DUMP_CORE_MAX = 8;
Expand Down
Loading