hw-native-sys · ChaoWao · May 31, 2026 · May 31, 2026
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -212,6 +212,24 @@ int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
     return -1;
 }
 
+bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
+    // Scan the cores tracked for thread_idx; stop at the first non-null running_slot_state.
+    const int32_t *owned_cores = core_trackers_[thread_idx].core_ids();
+    const int32_t owned_count = core_trackers_[thread_idx].core_num();
+    bool owns_running = false;
+    for (int32_t k = 0; k < owned_count && !owns_running; ++k) {
+        owns_running = core_exec_states_[owned_cores[k]].running_slot_state != nullptr;
+    }
+    return owns_running;
+}
+
+bool SchedulerContext::no_thread_owns_running_task() const {
+    // A single thread that owns a running task is enough to answer "false".
+    bool any_owner = false;
+    for (int32_t tid = 0; !any_owner && tid < aicpu_thread_num_; ++tid) any_owner = self_owns_running_task(tid);
+    return !any_owner;
+}
+
 void SchedulerContext::log_stall_diagnostics(
     int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
 ) {

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -339,6 +339,21 @@ class SchedulerContext {
     // the cold diagnostic path.
     int32_t find_core_owner_thread(int32_t core_id) const;
 
+    // Returns true when thread `thread_idx` tracks at least one core whose
+    // running_slot_state is non-null, i.e. it owns a RUNNING task. Gates the
+    // scheduler timeout fatal latch: a thread without an owned RUNNING task has
+    // no first-hand evidence of a stuck dispatch and must not declare global
+    // fatal on its own idle observation. The owner of the stuck task reaches the
+    // budget on its own polls and latches (or recovers when the COND register flips).
+    bool self_owns_running_task(int32_t thread_idx) const;
+
+    // Returns true when NO scheduler thread owns a RUNNING task. Used as the
+    // second fatal-latch condition: if the wall-clock budget elapsed AND no
+    // thread owns RUNNING work AND tasks remain incomplete, the system is in a
+    // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
+    // ownerless idle threads are the only observers — let one of them latch.
+    bool no_thread_owns_running_task() const;
+
     __attribute__((noinline, cold)) int32_t handle_timeout_exit(
         int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
         int32_t last_progress_count

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -537,6 +537,13 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
 #endif
 
+    // System-counter timestamp marking the start of this thread's current
+    // no-progress budget window. Reset whenever made_progress is set, and
+    // re-armed when the budget elapses without a fatal latch. Initialized to
+    // "now" so the first budget cycle starts when this thread does, not at
+    // an undefined value.
+    uint64_t last_progress_ts = get_sys_cnt_aicpu();
+
     while (true) {
         if (completed_.load(std::memory_order_acquire)) {
             break;
@@ -737,6 +744,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 
         if (made_progress) {
             idle_iterations = 0;
+            last_progress_ts = get_sys_cnt_aicpu();
         } else {
             while (deferred_release_count > 0) {
 #if PTO2_SCHED_PROFILING
@@ -755,17 +763,39 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             if (idle_iterations % STALL_LOG_INTERVAL == 0) {
                 log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
             }
-            if (idle_iterations >= MAX_IDLE_ITERATIONS) {
-                return handle_timeout_exit(
-                    thread_idx, header, runtime, idle_iterations, last_progress_count
+            // Wall-clock budget gate, with two fatal-latch branches:
+            //
+            // 1. Self owns a RUNNING task — first-hand evidence the
+            //    dispatch is stuck. Latch.
+            // 2. No thread anywhere owns a RUNNING task AND tasks remain
+            //    unfinished — the system is in a pre-dispatch / WAIT-only
+            //    deadlock (e.g. dependency cycle). Ownerless idle threads
+            //    are the only observers; let this one latch on the global
+            //    evidence (`completed_tasks_ < total_tasks_` and
+            //    `no_thread_owns_running_task()`).
+            //
+            // Otherwise: a sibling thread owns a RUNNING task but hasn't
+            // hit its own budget yet (typical distributed startup-skew
+            // case) — refresh last_progress_ts and keep spinning. The
+            // STALL diagnostic above still fires periodically so
+            // observability is preserved.
+            if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
+                bool self_owns = self_owns_running_task(thread_idx);
+                bool global_stuck = !self_owns && total_tasks_ > 0 &&
+                                    completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
+                                    no_thread_owns_running_task();
+                if (self_owns || global_stuck) {
+                    return handle_timeout_exit(
+                        thread_idx, header, runtime, idle_iterations, last_progress_count
 #if PTO2_PROFILING
-                    ,
-                    l2_swimlane.sched_start_ts
+                        ,
+                        l2_swimlane.sched_start_ts
 #endif
-                );
-            } else {
-                SPIN_WAIT_HINT();
+                    );
+                }
+                last_progress_ts = get_sys_cnt_aicpu();
             }
+            SPIN_WAIT_HINT();
 #if PTO2_PROFILING
             CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
             // Idle iterations no longer emit a phase record. Host tooling

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
@@ -48,6 +48,21 @@ constexpr int32_t MAX_IDLE_ITERATIONS = PLATFORM_MAX_IDLE_ITERATIONS;  // platfo
 constexpr int32_t STALL_LOG_INTERVAL =
     MAX_IDLE_ITERATIONS * 6 / 10;                     // derived: ~one stall diagnostic halfway to timeout
 constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // Check orchestrator error every N idle iters
+
+// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
+// the per-thread iteration-count cap for the fatal-latch decision; the
+// iteration cap still drives the STALL diagnostic cadence (which is per-thread
+// observability and benefits from running at the thread's own pace).
+//
+// Using wall-clock here is load-bearing for distributed runs: with per-thread
+// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
+// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
+// same iteration count. The fast spinner racing ahead and latching fatal
+// kills the slower-but-correct poller mid-poll — see the distributed
+// startup-skew scenario in issue #897.
+constexpr int32_t SCHEDULER_TIMEOUT_MS = 5000;  // 5 s; > worst observed distributed-init skew + HCCL wait
+constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =   // multiply before dividing: keeps the (freq % 1000) remainder
+    static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ) / 1000;
 constexpr int32_t STALL_DUMP_READY_MAX = 8;
 constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
 constexpr int32_t STALL_DUMP_CORE_MAX = 8;

diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -212,6 +212,24 @@ int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
     return -1;
 }
 
+bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
+    // Scan the cores tracked for thread_idx; stop at the first non-null running_slot_state.
+    const int32_t *owned_cores = core_trackers_[thread_idx].core_ids();
+    const int32_t owned_count = core_trackers_[thread_idx].core_num();
+    bool owns_running = false;
+    for (int32_t k = 0; k < owned_count && !owns_running; ++k) {
+        owns_running = core_exec_states_[owned_cores[k]].running_slot_state != nullptr;
+    }
+    return owns_running;
+}
+
+bool SchedulerContext::no_thread_owns_running_task() const {
+    // A single thread that owns a running task is enough to answer "false".
+    bool any_owner = false;
+    for (int32_t tid = 0; !any_owner && tid < aicpu_thread_num_; ++tid) any_owner = self_owns_running_task(tid);
+    return !any_owner;
+}
+
 void SchedulerContext::log_stall_diagnostics(
     int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
 ) {

diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
@@ -343,6 +343,21 @@ class SchedulerContext {
     // the cold diagnostic path.
     int32_t find_core_owner_thread(int32_t core_id) const;
 
+    // Returns true when thread `thread_idx` tracks at least one core whose
+    // running_slot_state is non-null, i.e. it owns a RUNNING task. Gates the
+    // scheduler timeout fatal latch: a thread without an owned RUNNING task has
+    // no first-hand evidence of a stuck dispatch and must not declare global
+    // fatal on its own idle observation. The owner of the stuck task reaches the
+    // budget on its own polls and latches (or recovers when the COND register flips).
+    bool self_owns_running_task(int32_t thread_idx) const;
+
+    // Returns true when NO scheduler thread owns a RUNNING task. Used as the
+    // second fatal-latch condition: if the wall-clock budget elapsed AND no
+    // thread owns RUNNING work AND tasks remain incomplete, the system is in a
+    // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
+    // ownerless idle threads are the only observers — let one of them latch.
+    bool no_thread_owns_running_task() const;
+
     __attribute__((noinline, cold)) int32_t handle_timeout_exit(
         int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
         int32_t last_progress_count

diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -532,6 +532,13 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
     l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
 #endif
 
+    // System-counter timestamp marking the start of this thread's current
+    // no-progress budget window. Reset whenever made_progress is set, and
+    // re-armed when the budget elapses without a fatal latch. Initialized to
+    // "now" so the first budget cycle starts when this thread does, not at
+    // an undefined value.
+    uint64_t last_progress_ts = get_sys_cnt_aicpu();
+
     while (true) {
         if (completed_.load(std::memory_order_acquire)) {
             break;
@@ -733,6 +740,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
 
         if (made_progress) {
             idle_iterations = 0;
+            last_progress_ts = get_sys_cnt_aicpu();
         } else {
             while (deferred_release_count > 0) {
 #if PTO2_SCHED_PROFILING
@@ -751,17 +759,39 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             if (idle_iterations % STALL_LOG_INTERVAL == 0) {
                 log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
             }
-            if (idle_iterations >= MAX_IDLE_ITERATIONS) {
-                return handle_timeout_exit(
-                    thread_idx, header, runtime, idle_iterations, last_progress_count
+            // Wall-clock budget gate, with two fatal-latch branches:
+            //
+            // 1. Self owns a RUNNING task — first-hand evidence the
+            //    dispatch is stuck. Latch.
+            // 2. No thread anywhere owns a RUNNING task AND tasks remain
+            //    unfinished — the system is in a pre-dispatch / WAIT-only
+            //    deadlock (e.g. dependency cycle). Ownerless idle threads
+            //    are the only observers; let this one latch on the global
+            //    evidence (`completed_tasks_ < total_tasks_` and
+            //    `no_thread_owns_running_task()`).
+            //
+            // Otherwise: a sibling thread owns a RUNNING task but hasn't
+            // hit its own budget yet (typical distributed startup-skew
+            // case) — refresh last_progress_ts and keep spinning. The
+            // STALL diagnostic above still fires periodically so
+            // observability is preserved.
+            if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
+                bool self_owns = self_owns_running_task(thread_idx);
+                bool global_stuck = !self_owns && total_tasks_ > 0 &&
+                                    completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
+                                    no_thread_owns_running_task();
+                if (self_owns || global_stuck) {
+                    return handle_timeout_exit(
+                        thread_idx, header, runtime, idle_iterations, last_progress_count
 #if PTO2_PROFILING
-                    ,
-                    l2_swimlane.sched_start_ts
+                        ,
+                        l2_swimlane.sched_start_ts
 #endif
-                );
-            } else {
-                SPIN_WAIT_HINT();
+                    );
+                }
+                last_progress_ts = get_sys_cnt_aicpu();
             }
+            SPIN_WAIT_HINT();
 #if PTO2_PROFILING
             CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
             if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {

diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
@@ -49,6 +49,21 @@ constexpr int32_t MAX_IDLE_ITERATIONS = PLATFORM_MAX_IDLE_ITERATIONS;  // platfo
 constexpr int32_t STALL_LOG_INTERVAL =
     MAX_IDLE_ITERATIONS * 6 / 10;                     // derived: ~one stall diagnostic halfway to timeout
 constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // Check orchestrator error every N idle iters
+
+// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
+// the per-thread iteration-count cap for the fatal-latch decision; the
+// iteration cap still drives the STALL diagnostic cadence (which is per-thread
+// observability and benefits from running at the thread's own pace).
+//
+// Using wall-clock here is load-bearing for distributed runs: with per-thread
+// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
+// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
+// same iteration count. The fast spinner racing ahead and latching fatal
+// kills the slower-but-correct poller mid-poll — see the distributed
+// startup-skew scenario in issue #897.
+constexpr int32_t SCHEDULER_TIMEOUT_MS = 5000;  // 5 s; > worst observed distributed-init skew + HCCL wait
+constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =   // multiply before dividing: keeps the (freq % 1000) remainder
+    static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ) / 1000;
 constexpr int32_t STALL_DUMP_READY_MAX = 8;
 constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
 constexpr int32_t STALL_DUMP_CORE_MAX = 8;