Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions python/simpler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,11 @@ def my_l4_orch(orch, args, config):
_SHUTDOWN = 3
_CONTROL_REQUEST = 4
_CONTROL_DONE = 5
# Child writes this after its expensive init (ChipWorker.init) completes.
# Parent's _start_hierarchical spin-waits for every chip child to reach
# INIT_DONE before allowing any dispatch — keeps cross-rank init skew out
# of the per-rank host-side stream sync budget (issue #897).
_INIT_DONE = 6

# Control sub-commands (written at _OFF_CALLABLE as uint64)
_CTRL_MALLOC = 0
Expand Down Expand Up @@ -707,6 +712,11 @@ def _chip_process_loop(

mailbox_addr = ctypes.addressof(ctypes.c_char.from_buffer(buf))
state_addr = mailbox_addr + _OFF_STATE
# Signal init complete. Parent's _start_hierarchical spin-waits for
# every chip child to reach _INIT_DONE before dispatching the first
# task, so the per-rank host-side stream sync budget only covers
# actual op execution rather than absorbing peer-rank init skew.
_mailbox_store_i32(state_addr, _INIT_DONE)
sys.stderr.write(f"[chip_process pid={os.getpid()} dev={device_id}] ready\n")
sys.stderr.flush()

Expand Down Expand Up @@ -1374,6 +1384,24 @@ def _start_hierarchical(self) -> None: # noqa: PLR0912 -- three parallel fork l
else:
self._chip_pids.append(pid)

# Cross-chip init barrier. ChipWorker.init can have a long
# right tail (e.g. PTO2_RING_HEAP=4 GiB pushes per-rank
# device_malloc beyond the host stream sync budget); without
# this barrier a fast-init chip starts its aclrtSyncStream
# window N seconds before a slow peer reaches the same
# point, and any cross-rank wait inside the op (HCCL notify,
# etc.) charges the slow peer's remaining init time against
# the fast peer's PLATFORM_STREAM_SYNC_TIMEOUT_MS budget —
# the cascade documented in issue #897. Reset each child to
# _IDLE once observed so the standard dispatch state machine
# resumes from the canonical "ready for work" state.
for shm in self._chip_shms:
assert shm.buf is not None
addr = _buffer_field_addr(shm.buf, _OFF_STATE)
while _mailbox_load_i32(addr) != _INIT_DONE:
pass
_mailbox_store_i32(addr, _IDLE)

# Fork next-level Worker children (L4+ with Worker children).
# Each child process: init the inner Worker (which mmaps its own
# HeapRing and allocates its own child mailboxes), then enter
Expand Down
7 changes: 7 additions & 0 deletions src/common/hierarchical/worker_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ enum class MailboxState : int32_t {
SHUTDOWN = 3,
CONTROL_REQUEST = 4,
CONTROL_DONE = 5,
// Child writes this after its expensive init (ChipWorker::init / inner
// Worker::init) completes. Parent's _start_hierarchical spin-waits for
// EVERY chip child to reach INIT_DONE before any dispatch (CTRL_PREPARE
// or TASK_READY) goes out. This aligns the host-side stream-sync windows
// across distributed ranks so cross-rank init skew never charges against
// the per-rank PLATFORM_STREAM_SYNC_TIMEOUT_MS budget (issue #897).
INIT_DONE = 6,
};

static constexpr size_t MAILBOX_SIZE = 4096;
Expand Down
Loading