Commit 5229cf4

Fix deadlock in results_queue.join() during training
Add a 10-second timeout to results_queue.join() to prevent indefinite hangs when lingering results aren't properly consumed. If a timeout occurs, drain any remaining items from the queue to allow training to continue. This fixes an issue where training could deadlock between steps if results from a previous step remained unprocessed in the queue.
1 parent c0365f0 commit 5229cf4

File tree

1 file changed: +15 −2 lines changed

src/art/unsloth/service.py

Lines changed: 15 additions & 2 deletions
@@ -88,8 +88,21 @@ async def train(
     ) -> AsyncIterator[dict[str, float]]:
         # Get the packed tensors from disk
         packed_tensors = packed_tensors_from_dir(**disk_packed_tensors)
-        # Wait for existing batches to finish
-        await self.results_queue.join()
+        # Wait for existing batches to finish, with timeout to prevent deadlock
+        try:
+            await asyncio.wait_for(self.results_queue.join(), timeout=10.0)
+        except asyncio.TimeoutError:
+            # Recover from deadlock by draining queue
+            drained = 0
+            while True:
+                try:
+                    self.results_queue.get_nowait()
+                    self.results_queue.task_done()
+                    drained += 1
+                except asyncio.QueueEmpty:
+                    break
+            if verbose and drained > 0:
+                print(f"Warning: Drained {drained} lingering result(s) from queue")
         # If we haven't already, start the training task
         if self._train_task is None:
             self._train_task = asyncio.create_task(

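For reference, a minimal, self-contained sketch of the same timeout-and-drain pattern is shown below, independent of the service class. The helper name wait_or_drain, the sample payload, and the 0.1-second timeout used in the demo are illustrative assumptions, not code from the repository.

import asyncio

# Hypothetical helper mirroring the diff above: wait for all queued results to be
# consumed, draining the queue if the wait times out, and report how many items
# were discarded.
async def wait_or_drain(results_queue: asyncio.Queue, timeout: float = 10.0) -> int:
    try:
        await asyncio.wait_for(results_queue.join(), timeout=timeout)
        return 0
    except asyncio.TimeoutError:
        drained = 0
        while True:
            try:
                results_queue.get_nowait()
                results_queue.task_done()  # decrements the counter that join() waits on
                drained += 1
            except asyncio.QueueEmpty:
                break
        return drained

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    queue.put_nowait({"loss": 0.1})  # a lingering, unconsumed result
    drained = await wait_or_drain(queue, timeout=0.1)
    print(f"Drained {drained} lingering result(s)")

asyncio.run(main())

Note that task_done() must be called once per drained item: it decrements the unfinished-task counter that join() blocks on, so the drain both empties the queue and releases any other waiters. The timeout is still what actually prevents the hang, since results that were fetched earlier but never marked done cannot be recovered by draining.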