NVIDIA · nvzhihanj · Jan 24, 2026 · Jan 23, 2026
@@ -125,7 +125,7 @@ def __init__(self,
                  max_num_sequences: int,
                  drafter: Optional[Drafter] = None,
                  disable_overlap_scheduler: bool = False,
-                 max_input_len: int = 2048,
+                 max_input_len: int = 0x3fffffff,
                  max_batch_size: int = 8,
                  max_beam_width: int = 1,
                  max_draft_len: int = 0,
@@ -1388,6 +1388,7 @@ def _executor_loop(self):
                 if scheduled_batch is None:
                     break
 
+                self._terminate_requests(scheduled_batch.paused_requests)
                 self._pause_requests(scheduled_batch.paused_requests)
 
                 finished_requests = []
@@ -1623,7 +1624,7 @@ def _executor_loop_overlap(self):
                         else:
                             can_forward = True
 
-                self._pause_requests(scheduled_batch.paused_requests)
+                self._terminate_requests(scheduled_batch.paused_requests)
 
                 can_queue = self._can_queue(scheduled_batch)
                 if can_queue:
@@ -1726,6 +1727,8 @@ def _executor_loop_overlap(self):
                     # Cleanup previous draft resources used in the draft model
                     self.drafter.cleanup_previous_draft_resources()
 
+                self._pause_requests(scheduled_batch.paused_requests)
+
                 if can_queue:
                     guided_decoder_failed_requests = None
                     if self.guided_decoder is not None:
@@ -2788,14 +2791,16 @@ def key_has_response():
             self.responses.pop(id)
             return response
 
-    def _pause_requests(self, requests_to_pause):
+    def _terminate_requests(self, requests_to_pause):
         # todo: support work with self.inflight_req_ids.
         #       Currently, self.inflight_req_ids is not.
-        max_input_len = self.max_input_len
         for req in requests_to_pause:
-            req.pause(max_input_len)
             self._terminate_request(req)
 
+    def _pause_requests(self, requests_to_pause):
+        for req in requests_to_pause:
+            req.pause(self.max_input_len)
+
     def _add_inflight_ids(self, scheduled_requests):
         """Add request IDs of current requests to self.inflight_req_ids.