Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions tensorrt_llm/_torch/pyexecutor/py_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def __init__(self,
max_num_sequences: int,
drafter: Optional[Drafter] = None,
disable_overlap_scheduler: bool = False,
max_input_len: int = 2048,
max_input_len: int = 0x3fffffff,
max_batch_size: int = 8,
max_beam_width: int = 1,
max_draft_len: int = 0,
Expand Down Expand Up @@ -1388,6 +1388,7 @@ def _executor_loop(self):
if scheduled_batch is None:
break

self._terminate_requests(scheduled_batch.paused_requests)
self._pause_requests(scheduled_batch.paused_requests)

finished_requests = []
Expand Down Expand Up @@ -1623,7 +1624,7 @@ def _executor_loop_overlap(self):
else:
can_forward = True

self._pause_requests(scheduled_batch.paused_requests)
self._terminate_requests(scheduled_batch.paused_requests)

can_queue = self._can_queue(scheduled_batch)
if can_queue:
Expand Down Expand Up @@ -1726,6 +1727,8 @@ def _executor_loop_overlap(self):
# Cleanup previous draft resources used in the draft model
self.drafter.cleanup_previous_draft_resources()

self._pause_requests(scheduled_batch.paused_requests)

if can_queue:
guided_decoder_failed_requests = None
if self.guided_decoder is not None:
Expand Down Expand Up @@ -2788,14 +2791,16 @@ def key_has_response():
self.responses.pop(id)
return response

def _pause_requests(self, requests_to_pause):
def _terminate_requests(self, requests_to_pause):
# todo: support work with self.inflight_req_ids.
# Currently, self.inflight_req_ids is not.
max_input_len = self.max_input_len
for req in requests_to_pause:
req.pause(max_input_len)
self._terminate_request(req)

def _pause_requests(self, requests_to_pause):
for req in requests_to_pause:
req.pause(self.max_input_len)

def _add_inflight_ids(self, scheduled_requests):
"""Add request IDs of current requests to self.inflight_req_ids.

Expand Down
Loading