Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions fastdeploy/engine/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -865,6 +865,7 @@ class RequestMetrics:

llm_engine_recv_req_timestamp: Optional[float] = None
llm_engine_send_req_to_engine_timestamp: Optional[float] = None
llm_engine_send_req_to_decoder_engine_timestamp: Optional[float] = None
llm_engine_recv_latest_token_timestamp: Optional[float] = None
llm_engine_recv_token_timestamp: Optional[float] = None

Expand Down Expand Up @@ -952,6 +953,9 @@ def __getitem__(self, key):
def __setitem__(self, key, value):
    """Support dict-style assignment: ``metrics[key] = value`` sets the attribute ``key``."""
    setattr(self, key, value)

def update_decoder_start_time(self):
    """Record the moment the request reached the decoder engine.

    Copies ``decode_inference_start_time`` into
    ``llm_engine_send_req_to_decoder_engine_timestamp`` so both metric
    fields report the same hand-off instant.
    """
    decode_start = self.decode_inference_start_time
    self.llm_engine_send_req_to_decoder_engine_timestamp = decode_start

Comment on lines +956 to +958
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR 描述目前基本保留了模板内容,未补充本次修改的动机、具体改动点、使用方式/回归命令、以及(若影响输出)精度验证结果。为了便于评审与后续维护,建议按模板补全至少 Motivation/Modifications/Usage(or Command)/Accuracy Tests(如无测试也请说明原因)。

Copilot uses AI. Check for mistakes.

class RequestOutput:
"""The output data of a completion request to the LLM.
Expand Down
4 changes: 3 additions & 1 deletion fastdeploy/engine/sched/resource_manager_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -1439,7 +1439,9 @@ def add_prefilled_request(self, request_output: RequestOutput):

request_output.metrics.decode_recv_req_time = request.metrics.decode_recv_req_time
request_output.metrics.decode_preallocate_req_time = request.metrics.decode_preallocate_req_time
request.metrics = request_output.metrics
request.metrics = copy.deepcopy(request_output.metrics)
request.metrics.decode_inference_start_time = time.time()
request.metrics.update_decoder_start_time()
self.running.append(request)

def _free_blocks(self, request: Request):
Expand Down
5 changes: 3 additions & 2 deletions fastdeploy/spec_decode/mtp.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def __init__(

self.attn_backends: list[AttentionBackend] = []
self._initialize_attn_backend()
self.eb5_runner = bool(int(os.getenv("EB5_ENABLE_FD_RUNNER", "0")))

# Forward meta store the global meta information of the forward
self.forward_meta = None
Expand Down Expand Up @@ -503,7 +504,7 @@ def insert_tasks_v1(
self.model_inputs["step_idx"][idx : idx + 1] = (
len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
)
if self.enable_mm:
if self.enable_mm and not self.eb5_runner:
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里在 eb5_runner=true 时跳过了多模态 attn_mask_offsets_full/decoder 的填充,但本类后续在 _initialize_forward_meta() 里仍然会在 enable_mm=True 时把 model_inputs["attn_mask_offsets"] 传入 ForwardMeta(即 attention backend 会使用该 tensor)。如果 offsets 没有被正确初始化/更新,很容易导致注意力 mask 错误、输出不正确。建议在 eb5_runner 场景下同时确保 ForwardMeta.attn_mask_offsets 为 None(或将 enable_mm 置为 False),或者提供等价的 offsets 计算/初始化逻辑,避免使用到陈旧/未定义的数据。

Copilot uses AI. Check for mistakes.
inputs = request.multimodal_inputs
self.model_inputs["attn_mask_offsets_full"][idx][0 : prefill_end_index - prefill_start_index] = (
paddle.to_tensor(
Expand Down Expand Up @@ -885,7 +886,7 @@ def _propose_cuda(self, step_use_cudagraph: bool = False, is_dummy_run: bool = F
self.model_inputs["seq_lens_decoder"],
)

if self.enable_mm:
if self.enable_mm and not self.eb5_runner:
attn_mask_offsets = update_attn_mask_offsets(
ids_remove_padding,
getattr(
Expand Down
13 changes: 13 additions & 0 deletions fastdeploy/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2011,6 +2011,13 @@ def _get_p_done_idxs_gd(self, model_forward_batch: Optional[List[Request]], num_

return prefill_done_idxs

def _execute_empty_mtp_input(self, forward_meta) -> None:
"""
run ep inference forward with empty input.
"""
for _ in range(self.fd_config.speculative_config.num_model_steps):
self.proposer.model.empty_input_forward(forward_meta)

def execute_model(
self,
model_forward_batch: Optional[List[Request]] = None,
Expand Down Expand Up @@ -2038,6 +2045,12 @@ def execute_model_normal(
model_inputs, p_done_idxs, _ = self._preprocess(model_forward_batch, num_running_requests)
model_output = self._execute(model_inputs)
if model_output is None or self.share_inputs["seq_lens_this_time_cpu"].numpy().sum().item() <= 0:
if (
self.fd_config.speculative_config.method == SpecMethod.MTP
and hasattr(self.proposer.model, "empty_input_forward")
and self.parallel_config.use_ep
):
self._execute_empty_mtp_input(self.forward_meta)
return
model_output_data, sampler_output, post_process_event = self._postprocess(
model_output, p_done_idxs, model_forward_batch, num_running_requests
Expand Down
Loading