13 changes: 8 additions & 5 deletions src/llm/apis/openai_completions.cpp
@@ -409,17 +409,18 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco

// choices: array of size N, where N is related to n request parameter
jsonResponse.StartArray("choices");
- int index = 0;
- for (int i = 0; i < results.tokens.size(); i++) {
+ for (size_t i = 0; i < results.tokens.size(); ++i) {
const std::vector<int64_t>& tokens = results.tokens[i];
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
jsonResponse.StartObject();
// finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
- auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
+ const ov::genai::GenerationFinishReason finishReasonRaw =
+     (!results.finish_reasons.empty()) ? results.finish_reasons[0] : ov::genai::GenerationFinishReason::STOP;

Collaborator: is it possible to have results.finish_reasons.empty()? in which situations?

@dkalinowski (Collaborator, May 8, 2026): please add a comment that we access [0] because we always use generate with batch=1

+ auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
jsonResponse.FinishReason(finishReason.value_or("unknown"));
// index: integer; Choice index, only n=1 supported anyway
- jsonResponse.Index(index++);
+ jsonResponse.Index(static_cast<int>(i));

if (endpoint == Endpoint::CHAT_COMPLETIONS) {
jsonResponse.MessageObject(parsedOutput);
@@ -481,7 +482,9 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
jsonResponse.StartObject();
// finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
- auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
+ const ov::genai::GenerationFinishReason finishReasonRaw =
+     (!results.finish_reasons.empty()) ? results.finish_reasons[0] : ov::genai::GenerationFinishReason::STOP;
+ auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
jsonResponse.FinishReason(finishReason.value_or("unknown"));
// index: integer; Choice index, only n=1 supported anyway
jsonResponse.Index(index++);
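For context on what the reworked code feeds into: mapFinishReason itself is not part of this diff, so the sketch below is an assumption inferred from the call sites (the value_or("unknown") fallback implies a std::optional<std::string> return, and the in-code comment says tool calls override a plain "stop"):

    // Hypothetical sketch of mapFinishReason; the real implementation is not shown in this diff.
    #include <optional>
    #include <string>
    // plus the OpenVINO GenAI header that declares ov::genai::GenerationFinishReason

    std::optional<std::string> mapFinishReason(ov::genai::GenerationFinishReason reason, bool hasToolCalls) {
        switch (reason) {
        case ov::genai::GenerationFinishReason::STOP:
            // tool calls take precedence over a plain "stop"
            return hasToolCalls ? "tool_calls" : "stop";
        case ov::genai::GenerationFinishReason::LENGTH:
            return "length";
        default:
            return std::nullopt;  // serialized as "unknown" via value_or("unknown")
        }
    }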
18 changes: 16 additions & 2 deletions src/llm/apis/openai_responses.cpp
@@ -649,10 +649,17 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
usage.promptTokens = results.perf_metrics.get_num_input_tokens();
usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
std::vector<ParsedOutput> parsedOutputs;
+ ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
for (const auto& tokens : results.tokens) {
parsedOutputs.push_back(parseOutputIfNeeded(tokens));
}
- return serializeUnaryResponseImpl(parsedOutputs);
+ for (const auto& finishReason : results.finish_reasons) {
+     if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
+         responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
+         break;
+     }
+ }
+ return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);

@dkalinowski (Collaborator, May 8, 2026): why do we have a different implementation than chat/completions? can't we just take [0]? I think we also have batch size=1 here always
}

std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
@@ -673,7 +680,14 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecoded
parsedOutputs.push_back(std::move(output));
}
}
- return serializeUnaryResponseImpl(parsedOutputs);
+ ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
+ for (const auto& finishReason : results.finish_reasons) {
+     if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
+         responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
+         break;
+     }
+ }
+ return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
}

// --- Streaming event building blocks ---
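If the reviewer's premise above holds and this path also always generates with batch size 1, the LENGTH-scan could collapse to the same [0]-access pattern that chat/completions now uses. A minimal sketch under that assumption:

    // Sketch assuming batch size is always 1 on this path, as the reviewer suggests;
    // finish_reasons[0] then covers STOP, LENGTH, and any other reason uniformly.
    const ov::genai::GenerationFinishReason responsesFinishReason =
        (!results.finish_reasons.empty()) ? results.finish_reasons[0]
                                          : ov::genai::GenerationFinishReason::STOP;
    return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);

One behavioral difference to note: the loop in the PR only ever reports STOP or LENGTH, while taking [0] would forward any other finish reason as-is.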
6 changes: 5 additions & 1 deletion src/llm/language_model/legacy/servable.cpp
@@ -229,7 +229,11 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr<GenAiServabl
if (!executionContext->lastStreamerCallbackOutput.empty()) {
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
}
- std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
+ ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP;
+ if (!legacyExecutionContext->results.finish_reasons.empty()) {
+     finishReason = legacyExecutionContext->results.finish_reasons[0];
+ }
+ std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);

Collaborator: isn't 0 finish reasons an internal error here? should we throw if it happens?
if (!serializedChunk.empty()) {
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
}
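On the reviewer's question above, a stricter variant would fail fast instead of silently defaulting to STOP when finish_reasons is empty. A sketch only, not part of this PR; the log message and the choice of absl::InternalError are illustrative:

    // Hypothetical fail-fast handling of an empty finish_reasons vector.
    if (legacyExecutionContext->results.finish_reasons.empty()) {
        SPDLOG_LOGGER_ERROR(llm_calculator_logger, "Generation completed without reporting a finish reason");
        return absl::InternalError("missing finish reason in generation results");
    }
    ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons[0];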
6 changes: 5 additions & 1 deletion src/llm/visual_language_model/legacy/servable.cpp
@@ -245,7 +245,11 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
if (!executionContext->lastStreamerCallbackOutput.empty()) {
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
}
- std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
+ ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP;
+ if (!legacyExecutionContext->results.finish_reasons.empty()) {
+     finishReason = legacyExecutionContext->results.finish_reasons[0];
+ }
+ std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
if (!serializedChunk.empty()) {
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
}
2 changes: 1 addition & 1 deletion src/test/llm/llmnode_test.cpp
@@ -2688,7 +2688,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(
// params: model name, generate expected output, check logprobs, check finish reason, test speculative decoding, supports empty handshake msg
TestParameters{"lm_cb_regular", true, true, true, false, true},
TestParameters{"lm_legacy_regular", false, false, false, false, false},
TestParameters{"lm_legacy_regular", false, false, true, false, false},
TestParameters{"vlm_cb_regular", false, true, true, false, true},
TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
