Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions src/llm/apis/openai_completions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,17 +409,22 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco

// choices: array of size N, where N is related to n request parameter
jsonResponse.StartArray("choices");
int index = 0;
for (int i = 0; i < results.tokens.size(); i++) {
if (results.finish_reasons.empty()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary LM generation result, defaulting to STOP for all choices");
} else if (results.finish_reasons.size() != results.tokens.size()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Finish reasons size ({}) does not match tokens size ({}) in unary LM generation result, defaulting missing entries to STOP",
results.finish_reasons.size(), results.tokens.size());
}
for (size_t i = 0; i < results.tokens.size(); ++i) {
const std::vector<int64_t>& tokens = results.tokens[i];
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
jsonResponse.StartObject();
// finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
const ov::genai::GenerationFinishReason finishReasonRaw = i < results.finish_reasons.size() ? results.finish_reasons[i] : ov::genai::GenerationFinishReason::STOP;
auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
Comment thread
michalkulakowski marked this conversation as resolved.
jsonResponse.FinishReason(finishReason.value_or("unknown"));
// index: integer; Choice index, only n=1 supported anyway
jsonResponse.Index(index++);
jsonResponse.Index(static_cast<int>(i));

if (endpoint == Endpoint::CHAT_COMPLETIONS) {
jsonResponse.MessageObject(parsedOutput);
Expand Down Expand Up @@ -480,8 +485,12 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", generatedTokens);
ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
jsonResponse.StartObject();
// finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
if (results.finish_reasons.empty()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary VLM generation result, defaulting to STOP");
}
// Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : results.finish_reasons[0];
auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
jsonResponse.FinishReason(finishReason.value_or("unknown"));
Comment thread
michalkulakowski marked this conversation as resolved.
// index: integer; Choice index, only n=1 supported anyway
jsonResponse.Index(index++);
Expand Down
24 changes: 22 additions & 2 deletions src/llm/apis/openai_responses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -652,17 +652,30 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
OVMS_PROFILE_FUNCTION();
usage.promptTokens = results.perf_metrics.get_num_input_tokens();
usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
if (results.finish_reasons.empty()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary LM responses generation result, defaulting to STOP");
}
std::vector<ParsedOutput> parsedOutputs;
ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
for (const auto& tokens : results.tokens) {
parsedOutputs.push_back(parseOutputIfNeeded(tokens));
}
return serializeUnaryResponseImpl(parsedOutputs);
for (const auto& finishReason : results.finish_reasons) {
Copy link
Copy Markdown
Collaborator

@dkalinowski dkalinowski May 8, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we have a different implementation here than in chat/completions? Can't we just take [0]? I think we always have batch size = 1 here as well.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now we could have the same implementation for both APIs, but this approach highlights the differences: for chat/completions there is one finish reason per output, while for responses there is a single incomplete_details for the whole response.

if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
break;
}
Comment thread
michalkulakowski marked this conversation as resolved.
}
return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
}

std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
OVMS_PROFILE_FUNCTION();
usage.promptTokens = results.perf_metrics.get_num_input_tokens();
usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
if (results.finish_reasons.empty()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary VLM responses generation result, defaulting to STOP");
}
// Usage is already correctly set from perf_metrics above — no need for updateUsage.
std::vector<ParsedOutput> parsedOutputs;
if (!textResponse.empty()) {
Expand All @@ -677,7 +690,14 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecoded
parsedOutputs.push_back(std::move(output));
}
}
return serializeUnaryResponseImpl(parsedOutputs);
ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
for (const auto& finishReason : results.finish_reasons) {
if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
break;
}
}
return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
}

// --- Streaming event building blocks ---
Expand Down
7 changes: 6 additions & 1 deletion src/llm/language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,12 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr<GenAiServabl
if (!executionContext->lastStreamerCallbackOutput.empty()) {
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
}
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
if (legacyExecutionContext->results.finish_reasons.empty()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in legacy LM streaming generation result, defaulting to STOP");
}
// Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0];
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
if (!serializedChunk.empty()) {
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
}
Expand Down
7 changes: 6 additions & 1 deletion src/llm/visual_language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,12 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
if (!executionContext->lastStreamerCallbackOutput.empty()) {
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
}
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
if (legacyExecutionContext->results.finish_reasons.empty()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in legacy VLM streaming generation result, defaulting to STOP");
}
// Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0];
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
if (!serializedChunk.empty()) {
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
}
Expand Down
167 changes: 167 additions & 0 deletions src/test/http_openai_handler_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1713,6 +1713,173 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesCompleted
ASSERT_NE(serialized.find("\"metadata\":{}"), std::string::npos) << serialized;
}

// Responses endpoint, EncodedResults input: a LENGTH finish reason must be
// serialized as status "incomplete" with incomplete_details.reason "max_tokens",
// and the payload must carry neither "completed_at" nor status "completed".
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesEncodedResultsIncompleteOnLength) {
    const std::string requestBody = R"({
"model": "llama",
"input": "What is OpenVINO?",
"max_output_tokens": 5
})";
    doc.Parse(requestBody.c_str());
    ASSERT_FALSE(doc.HasParseError());

    std::optional<uint32_t> maxTokensLimit;
    std::optional<uint32_t> maxModelLength;
    uint32_t bestOfLimit = 0;
    auto handler = std::make_shared<ovms::OpenAIResponsesHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
    ASSERT_EQ(handler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());

    // Encode a short text and validate the tensor layout before reading its raw buffer.
    ov::Tensor encodedIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
    const auto& dims = encodedIds.get_shape();
    ASSERT_EQ(dims.size(), 2);
    ASSERT_EQ(dims[0], 1);
    ASSERT_EQ(encodedIds.get_element_type(), ov::element::i64);
    int64_t* firstToken = reinterpret_cast<int64_t*>(encodedIds.data());
    int64_t* pastLastToken = firstToken + dims[1];

    // Single generated sequence that finished because of the length limit.
    ov::genai::EncodedResults results;
    results.tokens = {std::vector<int64_t>(firstToken, pastLastToken)};
    results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};

    const std::string serialized = handler->serializeUnaryResponse(results);
    const auto contains = [&serialized](const char* fragment) {
        return serialized.find(fragment) != std::string::npos;
    };

    ASSERT_TRUE(contains("\"status\":\"incomplete\"")) << serialized;
    ASSERT_TRUE(contains("\"incomplete_details\"")) << serialized;
    ASSERT_TRUE(contains("\"reason\":\"max_tokens\"")) << serialized;
    ASSERT_FALSE(contains("\"completed_at\"")) << serialized;
    ASSERT_FALSE(contains("\"status\":\"completed\"")) << serialized;
}

// Responses endpoint, EncodedResults input: a STOP finish reason must be
// serialized as status "completed" with a "completed_at" field and without
// any "incomplete_details" entry.
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesEncodedResultsCompletedOnStop) {
    std::string json = R"({
"model": "llama",
"input": "What is OpenVINO?",
"max_output_tokens": 5
})";
    doc.Parse(json.c_str());
    ASSERT_FALSE(doc.HasParseError());

    auto apiHandler = std::make_shared<ovms::OpenAIResponsesHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
    std::optional<uint32_t> maxTokensLimit;
    uint32_t bestOfLimit = 0;
    std::optional<uint32_t> maxModelLength;
    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());

    ov::genai::EncodedResults results;
    ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
    // Validate the tensor layout before touching the raw buffer: the cast below
    // and the use of shape[1] are only meaningful for a [1, N] tensor of i64.
    const auto& shape = outputIds.get_shape();
    ASSERT_EQ(shape.size(), 2u);
    ASSERT_EQ(shape[0], 1u);
    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + shape[1])};
    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};

    std::string serialized = apiHandler->serializeUnaryResponse(results);

    ASSERT_NE(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
    ASSERT_NE(serialized.find("\"completed_at\""), std::string::npos) << serialized;
    ASSERT_EQ(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
}

// Responses endpoint, VLMDecodedResults input: a LENGTH finish reason must be
// serialized as status "incomplete" with incomplete_details.reason "max_tokens",
// and the payload must carry neither "completed_at" nor status "completed".
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesVLMDecodedResultsIncompleteOnLength) {
    const std::string requestBody = R"({
"model": "llama",
"input": "What is OpenVINO?",
"max_output_tokens": 5
})";
    doc.Parse(requestBody.c_str());
    ASSERT_FALSE(doc.HasParseError());

    std::optional<uint32_t> maxTokensLimit;
    std::optional<uint32_t> maxModelLength;
    uint32_t bestOfLimit = 0;
    auto handler = std::make_shared<ovms::OpenAIResponsesHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
    ASSERT_EQ(handler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());

    // Single decoded text that finished because of the length limit.
    const std::string generatedText = "OVMS";
    ov::genai::VLMDecodedResults vlmResults;
    vlmResults.texts = {generatedText};
    vlmResults.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};

    const std::string serialized = handler->serializeUnaryResponse(vlmResults, generatedText);
    const auto contains = [&serialized](const char* fragment) {
        return serialized.find(fragment) != std::string::npos;
    };

    ASSERT_TRUE(contains("\"status\":\"incomplete\"")) << serialized;
    ASSERT_TRUE(contains("\"incomplete_details\"")) << serialized;
    ASSERT_TRUE(contains("\"reason\":\"max_tokens\"")) << serialized;
    ASSERT_FALSE(contains("\"completed_at\"")) << serialized;
    ASSERT_FALSE(contains("\"status\":\"completed\"")) << serialized;
}

// Responses endpoint, VLMDecodedResults input: a STOP finish reason must be
// serialized as status "completed" with a "completed_at" field and without
// any "incomplete_details" entry.
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesVLMDecodedResultsCompletedOnStop) {
    const std::string requestBody = R"({
"model": "llama",
"input": "What is OpenVINO?",
"max_output_tokens": 5
})";
    doc.Parse(requestBody.c_str());
    ASSERT_FALSE(doc.HasParseError());

    std::optional<uint32_t> maxTokensLimit;
    std::optional<uint32_t> maxModelLength;
    uint32_t bestOfLimit = 0;
    auto handler = std::make_shared<ovms::OpenAIResponsesHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
    ASSERT_EQ(handler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());

    // Single decoded text that finished with a regular stop.
    const std::string generatedText = "OVMS";
    ov::genai::VLMDecodedResults vlmResults;
    vlmResults.texts = {generatedText};
    vlmResults.finish_reasons = {ov::genai::GenerationFinishReason::STOP};

    const std::string serialized = handler->serializeUnaryResponse(vlmResults, generatedText);
    const auto contains = [&serialized](const char* fragment) {
        return serialized.find(fragment) != std::string::npos;
    };

    ASSERT_TRUE(contains("\"status\":\"completed\"")) << serialized;
    ASSERT_TRUE(contains("\"completed_at\"")) << serialized;
    ASSERT_FALSE(contains("\"incomplete_details\"")) << serialized;
}

// Chat completions endpoint, EncodedResults input: a LENGTH finish reason in
// the generation result must surface as "finish_reason":"length" in the
// serialized unary response.
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseChatCompletionsEncodedResultsLengthFinishReason) {
    std::string json = R"({
"model": "llama",
"stream": false,
"messages": [{"role": "user", "content": "What is OpenVINO?"}]
})";
    doc.Parse(json.c_str());
    ASSERT_FALSE(doc.HasParseError());

    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
    uint32_t maxTokensLimit = 100;
    uint32_t bestOfLimit = 0;
    std::optional<uint32_t> maxModelLength;
    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());

    ov::genai::EncodedResults results;
    ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
    // Validate the tensor layout before touching the raw buffer: the cast below
    // and the use of shape[1] are only meaningful for a [1, N] tensor of i64.
    const auto& shape = outputIds.get_shape();
    ASSERT_EQ(shape.size(), 2u);
    ASSERT_EQ(shape[0], 1u);
    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + shape[1])};
    results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};

    std::string serialized = apiHandler->serializeUnaryResponse(results);
    ASSERT_NE(serialized.find("\"finish_reason\":\"length\""), std::string::npos) << serialized;
}

// Chat completions endpoint, VLMDecodedResults input: a LENGTH finish reason in
// the generation result must surface as "finish_reason":"length" in the
// serialized unary response.
TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseChatCompletionsVLMDecodedResultsLengthFinishReason) {
    const std::string requestBody = R"({
"model": "llama",
"stream": false,
"messages": [{"role": "user", "content": "What is OpenVINO?"}]
})";
    doc.Parse(requestBody.c_str());
    ASSERT_FALSE(doc.HasParseError());

    uint32_t maxTokensLimit = 100;
    uint32_t bestOfLimit = 0;
    std::optional<uint32_t> maxModelLength;
    auto handler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
    ASSERT_EQ(handler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());

    // Single decoded text that finished because of the length limit.
    const std::string generatedText = "OVMS";
    ov::genai::VLMDecodedResults vlmResults;
    vlmResults.texts = {generatedText};
    vlmResults.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};

    const std::string serialized = handler->serializeUnaryResponse(vlmResults, generatedText);
    const std::size_t finishReasonPos = serialized.find("\"finish_reason\":\"length\"");
    ASSERT_NE(finishReasonPos, std::string::npos) << serialized;
}

TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) {
std::string json = R"({
"model": "llama",
Expand Down
6 changes: 3 additions & 3 deletions src/test/llm/llmnode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2685,9 +2685,9 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(
// params: model name, generate expected output, check logprobs, check finish reason, test speculative decoding, supports empty handshake msg
TestParameters{"lm_cb_regular", true, true, true, false, true},
TestParameters{"lm_legacy_regular", false, false, false, false, false},
TestParameters{"lm_legacy_regular", false, false, true, false, false},
TestParameters{"vlm_cb_regular", false, true, true, false, true},
TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
TestParameters{"vlm_legacy_regular", false, false, true, false, false}));

const std::string validRequestBodyWithParameter(const std::string& modelName, const std::string& parameter, const std::string& value) {
std::string requestBody = R"(
Expand Down Expand Up @@ -3611,7 +3611,7 @@ INSTANTIATE_TEST_SUITE_P(
TestParameters{"lm_cb_regular", true, true, true, false, true},
TestParameters{"lm_legacy_regular", false, false, false, false, false},
TestParameters{"vlm_cb_regular", false, true, true, false, true},
TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
TestParameters{"vlm_legacy_regular", false, false, true, false, false}));

// Common tests for all pipeline types (testing logic executed prior pipeline type selection)
class LLMConfigHttpTest : public ::testing::Test {};
Expand Down