4 changes: 4 additions & 0 deletions src/capi_frontend/server_settings.hpp
@@ -191,6 +191,10 @@ struct ServerSettingsImpl {
std::optional<std::vector<std::string>> allowedMediaDomains;
std::string logLevel = "INFO";
std::string logPath;
// When enabled, LLM chat/text completion unary responses include an extra
// "__verbose" object with the raw model input (after chat template) and the
// raw model output (before tool/reasoning parsing). Inspired by llama.cpp -v.
bool verboseResponse = false;
bool allowCredentials = false;
std::string allowedOrigins{"*"};
std::string allowedMethods{"*"};
8 changes: 8 additions & 0 deletions src/cli_parser.cpp
@@ -105,6 +105,12 @@ std::variant<bool, std::pair<int, std::string>> CLIParser::parse(int argc, char*
("log_path",
"Optional path to the log file",
cxxopts::value<std::string>(), "LOG_PATH")
("verbose_response",
"When enabled, LLM chat/text completion unary responses include an extra "
"\"__verbose\" object with the raw prompt (after chat template) and the raw "
"model output (before tool/reasoning parsing). Useful for debugging.",
cxxopts::value<bool>()->default_value("false"),
"VERBOSE_RESPONSE")
#ifdef MTR_ENABLED
("trace_path",
"Path to the trace file",
@@ -518,6 +524,8 @@ void CLIParser::prepareServer(ServerSettingsImpl& serverSettings) {
serverSettings.logLevel = result->operator[]("log_level").as<std::string>();
if (result->count("log_path"))
serverSettings.logPath = result->operator[]("log_path").as<std::string>();
if (result->count("verbose_response"))
serverSettings.verboseResponse = result->operator[]("verbose_response").as<bool>();

if (result->count("grpc_channel_arguments"))
serverSettings.grpcChannelArguments = result->operator[]("grpc_channel_arguments").as<std::string>();
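As a usage sketch (the surrounding options are illustrative, not part of this change), the flag is simply appended to an existing server launch, for example: ovms --rest_port 8000 --config_path /workspace/config.json --verbose_response true. Because the option is a cxxopts bool defaulting to "false", passing --verbose_response with no value should also enable it through the implicit "true".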
24 changes: 24 additions & 0 deletions src/llm/apis/openai_api_handler.cpp
@@ -537,6 +537,30 @@ ParsedOutput OpenAIApiHandler::parseOutputIfNeeded(const std::vector<int64_t>& g
return parsedOutput;
}

std::string OpenAIApiHandler::serializeStreamingVerboseChunk() {
if (!verboseResponse) {
return std::string();
}
std::string rawOutput;
if (!verboseRawTokens.empty()) {
rawOutput = tokenizer.decode(verboseRawTokens, ov::genai::skip_special_tokens(false));
} else {
rawOutput = verboseRawText;
}
rapidjson::StringBuffer buffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
writer.StartObject();
writer.String("__verbose");
writer.StartObject();
writer.String("prompt");
writer.String(verbosePrompt.c_str());
writer.String("raw_output");
writer.String(rawOutput.c_str());
writer.EndObject();
writer.EndObject();
return buffer.GetString();
}

// --- Free functions ---

void updateUsage(CompletionUsageStatistics& usage, const std::vector<int64_t>& generatedIds, bool echoPrompt) {
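For context, the chunk assembled above has the shape {"__verbose": {"prompt": "<templated input>", "raw_output": "<decoded raw output>"}} (placeholder strings are illustrative only): the prompt comes from enableVerboseResponse(), and the raw output is decoded from the accumulated token ids when any were collected, falling back to the accumulated raw text otherwise, matching the if/else above.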
32 changes: 32 additions & 0 deletions src/llm/apis/openai_api_handler.hpp
@@ -99,6 +99,16 @@ class OpenAIApiHandler {
// Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning.
std::unique_ptr<OutputParser> outputParser = nullptr;

// Verbose response support (enabled via --verbose_response). When set, the
// serialized unary response includes a "__verbose" object with the raw prompt
// (post chat template) and raw decoded model output (before tool/reasoning
// parsing). Inspired by llama.cpp -v.
bool verboseResponse = false;
std::string verbosePrompt;
// Streaming accumulators for raw model output.
std::vector<int64_t> verboseRawTokens;
std::string verboseRawText;

// Shared parsing helpers
absl::Status parseCommonPart(std::optional<uint32_t> maxTokensLimit, uint32_t bestOfLimit, std::optional<uint32_t> maxModelLength);
absl::Status parseResponseFormat();
@@ -156,6 +166,28 @@ class OpenAIApiHandler {
std::string getToolChoice() const;
const std::unique_ptr<OutputParser>& getOutputParser() const;

// Verbose response configuration
void enableVerboseResponse(const std::string& promptAfterTemplate) {
verboseResponse = true;
verbosePrompt = promptAfterTemplate;
}
bool isVerboseResponse() const { return verboseResponse; }
const std::string& getVerbosePrompt() const { return verbosePrompt; }
// Accumulators used to assemble the "raw model output" for streaming responses.
void appendVerboseRawTokens(const std::vector<int64_t>& tokens) {
verboseRawTokens.insert(verboseRawTokens.end(), tokens.begin(), tokens.end());
}
void appendVerboseRawText(const std::string& chunk) {
verboseRawText.append(chunk);
}
void setVerboseRawText(std::string text) {
verboseRawText = std::move(text);
}
const std::vector<int64_t>& getVerboseRawTokens() const { return verboseRawTokens; }
const std::string& getVerboseRawText() const { return verboseRawText; }
// Builds an SSE-ready JSON chunk with the accumulated verbose info, or an empty string if disabled.
std::string serializeStreamingVerboseChunk();

// Usage tracking
void setPromptTokensUsage(size_t promptTokens);
void setCompletionTokensUsage(size_t completionTokens);
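Pieced together from the servable changes later in this diff, the intended call sequence is roughly the following. This is a condensed sketch, not code from the patch: handler construction, the generation loop, and the surrounding servable plumbing are elided, and the variable names are invented.

// Sketch: "handler" is an OpenAIApiHandler, "promptAfterChatTemplate" is the templated
// input text, and "streamedTokenBatches" stands in for the per-chunk generated ids
// produced by the generation loop.
handler.enableVerboseResponse(promptAfterChatTemplate);        // done in prepareInputs when --verbose_response is set
for (const std::vector<int64_t>& generatedIds : streamedTokenBatches) {
    handler.appendVerboseRawTokens(generatedIds);              // accumulate raw output chunk by chunk
}
// The serializers then read the accumulators: unary responses embed them directly,
// and streaming paths can emit the aggregate via:
std::string verboseChunk = handler.serializeStreamingVerboseChunk();  // returns "" when verbose mode is off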
42 changes: 42 additions & 0 deletions src/llm/apis/openai_completions.cpp
@@ -394,6 +394,16 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
// TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
// Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.

if (isVerboseResponse()) {
jsonResponse.String("prompt", getVerbosePrompt());
jsonResponse.StartArray("raw_outputs");
for (const ov::genai::GenerationOutput& generationOutput : generationOutputs) {
std::string rawText = tokenizer.decode(generationOutput.generated_ids, ov::genai::skip_special_tokens(false));
jsonResponse.String(rawText);
}
jsonResponse.EndArray();
}

// finish response object
jsonResponse.EndObject();
return jsonResponse.ToString();
@@ -453,6 +463,16 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
// TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
// Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.

if (isVerboseResponse()) {
jsonResponse.String("prompt", getVerbosePrompt());
jsonResponse.StartArray("raw_outputs");
for (const auto& tokens : results.tokens) {
std::string rawText = tokenizer.decode(tokens, ov::genai::skip_special_tokens(false));
jsonResponse.String(rawText);
}
jsonResponse.EndArray();
}

// finish response object
jsonResponse.EndObject();
return jsonResponse.ToString();
@@ -519,6 +539,16 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
// TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
// Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.

if (isVerboseResponse()) {
jsonResponse.String("prompt", getVerbosePrompt());
jsonResponse.StartArray("raw_outputs");
// For VLM the raw decoded text is provided by GenAI directly.
if (!textResponse.empty()) {
jsonResponse.String(textResponse);
}
jsonResponse.EndArray();
}

// finish response object
jsonResponse.EndObject();
return jsonResponse.ToString();
@@ -613,6 +643,18 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str
// TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
// Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.

// Verbose mode: attach prompt and raw model output to the FINAL chunk only.
if (isVerboseResponse() && finishReason != ov::genai::GenerationFinishReason::NONE) {
doc.AddMember("prompt", Value(getVerbosePrompt().c_str(), allocator), allocator);
std::string rawOutput;
if (!getVerboseRawTokens().empty()) {
rawOutput = tokenizer.decode(getVerboseRawTokens(), ov::genai::skip_special_tokens(false));
} else {
rawOutput = getVerboseRawText();
}
doc.AddMember("raw_output", Value(rawOutput.c_str(), allocator), allocator);
}

StringBuffer buffer;
Writer<StringBuffer> writer(buffer);
doc.Accept(writer);
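Net effect of the serializer changes above, with --verbose_response enabled: a unary chat/text completion response gains two extra top-level fields, "prompt" (the templated input) and "raw_outputs" (an array with one raw decoded string per generated sequence, special tokens included), while a streamed response gains "prompt" and a single "raw_output" string on the final chunk only (i.e. when the finish reason is not NONE). Field names follow the code above; the exact string contents depend on the model's chat template and special tokens.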
3 changes: 3 additions & 0 deletions src/llm/language_model/legacy/servable.cpp
@@ -229,6 +229,9 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr<GenAiServabl
if (!executionContext->lastStreamerCallbackOutput.empty()) {
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
}
if (executionContext->apiHandler->isVerboseResponse() && !legacyExecutionContext->results.tokens.empty()) {
executionContext->apiHandler->appendVerboseRawTokens(legacyExecutionContext->results.tokens[0]);
}
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
if (!serializedChunk.empty()) {
executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
6 changes: 6 additions & 0 deletions src/llm/servable.cpp
@@ -263,6 +263,9 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
case Endpoint::TOKENIZE:
return absl::InternalError("Tokenize endpoint should not reach prepareInputs stage");
}
if (Config::instance().getServerSettings().verboseResponse) {
executionContext->apiHandler->enableVerboseResponse(inputText);
}
bool encodeAddSpecialTokens = (executionContext->endpoint == Endpoint::COMPLETIONS);
executionContext->inputIds = getProperties()->tokenizer.encode(inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids;
if (getProperties()->maxModelLength.has_value()) {
@@ -299,6 +302,9 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptr<GenAiServable
}
auto& generationOutput = executionContext->generationOutputs[0];
executionContext->apiHandler->incrementProcessedTokens(generationOutput.generated_ids.size());
if (executionContext->apiHandler->isVerboseResponse()) {
executionContext->apiHandler->appendVerboseRawTokens(generationOutput.generated_ids);
}

std::stringstream ss;
executionContext->textStreamer->write(generationOutput.generated_ids);
@@ -22,6 +22,7 @@
#include <unordered_map>
#include <vector>

#include "../../../config.hpp"
#include "../../../logging.hpp"
#include "../../text_utils.hpp"
#include "../../../tokenize/tokenize_parser.hpp"
@@ -110,6 +111,10 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
return absl::InvalidArgumentError("Unsupported endpoint");
}

if (Config::instance().getServerSettings().verboseResponse) {
vlmExecutionContext->apiHandler->enableVerboseResponse(vlmExecutionContext->inputText);
}

// Below logic is used only for the statistics and debugging purposes and does not affect the model execution.
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText);
bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens
10 changes: 10 additions & 0 deletions src/llm/visual_language_model/legacy/servable.cpp
@@ -224,6 +224,9 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
lastTextChunk = executionContext->lastStreamerCallbackOutput;
executionContext->lastStreamerCallbackOutput = "";
}
if (executionContext->apiHandler->isVerboseResponse() && !lastTextChunk.empty()) {
executionContext->apiHandler->appendVerboseRawText(lastTextChunk);
}
if (generationStatus != std::future_status::ready) { // continue
// For RESPONSES endpoint, always call serializeStreamingChunk so that
// output item initialization events are emitted even before the tokenizer produces text.
@@ -244,6 +247,9 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
// if streamer::put returned a value, streamer::end() result will not contain it, so we add it manually
if (!executionContext->lastStreamerCallbackOutput.empty()) {
lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
if (executionContext->apiHandler->isVerboseResponse()) {
executionContext->apiHandler->appendVerboseRawText(executionContext->lastStreamerCallbackOutput);
}
}
std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
if (!serializedChunk.empty()) {
@@ -308,6 +314,10 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
return absl::InvalidArgumentError("Unsupported endpoint");
}

if (Config::instance().getServerSettings().verboseResponse) {
vlmExecutionContext->apiHandler->enableVerboseResponse(vlmExecutionContext->inputText);
}

// Below logic is used only for the statistics and debugging purposes and does not affect the model execution.
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText);
bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens