4 changes: 2 additions & 2 deletions docs/parameters.md
@@ -138,8 +138,8 @@ Task specific parameters for different tasks (text generation/image generation/e
| `--max_prompt_len` | `integer` | Sets NPU specific property for maximum number of tokens in the prompt. |
| `--kv_cache_precision` | `string` | Reduced kv cache precision to `u8` lowers the cache size consumption. Accepted values: `u8` or empty (default). |
| `--model_distribution_policy` | `string` | TENSOR_PARALLEL distributes tensor to multiple sockets/devices and processes it in parallel. PIPELINE_PARALLEL distributes different tensors to process by each device. Accepted values: `TENSOR_PARALLEL`, `PIPELINE_PARALLEL` or empty (default). |
| `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gptoss] |
| `--tool_parser` | `string` | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, phi4, hermes3, mistral, qwen3coder, gptoss, devstral, lfm2] |
| `--reasoning_parser` | `string` | Type of parser to use for reasoning content extraction from model output. Currently supported: [qwen3, gptoss, gemma4] |
| `--tool_parser` | `string` | Type of parser to use for tool calls extraction from model output. Currently supported: [llama3, phi4, hermes3, mistral, qwen3coder, gptoss, devstral, lfm2, gemma4] |
| `--enable_tool_guided_generation` | `bool` | Enables enforcing tool schema during generation. Requires setting response parser. Default: false. |

### Image generation
12 changes: 11 additions & 1 deletion prepare_llm_models.sh
@@ -39,6 +39,7 @@ MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
GPT_OSS_MODEL="openai/gpt-oss-20b"
DEVSTRAL_MODEL="unsloth/Devstral-Small-2507"
LFM2_MODEL="LiquidAI/LFM2-2.6B"
GEMMA4_MODEL="OpenVINO/gemma-4-E4B-it-int4-ov"

if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then echo "Prepare models with python > 3.8."; exit 1 ; fi

@@ -228,4 +229,13 @@ fi
if [ ! -f "$1/$LFM2_MODEL/$TOKENIZER_FILE" ]; then
echo "[ERROR] Models file $1/$LFM2_MODEL/$TOKENIZER_FILE does not exist."
exit 1
fi
fi
if [ -f "$1/$GEMMA4_MODEL/$TOKENIZER_FILE" ]; then
echo "Models file $1/$GEMMA4_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
else
hf download "$GEMMA4_MODEL" --local-dir "$1/$GEMMA4_MODEL" --include "*tokenizer*"
fi
if [ ! -f "$1/$GEMMA4_MODEL/$TOKENIZER_FILE" ]; then
echo "[ERROR] Models file $1/$GEMMA4_MODEL/$TOKENIZER_FILE does not exist."
exit 1
fi
1 change: 1 addition & 0 deletions spelling-whitelist.txt
@@ -29,3 +29,4 @@ demos/vlm_npu/README.md:157: mane ==> main, many, maine
demos/vlm_npu/README.md:218: mane ==> main, many, maine
demos/integration_with_OpenWebUI/README.md:423: Buildin ==> Building, Build in
src/test/llm/output_parsers/lfm2_output_parser_test.cpp
src/test/llm/output_parsers/gemma4_output_parser_test.cpp
36 changes: 34 additions & 2 deletions src/llm/BUILD
@@ -197,6 +197,38 @@ ovms_cc_library(
],
visibility = ["//visibility:public"],
)
ovms_cc_library(
name = "io_processing_gemma4_tool_parser",
hdrs = ["io_processing/gemma4/tool_parser.hpp", "io_processing/gemma4/reasoning_parser.hpp"],
srcs = ["io_processing/gemma4/tool_parser.cpp", "io_processing/gemma4/reasoning_parser.cpp"],
deps = [
"@com_github_tencent_rapidjson//:rapidjson",
"//src/port:rapidjson_document",
"//src:libovmslogging",
"//src:libovmsstring_utils",
":io_processing_utils",
":io_processing_base_output_parser",
":io_processing_qwen3_reasoning_parser",
"//third_party:genai",
],
visibility = ["//visibility:public"],
)

ovms_cc_library(
name = "io_processing_qwen3_reasoning_parser",
hdrs = ["io_processing/qwen3/reasoning_parser.hpp"],
srcs = ["io_processing/qwen3/reasoning_parser.cpp"],
deps = [
"@com_github_tencent_rapidjson//:rapidjson",
"//src/port:rapidjson_document",
"//src:libovmslogging",
"//src:libovmsstring_utils",
":io_processing_utils",
":io_processing_base_output_parser",
"//third_party:genai",
],
visibility = ["//visibility:public"],
)

ovms_cc_library( # TODO split further so we don't have to recompile everything when changing one parser ...
name = "output_parsers",
@@ -206,7 +238,6 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
"io_processing/phi4/tool_parser.hpp",
"io_processing/devstral/tool_parser.hpp",
"io_processing/mistral/tool_parser.hpp",
"io_processing/qwen3/reasoning_parser.hpp",
"io_processing/gptoss/reasoning_parser.hpp",
"io_processing/gptoss/tool_parser.hpp",
"io_processing/gptoss/harmony.hpp",
@@ -218,7 +249,6 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
"io_processing/phi4/tool_parser.cpp",
"io_processing/devstral/tool_parser.cpp",
"io_processing/mistral/tool_parser.cpp",
"io_processing/qwen3/reasoning_parser.cpp",
"io_processing/gptoss/reasoning_parser.cpp",
"io_processing/gptoss/tool_parser.cpp",
"io_processing/gptoss/harmony.cpp",
@@ -234,6 +264,8 @@ ovms_cc_library( # TODO split further so we don't have to recompile everything w
":io_processing_base_output_parser",
":io_processing_qwen3coder_tool_parser",
":io_processing_lfm2_tool_parser",
":io_processing_gemma4_tool_parser",
":io_processing_qwen3_reasoning_parser",
":io_processing_utils",
":apis_tool_schema_wrapper",
],
67 changes: 67 additions & 0 deletions src/llm/io_processing/gemma4/gemma4_reasoning_parser.cpp
@@ -0,0 +1,67 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#include <openvino/genai/tokenizer.hpp>
#include <string>
#include <vector>

#include "src/port/rapidjson_document.hpp"

#include "../../../logging.hpp"
#include "gemma4_reasoning_parser.hpp"
#include "../utils.hpp"

namespace ovms {
void Gemma4ReasoningParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
std::string startReasoningTag = getParsingStartTags()[0];
std::string endReasoningTag = getParsingEndTag();
size_t startPos = parsedOutput.content.find(startReasoningTag);
size_t endPos = parsedOutput.content.find(endReasoningTag);

if (startPos != std::string::npos && endPos != std::string::npos && startPos < endPos) {
size_t reasoningStart = startPos + startReasoningTag.length();
std::string reasoningText = parsedOutput.content.substr(reasoningStart, endPos - reasoningStart);
parsedOutput.reasoning = reasoningText;
// Remove reasoning from content
parsedOutput.content.erase(startPos, endPos - startPos + endReasoningTag.length());
}
}

std::optional<rapidjson::Document> Gemma4ReasoningParser::parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) {
if (chunk.empty()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Received empty chunk for Gemma4ReasoningParser");
return std::nullopt;
}

if (chunk.find(getParsingStartTags()[0]) != std::string::npos || chunk.find(getParsingEndTag()) != std::string::npos) {
return std::nullopt;
} else {
rapidjson::StringBuffer buffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
writer.StartObject();
writer.String("delta");
writer.StartObject();
writer.String("reasoning_content");
writer.String(chunk.c_str());
writer.EndObject();
writer.EndObject();
rapidjson::Document doc;
doc.Parse(buffer.GetString());
return doc;
}
}
} // namespace ovms
48 changes: 48 additions & 0 deletions src/llm/io_processing/gemma4/gemma4_reasoning_parser.hpp
@@ -0,0 +1,48 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <string>
#include <vector>

#include "../base_output_parser.hpp"

namespace ovms {
class Gemma4ReasoningParser : public BaseOutputParser {
protected:
// Tags used to identify the reasoning segment in the content
std::string parsingStartTag = "<|channel>thought\n";
std::string parsingEndTag = "<channel|>";

public:
Gemma4ReasoningParser() = delete;
explicit Gemma4ReasoningParser(ov::genai::Tokenizer& tokenizer) :
BaseOutputParser(tokenizer) {}

void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override;
const std::vector<std::string>& getParsingStartTags() const override {
static const std::vector<std::string> parsingStartTags{this->parsingStartTag};
return parsingStartTags;
}
const std::vector<std::string>& getSpecialParsingStartTags() const override {
static const std::vector<std::string> specialParsingStartTags{};
return specialParsingStartTags;
}
const std::string& getParsingEndTag() const override {
return parsingEndTag;
}
};
} // namespace ovms
73 changes: 73 additions & 0 deletions src/llm/io_processing/gemma4/reasoning_parser.cpp
@@ -0,0 +1,73 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#include <openvino/genai/tokenizer.hpp>
#include <string>
#include <vector>

#include "src/port/rapidjson_document.hpp"

#include "../../../logging.hpp"
#include "reasoning_parser.hpp"
#include "../utils.hpp"

namespace ovms {
void Gemma4ReasoningParser::parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) {
auto startPos = std::string::npos;
auto endPos = std::string::npos;

auto startIt = std::find(generatedTokens.begin(), generatedTokens.end(), reasoningTokenId);
auto endIt = std::find(generatedTokens.begin(), generatedTokens.end(), reasoningEndTokenId);

if (startIt != generatedTokens.end() && endIt != generatedTokens.end() && startIt < endIt) {
startPos = std::distance(generatedTokens.begin(), startIt);
endPos = std::distance(generatedTokens.begin(), endIt);
}

if (startPos != std::string::npos && endPos != std::string::npos && startPos < endPos) {
size_t reasoningStart = startPos + 3;  // skip the 3 tokens assumed to encode "<|channel>thought\n"
Collaborator: ensure these tokens were really the ones you assumed. otherwise we might not detect issues early

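A minimal sketch of the check the reviewer asks for, reusing the surrounding names (tokenizer, generatedTokens, startPos) and the PR's assumed 3-token offset — not part of the PR itself:

    // Hedged validation sketch: decode the tokens assumed to form the start tag and make
    // sure they really spell "<|channel>thought\n" before relying on the fixed offset,
    // so a tokenizer mismatch is detected early instead of producing garbled reasoning.
    std::string decodedStartTag = tokenizer.decode(
        std::vector<int64_t>(generatedTokens.begin() + startPos, generatedTokens.begin() + startPos + 3),
        ov::genai::skip_special_tokens(false));
    if (decodedStartTag != getParsingStartTags()[0]) {
        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Gemma4ReasoningParser: unexpected start tag tokens, skipping reasoning extraction");
        return;
    }
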
std::string reasoningText = tokenizer.decode(std::vector<int64_t>(generatedTokens.begin() + reasoningStart, generatedTokens.begin() + endPos), ov::genai::skip_special_tokens(true));
parsedOutput.reasoning = reasoningText;
// Remove reasoning from content
std::string contentWithoutReasoning = tokenizer.decode(std::vector<int64_t>(generatedTokens.begin() + endPos + 1, generatedTokens.end()), ov::genai::skip_special_tokens(true)); // content MUST never appear before reasoning
parsedOutput.content = contentWithoutReasoning;
}
Collaborator: what if there was no reasoning? what will take care of content parsing?
Collaborator Author: probably tool parser will take care of that

}
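To make the exchange above concrete, a hedged usage sketch (hypothetical token ids and text, not part of the PR): when neither reasoning token is present, parse() returns without touching the fields, so content still carries the full answer for the downstream tool or content parser.

    // Hedged sketch of the no-reasoning path: no token 100/101 in the output, so the
    // parser leaves ParsedOutput untouched and downstream parsing sees the whole text.
    ovms::ParsedOutput out;
    out.content = "plain answer without a thought section";  // decoded upstream
    ovms::Gemma4ReasoningParser parser(tokenizer);            // tokenizer assumed available
    parser.parse(out, {7, 8, 9});                             // hypothetical token ids
    // out.reasoning stays empty and out.content is unchanged
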
std::optional<rapidjson::Document> Gemma4ReasoningParser::parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) {
if (chunk.empty()) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Received empty chunk for Gemma4ReasoningParser");
return std::nullopt;
}

if (chunk.find(getParsingStartTags()[0]) != std::string::npos || chunk.find(getParsingEndTag()) != std::string::npos) {
return std::nullopt;
} else {
rapidjson::StringBuffer buffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
writer.StartObject();
writer.String("delta");
writer.StartObject();
writer.String("reasoning_content");
writer.String(chunk.c_str());
writer.EndObject();
writer.EndObject();
rapidjson::Document doc;
doc.Parse(buffer.GetString());
return doc;
}
}
} // namespace ovms
56 changes: 56 additions & 0 deletions src/llm/io_processing/gemma4/reasoning_parser.hpp
@@ -0,0 +1,56 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once

#include <openvino/genai/tokenizer.hpp>
#include <vector>
#include <string>

#include "../qwen3/reasoning_parser.hpp"

namespace ovms {
class Gemma4ReasoningParser : public Qwen3ReasoningParser {
protected:
const int64_t reasoningTokenId = 100; // <|channel>
const int64_t reasoningEndTokenId = 101; // <channel|>

const std::string parsingStartTag = "<|channel>thought\n";
const std::string parsingEndTag = "<channel|>";

public:
Gemma4ReasoningParser() = delete;
explicit Gemma4ReasoningParser(ov::genai::Tokenizer& tokenizer) :
Qwen3ReasoningParser(tokenizer) {}
void parse(ParsedOutput& parsedOutput, const std::vector<int64_t>& generatedTokens) override;
Collaborator: are we missing support for parseChunk? - does it work with streaming?
Collaborator Author: I've added it now

std::optional<rapidjson::Document> parseChunk(const std::string& chunk, ov::genai::GenerationFinishReason finishReason) override;

bool requiresStreamingWithSpecialTokens() const override {
return true;
}

const std::vector<std::string>& getParsingStartTags() const override {
static const std::vector<std::string> parsingStartTags{this->parsingStartTag};
return parsingStartTags;
}
const std::vector<std::string>& getSpecialParsingStartTags() const override {
static const std::vector<std::string> specialParsingStartTags{};
return specialParsingStartTags;
}
const std::string& getParsingEndTag() const override {
return parsingEndTag;
}
};
} // namespace ovms
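On the streaming question from the review thread above, a hedged usage sketch (hypothetical chunk text, relying on the same headers this parser already pulls in): every chunk that does not itself contain one of the tags is wrapped into a delta carrying reasoning_content, while chunks containing the tags are swallowed.

    // Hedged streaming sketch, not part of the PR: feed decoded chunks to parseChunk
    // and forward the returned delta documents to the client.
    ovms::Gemma4ReasoningParser parser(tokenizer);  // tokenizer assumed available
    auto delta = parser.parseChunk("Let me think about this...", ov::genai::GenerationFinishReason::NONE);
    if (delta.has_value()) {
        // serialized form: {"delta":{"reasoning_content":"Let me think about this..."}}
        rapidjson::StringBuffer buffer;
        rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
        delta->Accept(writer);
    }
    // a chunk containing "<|channel>thought\n" or "<channel|>" returns std::nullopt and is not emitted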