openvinotoolkit · michalkulakowski · Mar 16, 2026
diff --git a/Dockerfile.redhat b/Dockerfile.redhat
@@ -109,6 +109,7 @@ SHELL ["/bin/bash", "-xo", "pipefail", "-c"]
 ARG JOBS=40
 ARG VERBOSE_LOGS=OFF
 ARG LTO_ENABLE=OFF
+ARG ESPEAK=1
 
 # hadolint ignore=DL3041
 RUN dnf install -y -d6 \
@@ -129,6 +130,10 @@ RUN dnf install -y -d6 \
             python3.12-pip \
             libicu-devel && \
             dnf clean all
+# Optional eSpeak NG install (ESPEAK=1); the build fails if neither package set installs.
+# hadolint ignore=DL3041
+RUN if [ "$ESPEAK" == "1" ] ; then \
+    { dnf install -y espeak-ng espeak-ng-libs || dnf install -y espeak-ng-libs ; } && dnf clean all ; fi
 
 WORKDIR /
 
@@ -234,11 +239,11 @@ RUN git clone https://github.com/$ov_tokenizers_org/openvino_tokenizers.git /ope
     fi
 
 WORKDIR /openvino_genai/
-ARG ov_genai_branch=master
-ARG ov_genai_org=openvinotoolkit
+ARG ov_genai_branch=kokoro_tts
+ARG ov_genai_repo=https://github.com/RyanMetcalfeInt8/openvino.genai.git
 # hadolint ignore=DL3003
 RUN if [ "$ov_use_binary" == "0" ]; then true ; else exit 0 ; fi ; \
-    git clone https://github.com/$ov_genai_org/openvino.genai /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
+    git clone $ov_genai_repo /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
     cmake -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DCMAKE_CXX_FLAGS=" ${SDL_OPS} ${LTO_CXX_FLAGS} " -DCMAKE_SHARED_LINKER_FLAGS="${LTO_LD_FLAGS}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_SYSTEM_ICU="True" -DBUILD_TOKENIZERS=OFF -DENABLE_SAMPLES=OFF -DENABLE_TOOLS=OFF -DENABLE_TESTS=OFF -DENABLE_XGRAMMAR=ON -S ./ -B ./build/ && \
     cmake --build ./build/ --parallel $JOBS && cp /openvino_genai/build/openvino_genai/lib*.so* /opt/intel/openvino/runtime/lib/intel64/ && \
     cp -r /openvino_genai/src/cpp/include/* /opt/intel/openvino/runtime/include/ && \
@@ -393,6 +398,7 @@ LABEL "maintainer"="dariusz.trawinski@intel.com"
 ARG INSTALL_RPMS_FROM_URL=
 ARG INSTALL_DRIVER_VERSION="24.52.32224"
 ARG GPU=0
+ARG ESPEAK=1
 ARG debug_bazel_flags=
 LABEL bazel-build-flags=${debug_bazel_flags}
 LABEL supported-devices="CPU=1 GPU=${GPU}"
@@ -407,6 +413,10 @@ COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh
 # hadolint ignore=DL3003,DL3041,SC2164,SC1091
 RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_downloads=8\nretries=50" >> /etc/dnf/dnf.conf ; else export DNF_TOOL=microdnf ; fi ; \
     $DNF_TOOL upgrade --setopt=install_weak_deps=0 --nodocs -y ; \
+    if [ "$ESPEAK" == "1" ] ; then \
+        $DNF_TOOL install -y espeak-ng espeak-ng-libs --setopt=install_weak_deps=0 --nodocs || \
+        $DNF_TOOL install -y espeak-ng-libs --setopt=install_weak_deps=0 --nodocs ; \
+    fi ; \
     if [ "$GPU" == "1" ] ; then \
         source /install_gpu_drivers.sh && rm -rf /install_gpu_drivers.sh; \
     fi ; \

diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu
@@ -95,6 +95,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 SHELL ["/bin/bash", "-xo", "pipefail", "-c"]
 
 ARG debug_bazel_flags="--strip=always  --config=mp_on_py_on --//:distro=ubuntu"
+ARG ESPEAK=1
 RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \
     apt-get install -y software-properties-common --no-install-recommends; add-apt-repository 'ppa:deadsnakes/ppa' -y && \
     apt-get clean && rm -rf /var/lib/apt/lists/* ; fi
@@ -124,6 +125,10 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
             vim && \
             apt-get clean && \
             rm -rf /var/lib/apt/lists/*
+# Optional eSpeak NG install, controlled by the ESPEAK build argument (default 1).
+RUN if [ "$ESPEAK" == "1" ] ; then apt-get update && \
+    apt-get install -y --no-install-recommends espeak-ng && \
+    apt-get clean && rm -rf /var/lib/apt/lists/* ; fi
 # on ubuntu 24.04 python3.12 is used as default python for ovms build and release
 # TF build needs python3.10 with numpy as it does not support python3.12
 RUN python3.10 -m pip install "numpy<2.0.0" --no-cache-dir
@@ -220,12 +225,12 @@ RUN if [ "$ov_use_binary" == "0" ]; then true ; else exit 0 ; fi ; \
     if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then \
     cp build/python/* /opt/intel/openvino/python/openvino_tokenizers/ ; \
     fi
-ARG ov_genai_branch=master
-ARG ov_genai_org=openvinotoolkit
+ARG ov_genai_branch=kokoro_tts
+ARG ov_genai_repo=https://github.com/RyanMetcalfeInt8/openvino.genai.git
 WORKDIR /openvino_genai/
 # hadolint ignore=DL3003
 RUN if [ "$ov_use_binary" == "0" ]; then \
-    git clone https://github.com/$ov_genai_org/openvino.genai /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
+    git clone $ov_genai_repo /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
     cmake -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DCMAKE_CXX_FLAGS=" ${SDL_OPS} " -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_SYSTEM_ICU="True" -DBUILD_TOKENIZERS=OFF -DENABLE_SAMPLES=OFF -DENABLE_TOOLS=OFF -DENABLE_TESTS=OFF -DENABLE_XGRAMMAR=ON -S ./ -B ./build/ && \
     cmake --build ./build/ --parallel $JOBS && cp /openvino_genai/build/openvino_genai/lib*.so* /opt/intel/openvino/runtime/lib/intel64/ && \
     cp -r /openvino_genai/src/cpp/include/* /opt/intel/openvino/runtime/include/ && \
@@ -395,6 +400,7 @@ ARG INSTALL_RPMS_FROM_URL=
 ARG INSTALL_DRIVER_VERSION="24.26.30049"
 ARG GPU=0
 ARG NPU=0
+ARG ESPEAK=1
 ENV DEBIAN_FRONTEND=noninteractive
 ARG debug_bazel_flags=
 LABEL bazel-build-flags=${debug_bazel_flags}
@@ -413,6 +419,9 @@ COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh
 # hadolint ignore=DL3003,SC2164
 RUN apt-get update ; \
     apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \
+    if [ "$ESPEAK" == "1" ] ; then \
+	apt-get install -y --no-install-recommends espeak-ng || exit 1; \
+    fi ; \
     if [ "$GPU" == "1" ] ; then \
 	/tmp/install_gpu_drivers.sh ; \
     fi ; \

diff --git a/Makefile b/Makefile
@@ -61,6 +61,7 @@ BUILD_TESTS ?= 0
 RUN_GPU_TESTS ?=
 GPU ?= 0
 NPU ?= 0
+ESPEAK ?= 1
 BUILD_NGINX ?= 0
 MEDIAPIPE_DISABLE ?= 0
 PYTHON_DISABLE ?= 0
@@ -237,6 +238,7 @@ BUILD_ARGS = --build-arg http_proxy=$(HTTP_PROXY)\
 	--build-arg BASE_OS=$(BASE_OS)\
 	--build-arg INSTALL_RPMS_FROM_URL=$(INSTALL_RPMS_FROM_URL)\
 	--build-arg INSTALL_DRIVER_VERSION=$(INSTALL_DRIVER_VERSION)\
+	--build-arg ESPEAK=$(ESPEAK)\
 	--build-arg RELEASE_BASE_IMAGE=$(BASE_IMAGE_RELEASE)\
 	--build-arg JOBS=$(JOBS)\
 	--build-arg CAPI_FLAGS=$(CAPI_FLAGS)\

diff --git a/src/audio/audio_utils.cpp b/src/audio/audio_utils.cpp
@@ -1,5 +1,5 @@
 //*****************************************************************************
-// Copyright 2025 Intel Corporation
+// Copyright 2026 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "src/logging.hpp"
 #include <string>
 #include <vector>
+#include <cmath>
 #include <random>
 #include <algorithm>
 #pragma warning(push)
@@ -188,3 +189,57 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample
     auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
     SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
 }
+
+// Serializes mono 32-bit float samples into an in-memory RIFF/WAV buffer at 24 kHz.
+//   ppData      - out: receives the buffer allocated by dr_wav; release it with drwav_free().
+//   pDataSize   - out: receives the size of that buffer in bytes.
+//   speechSize  - number of PCM frames to write (one float per frame, single channel).
+//   waveformPtr - source samples; must hold at least speechSize floats.
+// Throws std::runtime_error if the writer cannot be initialized or a short write occurs.
+// On failure the partially built buffer is freed here and *ppData is reset to nullptr.
+void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr) {
+    enum : unsigned int {
+        OUTPUT_PREPARATION,
+        TIMER_END
+    };
+    if (ppData == nullptr) {
+        throw std::runtime_error("Failed to initialize WAV writer");
+    }
+    Timer<TIMER_END> timer;
+    timer.start(OUTPUT_PREPARATION);
+
+    // Start from a defined state so the failure path below can release the buffer safely.
+    *ppData = nullptr;
+    pDataSize = 0;
+
+    drwav_data_format format{};
+    format.container = drwav_container_riff;
+    format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
+    format.channels = 1;
+    format.sampleRate = 24000;  // Kokoro native sample rate
+    format.bitsPerSample = 32;
+    drwav wav;
+
+    // An exception skips the caller's drwav_free(), so release the buffer before throwing.
+    const auto releaseBuffer = [&]() {
+        drwav_free(*ppData, nullptr);
+        *ppData = nullptr;
+        pDataSize = 0;
+    };
+
+    auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr);
+    if (status == DRWAV_FALSE) {
+        releaseBuffer();
+        throw std::runtime_error("Failed to initialize WAV writer");
+    }
+    drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, speechSize, waveformPtr);
+    // Close the writer unconditionally so it is not left initialized on a short write.
+    drwav_uninit(&wav);
+    if (framesWritten != speechSize) {
+        releaseBuffer();
+        throw std::runtime_error("Failed to write all frames");
+    }
+    timer.stop(OUTPUT_PREPARATION);
+    auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
+    SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
+}
diff --git a/src/audio/audio_utils.hpp b/src/audio/audio_utils.hpp
@@ -25,3 +25,4 @@ bool isWavBuffer(const std::string buf);
 std::vector<float> readWav(const std::string_view& wavData);
 std::vector<float> readMp3(const std::string_view& mp3Data);
 void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr);
+void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr);
diff --git a/src/audio/speech_to_text/s2t_servable.cpp b/src/audio/speech_to_text/s2t_servable.cpp
@@ -35,7 +35,6 @@ namespace ovms {
 namespace {
 constexpr size_t ISO_LANG_CODE_MAX = 3;
 }
-
 SttServable::SttServable(const ::mediapipe::S2tCalculatorOptions& nodeOptions, const std::string& graphPath) {
     auto fsModelsPath = std::filesystem::path(nodeOptions.models_path());
     if (fsModelsPath.is_relative()) {

diff --git a/src/audio/text_to_speech/BUILD b/src/audio/text_to_speech/BUILD
@@ -37,6 +37,7 @@ ovms_cc_library(
     srcs = ["t2s_calculator.cc",
             "tts_node_initializer.cpp"],
     deps = [
+        "//third_party:genai",
         "@mediapipe//mediapipe/framework:calculator_framework",
         "//src:httppayload",
         "//src:libovmslogging",

diff --git a/src/audio/text_to_speech/t2s_calculator.cc b/src/audio/text_to_speech/t2s_calculator.cc
@@ -28,6 +28,8 @@
 #include "src/client_connection.hpp"
 #include "src/http_payload.hpp"
 #include "src/logging.hpp"
+#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
+#include "openvino/openvino.hpp"
 #include <mutex>
 #include <thread>
 
@@ -63,6 +65,8 @@ static absl::Status checkClientDisconnected(const ovms::HttpPayload& payload, co
 class T2sCalculator : public CalculatorBase {
     static const std::string INPUT_TAG_NAME;
     static const std::string OUTPUT_TAG_NAME;
+    std::string defaultLanguage = "en-us";
+    float defaultSpeed = 1.0f;
 
 public:
     static absl::Status GetContract(CalculatorContract* cc) {
@@ -81,6 +85,13 @@ class T2sCalculator : public CalculatorBase {
 
     absl::Status Open(CalculatorContext* cc) final {
         SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "T2sCalculator  [Node: {}] Open start", cc->NodeName());
+        const auto& options = cc->Options<mediapipe::T2sCalculatorOptions>();  // calculator options for this node
+        if (options.has_language() && !options.language().empty()) {  // an unset or empty language keeps the member's initial value
+            defaultLanguage = options.language();
+        }
+        if (options.has_speed()) {  // only an explicitly set speed replaces the member's initial value
+            defaultSpeed = options.speed();
+        }
         return absl::OkStatus();
     }
 
@@ -113,26 +124,49 @@ class T2sCalculator : public CalculatorBase {
                 if (streamIt != payload.parsedJson->MemberEnd()) {
                     return absl::InvalidArgumentError("streaming is not supported");
                 }
+                SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "1");
                 std::optional<std::string> voiceName;
                 auto voiceIt = payload.parsedJson->FindMember("voice");
-                if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) {
+                if (voiceIt != payload.parsedJson->MemberEnd()) {
+                    if (!voiceIt->value.IsString()) {
+                        return absl::InvalidArgumentError("voice field is not a string");
+                    }
                     voiceName = voiceIt->value.GetString();
-                    if (pipe->voices.find(voiceName.value()) == pipe->voices.end())
-                        return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", voiceName.value()));
                 }
-
+                std::string language = defaultLanguage;
+                auto languageIt = payload.parsedJson->FindMember("language");
+                if (languageIt != payload.parsedJson->MemberEnd()) {
+                    if (!languageIt->value.IsString()) {
+                        return absl::InvalidArgumentError("language field is not a string");
+                    }
+                    language = languageIt->value.GetString();
+                }
+                float speed = defaultSpeed;
+                auto speedIt = payload.parsedJson->FindMember("speed");
+                if (speedIt != payload.parsedJson->MemberEnd()) {
+                    if (!speedIt->value.IsNumber()) {
+                        return absl::InvalidArgumentError("speed field is not a number");
+                    }
+                    speed = speedIt->value.GetFloat();
+                }
                 ov::genai::Text2SpeechDecodedResults generatedSpeech;
                 std::unique_lock lock(pipe->ttsPipelineMutex);
                 auto disconnectStatus = checkClientDisconnected(payload, cc->NodeName(), "before generation");
                 if (!disconnectStatus.ok())
                     return disconnectStatus;
-
+                ov::Tensor speakerEmbedding;
+                std::string selectedVoice = "af_alloy";
                 if (voiceName.has_value()) {
-                    generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), pipe->voices[voiceName.value()]);
-                } else {
-                    generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
+                    selectedVoice = voiceName.value();
+                    auto speakerIt = pipe->voices.find(selectedVoice);
+                    if (speakerIt != pipe->voices.end()) {
+                        speakerEmbedding = speakerIt->second;
+                    }
                 }
-                auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
+                ov::AnyMap properties{{"voice", selectedVoice}, {"language", language}, {"speed", speed}};
+                generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding, properties);
+                SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "3");
+                //auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
                 auto speechSize = generatedSpeech.speeches[0].get_size();
                 ov::Tensor cpuTensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape());
                 // copy results to release inference request
@@ -143,14 +177,18 @@ class T2sCalculator : public CalculatorBase {
                     return disconnectStatus;
                 void* ppData;
                 size_t pDataSize;
-                prepareAudioOutput(&ppData, pDataSize, bitsPerSample, speechSize, cpuTensor.data<const float>());
+                SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "4");
+                prepareAudioOutputKokoro(&ppData, pDataSize, speechSize, cpuTensor.data<const float>());
+                SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "5");
                 output = std::make_unique<std::string>(reinterpret_cast<char*>(ppData), pDataSize);
                 drwav_free(ppData, NULL);
             } else {
                 return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri));
             }
         } catch (ov::AssertFailure& e) {
             return absl::InvalidArgumentError(e.what());
+        }catch (std::runtime_error& e) {
+            return absl::InvalidArgumentError(e.what());
         } catch (...) {
             return absl::InvalidArgumentError("Response generation failed");
         }

diff --git a/src/audio/text_to_speech/t2s_calculator.proto b/src/audio/text_to_speech/t2s_calculator.proto
@@ -40,4 +40,6 @@ message T2sCalculatorOptions {
       required string path = 2;
     }
     repeated SpeakerEmbeddings voices = 4;
+    optional string language = 5 [default = "en-us"];
+    optional float speed = 6 [default = 1.0];
 }
diff --git a/src/audio/text_to_speech/t2s_servable.cpp b/src/audio/text_to_speech/t2s_servable.cpp
@@ -19,8 +19,8 @@
 #include <unordered_map>
 #include <vector>
 #include <fstream>
+#include <sstream>
 
-#include "openvino/genai/whisper_pipeline.hpp"
 #include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
 #include "src/audio/text_to_speech/t2s_calculator.pb.h"
 #include "src/status.hpp"
@@ -31,7 +31,15 @@
 
 namespace ovms {
 
-static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) {
+// Returns the product of all dimensions of `shape` (1 when the shape has no dimensions).
+static size_t getShapeElementsCount(const ov::Shape& shape) {
+    size_t total = 1;
+    for (auto dimIt = shape.begin(); dimIt != shape.end(); ++dimIt)
+        total *= *dimIt;
+    return total;
+}
+
+static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path, const ov::Shape& expectedShape) {
     std::ifstream input(file_path, std::ios::binary);
     if (input.fail()) {
         std::stringstream ss;
@@ -48,12 +56,16 @@ static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path)
     if (buffer_size % sizeof(float) != 0) {
         throw std::runtime_error("File size is not a multiple of float size.");
     }
-    size_t num_floats = buffer_size / sizeof(float);
-    if (num_floats != 512) {
-        throw std::runtime_error("File must contain speaker embedding including 512 32-bit floats.");
+    const size_t numFloats = buffer_size / sizeof(float);
+    const size_t expectedElements = getShapeElementsCount(expectedShape);
+    if (numFloats != expectedElements) {
+        std::stringstream ss;
+        ss << "File must contain speaker embedding with " << expectedElements
+           << " 32-bit floats. Got: " << numFloats;
+        throw std::runtime_error(ss.str());
     }
 
-    ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats});
+    ov::Tensor floats_tensor(ov::element::f32, expectedShape);
     input.read(reinterpret_cast<char*>(floats_tensor.data()), buffer_size);
     if (input.fail()) {
         throw std::runtime_error("Failed to read all data from file.");
@@ -76,10 +88,11 @@ TtsServable::TtsServable(const std::string& modelDir, const std::string& targetD
         throw std::runtime_error("Error during plugin_config option parsing");
     }
     ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), targetDevice, config);
+    const ov::Shape speakerEmbeddingShape = ttsPipeline->get_speaker_embedding_shape();
     for (auto voice : graphVoices) {
         if (!std::filesystem::exists(voice.path()))
             throw std::runtime_error{"Requested voice speaker embeddings file does not exist: " + voice.path()};
-        voices[voice.name()] = read_speaker_embedding(voice.path());
+        voices[voice.name()] = read_speaker_embedding(voice.path(), speakerEmbeddingShape);
     }
 }
 }  // namespace ovms
diff --git a/src/audio/text_to_speech/t2s_servable.hpp b/src/audio/text_to_speech/t2s_servable.hpp
@@ -16,15 +16,21 @@
 
 #pragma once
 
-#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
 #include "src/audio/text_to_speech/t2s_calculator.pb.h"
 
+#include <filesystem>
 #include <memory>
+#include <mutex>
 #include <string>
 #include <unordered_map>
 
-namespace ovms {
+#include "openvino/runtime/tensor.hpp"
+
+namespace ov::genai {
+class Text2SpeechPipeline;
+}
 
+namespace ovms {
 class TtsServable {
 public:
     std::shared_ptr<ov::genai::Text2SpeechPipeline> ttsPipeline;