35 changes: 35 additions & 0 deletions demos/audio/README.md
@@ -47,6 +47,41 @@ python export_model.py text2speech --source_model microsoft/speecht5_tts --weigh

The default configuration should work in most cases, but the parameters can be tuned via `export_model.py` script arguments. Run the script with the `--help` argument to check available parameters, and see the [T2s calculator documentation](../../docs/speech_generation/reference.md) to learn more about configuration options and limitations.

### Speaker embeddings

Instead of generating speech with the default model voice, you can create speaker embeddings with [this script](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/speech_generation/create_speaker_embedding.py):
```bash
curl --output create_speaker_embedding.py "https://raw.githubusercontent.com/openvinotoolkit/openvino.genai/refs/heads/master/samples/python/speech_generation/create_speaker_embedding.py"
# Reviewer (Collaborator): yet another link that requires a manual change; it
# wouldn't even be hit by the sed command I use, because it is genai and master.
# Do we want to keep this reference to master and risk the demos getting
# outdated, hardcode the revision, or keep the script in our repo?
python create_speaker_embedding.py
mv speaker_embedding.bin models/
```
The script records your speech for 5 seconds (you can adjust the recording duration to achieve better results) and then, using the `speechbrain/spkrec-xvect-voxceleb` model, creates a `speaker_embedding.bin` file that contains your speaker embedding.
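The embedding file is a raw binary of exactly 512 float32 values (the x-vector dimension), and the server rejects anything else. A quick sanity check in Python, assuming NumPy is available — the dummy-file step is only there to make the snippet self-contained; in practice you would check the `speaker_embedding.bin` produced above:

```python
import numpy as np

# Self-contained demo: write a dummy embedding first. In practice, skip this
# and validate the speaker_embedding.bin created by create_speaker_embedding.py.
rng = np.random.default_rng(0)
rng.standard_normal(512).astype(np.float32).tofile("speaker_embedding.bin")

# The T2S servable expects the file to hold exactly 512 float32 values.
emb = np.fromfile("speaker_embedding.bin", dtype=np.float32)
assert emb.size == 512, f"expected 512 float32 values, got {emb.size}"
```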
Next, add the speaker embedding path to the `graph.pbtxt` file of the text2speech graph:
```
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
name: "T2sExecutor"
input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
calculator: "T2sCalculator"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node_options: {
[type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
models_path: "./",
plugin_config: '{ "NUM_STREAMS": "1" }',
target_device: "CPU",
voices: [
{
name: "voice",
path: "/models/speaker_embedding.bin",
}
]
}
}
}
```
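With the voice registered in the graph, clients select it per request via the `voice` field; the calculator returns an error for voice names it does not know. A sketch of the request payload, assuming an OpenAI-style audio endpoint (the exact URL and model name depend on your deployment):

```python
import json

def build_speech_request(text: str, voice: str = "voice") -> dict:
    # `voice` must match one of the names listed under `voices` in graph.pbtxt;
    # the server rejects requests for voices that are not configured.
    return {"model": "text2speech", "input": text, "voice": voice}

payload = build_speech_request("Hello from my new voice")
print(json.dumps(payload))
# POST this JSON to the text2speech graph endpoint, e.g.:
#   requests.post("http://localhost:8000/v3/audio/speech", json=payload)
```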

### Deployment

**CPU**
11 changes: 11 additions & 0 deletions prepare_llm_models.sh
@@ -27,6 +27,7 @@ LEGACY_MODEL_FILE="1/model.bin"
EMBEDDING_MODEL="thenlper/gte-small"
RERANK_MODEL="BAAI/bge-reranker-base"
VLM_MODEL="OpenGVLab/InternVL2-1B"
TTS_MODEL="microsoft/speecht5_tts"

# Models for tools testing. Only tokenizers are downloaded.
QWEN3_MODEL="Qwen/Qwen3-8B"
@@ -78,6 +79,16 @@ if [ ! -f "$1/$FACEBOOK/chat_template.jinja" ]; then
cp src/test/llm/dummy_facebook_template.jinja "$1/$FACEBOOK/chat_template.jinja"
fi

if [ -f "$1/$TTS_MODEL/$TOKENIZER_FILE" ]; then
echo "Model file $1/$TTS_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
else
python3 demos/common/export_models/export_model.py text2speech --source_model "$TTS_MODEL" --weight-format int4 --model_repository_path $1 --vocoder microsoft/speecht5_hifigan
# Reviewer (Collaborator): what about the Windows script?
fi
if [ ! -f "$1/$TTS_MODEL/$TOKENIZER_FILE" ]; then
echo "[ERROR] Model file $1/$TTS_MODEL/$TOKENIZER_FILE does not exist."
exit 1
fi

if [ -f "$1/$VLM_MODEL/$TOKENIZER_FILE" ]; then
echo "Model file $1/$VLM_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
else
1 change: 1 addition & 0 deletions src/BUILD
@@ -2517,6 +2517,7 @@ cc_test(
"test/llm/text_streamer_test.cpp",
"test/llm/visual_language_model/complete_flow_test.cpp",
"test/llm/visual_language_model/initialization_test.cpp",
"test/audio/text2speech_test.cpp",
],
"//:disable_mediapipe" : [
"test/disabled_mediapipe_test.cpp",
9 changes: 8 additions & 1 deletion src/audio/text_to_speech/BUILD
@@ -20,6 +20,14 @@ load("//:common_settings.bzl", "ovms_cc_library")
ovms_cc_library(
name = "t2s_servable",
hdrs = ["t2s_servable.hpp"],
srcs = ["t2s_servable.cpp"],
deps = [
"//third_party:genai",
"@mediapipe//mediapipe/framework:calculator_framework",
"//src:libovmslogging",
"//src:libmodelconfigjsonparser",
"t2s_calculator_cc_proto",
],
visibility = ["//visibility:public"],
alwayslink = 1,
)
@@ -36,7 +44,6 @@ ovms_cc_library(
"//src/port:rapidjson_stringbuffer",
"//src/port:rapidjson_writer",
":t2s_servable",
"//third_party:genai",
"//src/audio:audio_utils",
"//src:libmodelconfigjsonparser",
],
15 changes: 14 additions & 1 deletion src/audio/text_to_speech/t2s_calculator.cc
@@ -103,8 +103,21 @@ class T2sCalculator : public CalculatorBase {
if (streamIt != payload.parsedJson->MemberEnd()) {
return absl::InvalidArgumentError("streaming is not supported");
}
std::optional<std::string> voiceName;
auto voiceIt = payload.parsedJson->FindMember("voice");
if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) {
voiceName = voiceIt->value.GetString();
if (pipe->voices.find(voiceName.value()) == pipe->voices.end())
return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", voiceName.value()));
}
ov::genai::Text2SpeechDecodedResults generatedSpeech;
std::unique_lock lock(pipe->ttsPipelineMutex);
auto generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());

if (voiceName.has_value()) {
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), pipe->voices[voiceName.value()]);
} else {
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
}
auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
auto speechSize = generatedSpeech.speeches[0].get_size();
ov::Tensor cpuTensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape());
9 changes: 9 additions & 0 deletions src/audio/text_to_speech/t2s_calculator.proto
@@ -31,4 +31,13 @@ message T2sCalculatorOptions {
required string models_path = 1;
optional string target_device = 2;
optional string plugin_config = 3;

message SpeakerEmbeddings {
// Speaker name.
required string name = 1;

// Path to speaker embeddings file.
required string path = 2;
}
repeated SpeakerEmbeddings voices = 4;
}
85 changes: 85 additions & 0 deletions src/audio/text_to_speech/t2s_servable.cpp
@@ -0,0 +1,85 @@
//*****************************************************************************
// Copyright 2026 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include <fstream>

#include "openvino/genai/whisper_pipeline.hpp"
#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
#include "src/audio/text_to_speech/t2s_calculator.pb.h"
#include "src/status.hpp"
#include "src/logging.hpp"
#include "src/json_parser.hpp"

#include "src/audio/text_to_speech/t2s_servable.hpp"

namespace ovms {

static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) {
std::ifstream input(file_path, std::ios::binary);
if (input.fail()) {
std::stringstream ss;
ss << "Failed to open file: " << file_path.string();
throw std::runtime_error(ss.str());
}

// Get file size
input.seekg(0, std::ios::end);
size_t buffer_size = static_cast<size_t>(input.tellg());
input.seekg(0, std::ios::beg);

// Check size is multiple of float
if (buffer_size % sizeof(float) != 0) {
throw std::runtime_error("File size is not a multiple of float size.");
}
size_t num_floats = buffer_size / sizeof(float);
if (num_floats != 512) {
// Reviewer (Collaborator): Why 512? Can we add more context for this number
// for future reference? Is it the embedding dimension?
// (512 is the x-vector embedding dimension produced by
// speechbrain/spkrec-xvect-voxceleb.)
throw std::runtime_error("File must contain speaker embedding including 512 32-bit floats.");
}

ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats});
input.read(reinterpret_cast<char*>(floats_tensor.data()), buffer_size);
if (input.fail()) {
throw std::runtime_error("Failed to read all data from file.");
}

return floats_tensor;
}

TtsServable::TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField<mediapipe::T2sCalculatorOptions_SpeakerEmbeddings>& graphVoices, const std::string& pluginConfig, const std::string& graphPath) {
auto fsModelsPath = std::filesystem::path(modelDir);
if (fsModelsPath.is_relative()) {
parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath);
} else {
parsedModelsPath = fsModelsPath;
}
ov::AnyMap config;
Status status = JsonParser::parsePluginConfig(pluginConfig, config);
if (!status.ok()) {
SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", pluginConfig);
throw std::runtime_error("Error during plugin_config option parsing");
}
ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), targetDevice, config);
for (auto voice : graphVoices) {
if (!std::filesystem::exists(voice.path()))
throw std::runtime_error{"Requested voice speaker embeddings file does not exist: " + voice.path()};
voices[voice.name()] = read_speaker_embedding(voice.path());
}
}
} // namespace ovms
43 changes: 9 additions & 34 deletions src/audio/text_to_speech/t2s_servable.hpp
@@ -13,51 +13,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#pragma once
// cSpell:ignore genai

#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
#include "src/audio/text_to_speech/t2s_calculator.pb.h"

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#pragma warning(push)
#pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#include "mediapipe/framework/calculator_graph.h"
#pragma GCC diagnostic pop
#pragma warning(pop)

#include "openvino/genai/whisper_pipeline.hpp"
#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
#include "src/audio/text_to_speech/t2s_calculator.pb.h"
#include "src/status.hpp"
#include "src/logging.hpp"
#include "src/json_parser.hpp"

namespace ovms {

struct TtsServable {
std::filesystem::path parsedModelsPath;
class TtsServable {
public:
std::shared_ptr<ov::genai::Text2SpeechPipeline> ttsPipeline;
std::unordered_map<std::string, ov::Tensor> voices;
std::mutex ttsPipelineMutex;
std::filesystem::path parsedModelsPath;

TtsServable(const mediapipe::T2sCalculatorOptions& nodeOptions, const std::string& graphPath) {
auto fsModelsPath = std::filesystem::path(nodeOptions.models_path());
if (fsModelsPath.is_relative()) {
parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath);
} else {
parsedModelsPath = fsModelsPath;
}
ov::AnyMap config;
Status status = JsonParser::parsePluginConfig(nodeOptions.plugin_config(), config);
if (!status.ok()) {
SPDLOG_ERROR("Error during llm node plugin_config option parsing to JSON: {}", nodeOptions.plugin_config());
throw std::runtime_error("Error during plugin_config option parsing");
}
ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), nodeOptions.target_device(), config);
}
TtsServable(const std::string& modelDir, const std::string& targetDevice, const google::protobuf::RepeatedPtrField<mediapipe::T2sCalculatorOptions_SpeakerEmbeddings>& graphVoices, const std::string& pluginConfig, const std::string& graphPath);
};

using TtsServableMap = std::unordered_map<std::string, std::shared_ptr<TtsServable>>;
11 changes: 8 additions & 3 deletions src/mediapipe_internal/mediapipegraphdefinition.cpp
@@ -616,9 +616,14 @@ Status MediapipeGraphDefinition::initializeNodes() {
SPDLOG_LOGGER_ERROR(modelmanager_logger, "Failed to unpack calculator options");
return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID;
}
std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions, mgconfig.getBasePath());
ttsServableMap.insert(std::pair<std::string, std::shared_ptr<TtsServable>>(nodeName, std::move(servable)));
ttsServablesCleaningGuard.disableCleaning();
try {
std::shared_ptr<TtsServable> servable = std::make_shared<TtsServable>(nodeOptions.models_path(), nodeOptions.target_device(), nodeOptions.voices(), nodeOptions.plugin_config(), mgconfig.getBasePath());
ttsServableMap.insert(std::pair<std::string, std::shared_ptr<TtsServable>>(nodeName, std::move(servable)));
ttsServablesCleaningGuard.disableCleaning();
} catch (const std::runtime_error& e) {
SPDLOG_LOGGER_ERROR(modelmanager_logger, "TextToSpeech node name: {} initialization failed: {}. ", nodeName, e.what());
return StatusCode::MEDIAPIPE_GRAPH_CONFIG_FILE_INVALID;
}
}
}
return StatusCode::OK;
10 changes: 10 additions & 0 deletions src/test/audio/config.json
@@ -0,0 +1,10 @@
{
"model_config_list": [],
"mediapipe_config_list": [
{
"name":"text2speech",
"base_path":"/ovms/src/test/audio/",
"graph_path":"/ovms/src/test/audio/graph.pbtxt"
}
]
}
38 changes: 38 additions & 0 deletions src/test/audio/graph.pbtxt
@@ -0,0 +1,38 @@
# Copyright 2026 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"

node {
name: "ttsNode1"
input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
calculator: "T2sCalculator"
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node_options: {
[type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
models_path: "/ovms/src/test/llm_testing/microsoft/speecht5_tts"
plugin_config: '{"NUM_STREAMS": "1" }',
target_device: "CPU"
voices: [
{
name: "speaker1",
path: "/ovms/src/test/audio/speaker.bin",
}
]
}
}
}
Empty file.
Binary file added src/test/audio/speaker.bin