Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions Dockerfile.redhat
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ SHELL ["/bin/bash", "-xo", "pipefail", "-c"]
ARG JOBS=40
ARG VERBOSE_LOGS=OFF
ARG LTO_ENABLE=OFF
ARG ESPEAK=1

# hadolint ignore=DL3041
RUN dnf install -y -d6 \
Expand All @@ -129,6 +130,10 @@ RUN dnf install -y -d6 \
python3.12-pip \
libicu-devel && \
dnf clean all
# Optionally install espeak-ng when the ESPEAK build arg is "1".
# First tries the espeak-ng package together with espeak-ng-libs; if that
# install fails, falls back to installing espeak-ng-libs alone. The dnf cache
# is cleaned afterwards in either case.
# NOTE(review): the statements are separated by ';', so the step's exit status
# is that of `dnf clean all` - it looks like the build continues even if both
# install attempts fail. Confirm this best-effort behaviour is intended.
RUN if [ "$ESPEAK" == "1" ] ; then \
dnf install -y espeak-ng espeak-ng-libs || dnf install -y espeak-ng-libs ; \
dnf clean all ; \
fi

WORKDIR /

Expand Down Expand Up @@ -234,11 +239,11 @@ RUN git clone https://github.com/$ov_tokenizers_org/openvino_tokenizers.git /ope
fi

WORKDIR /openvino_genai/
ARG ov_genai_branch=master
ARG ov_genai_org=openvinotoolkit
ARG ov_genai_branch=kokoro_tts
ARG ov_genai_repo=https://github.com/RyanMetcalfeInt8/openvino.genai.git
# hadolint ignore=DL3003
RUN if [ "$ov_use_binary" == "0" ]; then true ; else exit 0 ; fi ; \
git clone https://github.com/$ov_genai_org/openvino.genai /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
git clone $ov_genai_repo /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
Comment on lines 241 to +246
cmake -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DCMAKE_CXX_FLAGS=" ${SDL_OPS} ${LTO_CXX_FLAGS} " -DCMAKE_SHARED_LINKER_FLAGS="${LTO_LD_FLAGS}" -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_SYSTEM_ICU="True" -DBUILD_TOKENIZERS=OFF -DENABLE_SAMPLES=OFF -DENABLE_TOOLS=OFF -DENABLE_TESTS=OFF -DENABLE_XGRAMMAR=ON -S ./ -B ./build/ && \
Comment on lines 241 to 247
cmake --build ./build/ --parallel $JOBS && cp /openvino_genai/build/openvino_genai/lib*.so* /opt/intel/openvino/runtime/lib/intel64/ && \
cp -r /openvino_genai/src/cpp/include/* /opt/intel/openvino/runtime/include/ && \
Expand Down Expand Up @@ -393,6 +398,7 @@ LABEL "maintainer"="dariusz.trawinski@intel.com"
ARG INSTALL_RPMS_FROM_URL=
ARG INSTALL_DRIVER_VERSION="24.52.32224"
ARG GPU=0
ARG ESPEAK=1
ARG debug_bazel_flags=
LABEL bazel-build-flags=${debug_bazel_flags}
LABEL supported-devices="CPU=1 GPU=${GPU}"
Expand All @@ -407,6 +413,10 @@ COPY ./install_redhat_gpu_drivers.sh /install_gpu_drivers.sh
# hadolint ignore=DL3003,DL3041,SC2164,SC1091
RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_downloads=8\nretries=50" >> /etc/dnf/dnf.conf ; else export DNF_TOOL=microdnf ; fi ; \
$DNF_TOOL upgrade --setopt=install_weak_deps=0 --nodocs -y ; \
if [ "$ESPEAK" == "1" ] ; then \
$DNF_TOOL install -y espeak-ng espeak-ng-libs --setopt=install_weak_deps=0 --nodocs || \
$DNF_TOOL install -y espeak-ng-libs --setopt=install_weak_deps=0 --nodocs ; \
fi ; \
if [ "$GPU" == "1" ] ; then \
source /install_gpu_drivers.sh && rm -rf /install_gpu_drivers.sh; \
fi ; \
Expand Down
15 changes: 12 additions & 3 deletions Dockerfile.ubuntu
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ ENV DEBIAN_FRONTEND=noninteractive
SHELL ["/bin/bash", "-xo", "pipefail", "-c"]

ARG debug_bazel_flags="--strip=always --config=mp_on_py_on --//:distro=ubuntu"
ARG ESPEAK=1
RUN if [ "$BASE_OS" == "ubuntu24" ] ; then apt-get update && \
apt-get install -y software-properties-common --no-install-recommends; add-apt-repository 'ppa:deadsnakes/ppa' -y && \
apt-get clean && rm -rf /var/lib/apt/lists/* ; fi
Expand Down Expand Up @@ -124,6 +125,10 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
vim && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Optionally install espeak-ng when the ESPEAK build arg is "1".
# The package index is refreshed first, then the apt cache and package lists
# are removed in the same layer once the install has succeeded.
RUN if [ "$ESPEAK" == "1" ] ; then \
apt-get update && apt-get install -y --no-install-recommends espeak-ng && \
apt-get clean && rm -rf /var/lib/apt/lists/* ; \
fi
# on ubuntu 24.04 python3.12 is used as default python for ovms build and release
# TF build needs python3.10 with numpy as it does not support python3.12
RUN python3.10 -m pip install "numpy<2.0.0" --no-cache-dir
Expand Down Expand Up @@ -220,12 +225,12 @@ RUN if [ "$ov_use_binary" == "0" ]; then true ; else exit 0 ; fi ; \
if ! [[ $debug_bazel_flags == *"_py_off"* ]]; then \
cp build/python/* /opt/intel/openvino/python/openvino_tokenizers/ ; \
fi
ARG ov_genai_branch=master
ARG ov_genai_org=openvinotoolkit
ARG ov_genai_branch=kokoro_tts
ARG ov_genai_repo=https://github.com/RyanMetcalfeInt8/openvino.genai.git
WORKDIR /openvino_genai/
# hadolint ignore=DL3003
RUN if [ "$ov_use_binary" == "0" ]; then \
git clone https://github.com/$ov_genai_org/openvino.genai /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
git clone $ov_genai_repo /openvino_genai && cd /openvino_genai && git checkout $ov_genai_branch && git submodule update --init --recursive && \
Comment on lines +228 to +233
cmake -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE -DCMAKE_CXX_FLAGS=" ${SDL_OPS} " -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DENABLE_SYSTEM_ICU="True" -DBUILD_TOKENIZERS=OFF -DENABLE_SAMPLES=OFF -DENABLE_TOOLS=OFF -DENABLE_TESTS=OFF -DENABLE_XGRAMMAR=ON -S ./ -B ./build/ && \
Comment on lines +228 to 234
cmake --build ./build/ --parallel $JOBS && cp /openvino_genai/build/openvino_genai/lib*.so* /opt/intel/openvino/runtime/lib/intel64/ && \
cp -r /openvino_genai/src/cpp/include/* /opt/intel/openvino/runtime/include/ && \
Expand Down Expand Up @@ -395,6 +400,7 @@ ARG INSTALL_RPMS_FROM_URL=
ARG INSTALL_DRIVER_VERSION="24.26.30049"
ARG GPU=0
ARG NPU=0
ARG ESPEAK=1
ENV DEBIAN_FRONTEND=noninteractive
ARG debug_bazel_flags=
LABEL bazel-build-flags=${debug_bazel_flags}
Expand All @@ -413,6 +419,9 @@ COPY ./install_ubuntu_gpu_drivers.sh /tmp/install_gpu_drivers.sh
# hadolint ignore=DL3003,SC2164
RUN apt-get update ; \
apt-get install -y --no-install-recommends curl ca-certificates libxml2 || exit 1; \
if [ "$ESPEAK" == "1" ] ; then \
apt-get install -y --no-install-recommends espeak-ng || exit 1; \
fi ; \
if [ "$GPU" == "1" ] ; then \
/tmp/install_gpu_drivers.sh ; \
fi ; \
Expand Down
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ BUILD_TESTS ?= 0
RUN_GPU_TESTS ?=
GPU ?= 0
NPU ?= 0
ESPEAK ?= 1
BUILD_NGINX ?= 0
MEDIAPIPE_DISABLE ?= 0
PYTHON_DISABLE ?= 0
Expand Down Expand Up @@ -237,6 +238,7 @@ BUILD_ARGS = --build-arg http_proxy=$(HTTP_PROXY)\
--build-arg BASE_OS=$(BASE_OS)\
--build-arg INSTALL_RPMS_FROM_URL=$(INSTALL_RPMS_FROM_URL)\
--build-arg INSTALL_DRIVER_VERSION=$(INSTALL_DRIVER_VERSION)\
--build-arg ESPEAK=$(ESPEAK)\
--build-arg RELEASE_BASE_IMAGE=$(BASE_IMAGE_RELEASE)\
--build-arg JOBS=$(JOBS)\
--build-arg CAPI_FLAGS=$(CAPI_FLAGS)\
Expand Down
33 changes: 32 additions & 1 deletion src/audio/audio_utils.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
// Copyright 2026 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -22,6 +22,7 @@
#include "src/logging.hpp"
#include <string>
#include <vector>
#include <cmath>
#include <random>
#include <algorithm>
#pragma warning(push)
Expand Down Expand Up @@ -188,3 +189,33 @@ void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample
auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
}

void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr) {
enum : unsigned int {
OUTPUT_PREPARATION,
TIMER_END
};
Timer<TIMER_END> timer;
timer.start(OUTPUT_PREPARATION);

drwav_data_format format;
format.container = drwav_container_riff;
format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
format.channels = 1;
format.sampleRate = 24000; // Kokoro native sample rate
format.bitsPerSample = 32;
Comment on lines +201 to +206
drwav wav;

auto status = drwav_init_memory_write(&wav, ppData, &pDataSize, &format, nullptr);
if (status == DRWAV_FALSE) {
throw std::runtime_error("Failed to initialize WAV writer");
}
drwav_uint64 framesWritten = drwav_write_pcm_frames(&wav, speechSize, waveformPtr);
if (framesWritten != speechSize) {
throw std::runtime_error("Failed to write all frames");
}
drwav_uninit(&wav);
timer.stop(OUTPUT_PREPARATION);
auto outputPreparationTime = (timer.elapsed<std::chrono::microseconds>(OUTPUT_PREPARATION)) / 1000;
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "Output preparation time: {} ms", outputPreparationTime);
}
1 change: 1 addition & 0 deletions src/audio/audio_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ bool isWavBuffer(const std::string buf);
std::vector<float> readWav(const std::string_view& wavData);
std::vector<float> readMp3(const std::string_view& mp3Data);
void prepareAudioOutput(void** ppData, size_t& pDataSize, uint16_t bitsPerSample, size_t speechSize, const float* waveformPtr);
void prepareAudioOutputKokoro(void** ppData, size_t& pDataSize, size_t speechSize, const float* waveformPtr);
1 change: 0 additions & 1 deletion src/audio/speech_to_text/s2t_servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ namespace ovms {
namespace {
constexpr size_t ISO_LANG_CODE_MAX = 3;
}

SttServable::SttServable(const ::mediapipe::S2tCalculatorOptions& nodeOptions, const std::string& graphPath) {
auto fsModelsPath = std::filesystem::path(nodeOptions.models_path());
if (fsModelsPath.is_relative()) {
Expand Down
1 change: 1 addition & 0 deletions src/audio/text_to_speech/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ ovms_cc_library(
srcs = ["t2s_calculator.cc",
"tts_node_initializer.cpp"],
deps = [
"//third_party:genai",
"@mediapipe//mediapipe/framework:calculator_framework",
"//src:httppayload",
"//src:libovmslogging",
Expand Down
58 changes: 48 additions & 10 deletions src/audio/text_to_speech/t2s_calculator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include "src/client_connection.hpp"
#include "src/http_payload.hpp"
#include "src/logging.hpp"
#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
#include "openvino/openvino.hpp"
#include <mutex>
#include <thread>

Expand Down Expand Up @@ -63,6 +65,8 @@ static absl::Status checkClientDisconnected(const ovms::HttpPayload& payload, co
class T2sCalculator : public CalculatorBase {
static const std::string INPUT_TAG_NAME;
static const std::string OUTPUT_TAG_NAME;
std::string defaultLanguage = "en-us";
float defaultSpeed = 1.0f;

public:
static absl::Status GetContract(CalculatorContract* cc) {
Expand All @@ -81,6 +85,13 @@ class T2sCalculator : public CalculatorBase {

absl::Status Open(CalculatorContext* cc) final {
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "T2sCalculator [Node: {}] Open start", cc->NodeName());
const auto& options = cc->Options<mediapipe::T2sCalculatorOptions>();
if (options.has_language() && !options.language().empty()) {
defaultLanguage = options.language();
}
if (options.has_speed()) {
defaultSpeed = options.speed();
}
return absl::OkStatus();
}

Expand Down Expand Up @@ -113,26 +124,49 @@ class T2sCalculator : public CalculatorBase {
if (streamIt != payload.parsedJson->MemberEnd()) {
return absl::InvalidArgumentError("streaming is not supported");
}
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "1");
std::optional<std::string> voiceName;
auto voiceIt = payload.parsedJson->FindMember("voice");
if (voiceIt != payload.parsedJson->MemberEnd() && voiceIt->value.IsString()) {
if (voiceIt != payload.parsedJson->MemberEnd()) {
if (!voiceIt->value.IsString()) {
return absl::InvalidArgumentError("voice field is not a string");
}
voiceName = voiceIt->value.GetString();
if (pipe->voices.find(voiceName.value()) == pipe->voices.end())
return absl::InvalidArgumentError(absl::StrCat("Requested voice not available: ", voiceName.value()));
}

std::string language = defaultLanguage;
auto languageIt = payload.parsedJson->FindMember("language");
if (languageIt != payload.parsedJson->MemberEnd()) {
if (!languageIt->value.IsString()) {
return absl::InvalidArgumentError("language field is not a string");
}
language = languageIt->value.GetString();
}
float speed = defaultSpeed;
auto speedIt = payload.parsedJson->FindMember("speed");
if (speedIt != payload.parsedJson->MemberEnd()) {
if (!speedIt->value.IsNumber()) {
return absl::InvalidArgumentError("speed field is not a number");
}
speed = speedIt->value.GetFloat();
}
ov::genai::Text2SpeechDecodedResults generatedSpeech;
std::unique_lock lock(pipe->ttsPipelineMutex);
auto disconnectStatus = checkClientDisconnected(payload, cc->NodeName(), "before generation");
if (!disconnectStatus.ok())
return disconnectStatus;

ov::Tensor speakerEmbedding;
std::string selectedVoice = "af_alloy";
if (voiceName.has_value()) {
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), pipe->voices[voiceName.value()]);
} else {
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString());
selectedVoice = voiceName.value();
Comment on lines +157 to +160
auto speakerIt = pipe->voices.find(selectedVoice);
if (speakerIt != pipe->voices.end()) {
speakerEmbedding = speakerIt->second;
}
Comment on lines +162 to +164
}
auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
ov::AnyMap properties{{"voice", selectedVoice}, {"language", language}, {"speed", speed}};
generatedSpeech = pipe->ttsPipeline->generate(inputIt->value.GetString(), speakerEmbedding, properties);
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "3");
//auto bitsPerSample = generatedSpeech.speeches[0].get_element_type().bitwidth();
auto speechSize = generatedSpeech.speeches[0].get_size();
ov::Tensor cpuTensor(generatedSpeech.speeches[0].get_element_type(), generatedSpeech.speeches[0].get_shape());
// copy results to release inference request
Expand All @@ -143,14 +177,18 @@ class T2sCalculator : public CalculatorBase {
return disconnectStatus;
void* ppData;
size_t pDataSize;
prepareAudioOutput(&ppData, pDataSize, bitsPerSample, speechSize, cpuTensor.data<const float>());
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "4");
prepareAudioOutputKokoro(&ppData, pDataSize, speechSize, cpuTensor.data<const float>());
SPDLOG_LOGGER_DEBUG(t2s_calculator_logger, "5");
Comment on lines 169 to +182
output = std::make_unique<std::string>(reinterpret_cast<char*>(ppData), pDataSize);
drwav_free(ppData, NULL);
} else {
return absl::InvalidArgumentError(absl::StrCat("Unsupported URI: ", payload.uri));
}
} catch (ov::AssertFailure& e) {
return absl::InvalidArgumentError(e.what());
}catch (std::runtime_error& e) {
return absl::InvalidArgumentError(e.what());
} catch (...) {
return absl::InvalidArgumentError("Response generation failed");
}
Expand Down
2 changes: 2 additions & 0 deletions src/audio/text_to_speech/t2s_calculator.proto
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@ message T2sCalculatorOptions {
required string path = 2;
}
repeated SpeakerEmbeddings voices = 4;
optional string language = 5 [default = "en-us"];
optional float speed = 6 [default = 1.0];
}
27 changes: 20 additions & 7 deletions src/audio/text_to_speech/t2s_servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
#include <unordered_map>
#include <vector>
#include <fstream>
#include <sstream>

#include "openvino/genai/whisper_pipeline.hpp"
#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
#include "src/audio/text_to_speech/t2s_calculator.pb.h"
#include "src/status.hpp"
Expand All @@ -31,7 +31,15 @@

namespace ovms {

static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path) {
// Returns the total number of elements described by `shape`, i.e. the product
// of all its dimensions. A shape with no dimensions yields 1.
static size_t getShapeElementsCount(const ov::Shape& shape) {
    size_t product = 1;
    for (auto dimIt = shape.begin(); dimIt != shape.end(); ++dimIt) {
        product = product * (*dimIt);
    }
    return product;
}

static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path, const ov::Shape& expectedShape) {
std::ifstream input(file_path, std::ios::binary);
if (input.fail()) {
std::stringstream ss;
Expand All @@ -48,12 +56,16 @@ static ov::Tensor read_speaker_embedding(const std::filesystem::path& file_path)
if (buffer_size % sizeof(float) != 0) {
throw std::runtime_error("File size is not a multiple of float size.");
}
size_t num_floats = buffer_size / sizeof(float);
if (num_floats != 512) {
throw std::runtime_error("File must contain speaker embedding including 512 32-bit floats.");
const size_t numFloats = buffer_size / sizeof(float);
const size_t expectedElements = getShapeElementsCount(expectedShape);
if (numFloats != expectedElements) {
std::stringstream ss;
ss << "File must contain speaker embedding with " << expectedElements
<< " 32-bit floats. Got: " << numFloats;
throw std::runtime_error(ss.str());
}

ov::Tensor floats_tensor(ov::element::f32, ov::Shape{1, num_floats});
ov::Tensor floats_tensor(ov::element::f32, expectedShape);
input.read(reinterpret_cast<char*>(floats_tensor.data()), buffer_size);
if (input.fail()) {
throw std::runtime_error("Failed to read all data from file.");
Expand All @@ -76,10 +88,11 @@ TtsServable::TtsServable(const std::string& modelDir, const std::string& targetD
throw std::runtime_error("Error during plugin_config option parsing");
}
ttsPipeline = std::make_shared<ov::genai::Text2SpeechPipeline>(parsedModelsPath.string(), targetDevice, config);
const ov::Shape speakerEmbeddingShape = ttsPipeline->get_speaker_embedding_shape();
for (auto voice : graphVoices) {
if (!std::filesystem::exists(voice.path()))
throw std::runtime_error{"Requested voice speaker embeddings file does not exist: " + voice.path()};
voices[voice.name()] = read_speaker_embedding(voice.path());
voices[voice.name()] = read_speaker_embedding(voice.path(), speakerEmbeddingShape);
}
}
} // namespace ovms
10 changes: 8 additions & 2 deletions src/audio/text_to_speech/t2s_servable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,21 @@

#pragma once

#include "openvino/genai/speech_generation/text2speech_pipeline.hpp"
#include "src/audio/text_to_speech/t2s_calculator.pb.h"

#include <filesystem>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

namespace ovms {
#include "openvino/runtime/tensor.hpp"

namespace ov::genai {
class Text2SpeechPipeline;
}

namespace ovms {
class TtsServable {
public:
std::shared_ptr<ov::genai::Text2SpeechPipeline> ttsPipeline;
Expand Down
Loading