1 change: 1 addition & 0 deletions README.md
@@ -232,6 +232,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
- **OpenAI**: Uses OpenAI's TTS model with the 'fable' voice.
- **Deepgram**: Uses Deepgram's TTS model with the 'aura-angus-en' voice.
- **ElevenLabs**: Uses ElevenLabs' TTS model with the 'Paul J.' voice.
- **Google Gemini**: Uses Google's Gemini 2.0 Flash model with the 'Aoede' voice for text-to-speech generation.
- **Local**: Placeholder for a local TTS model.

## Detailed Module Descriptions 📘
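
With the new engine listed above, invoking it through the project's own helper looks roughly like the following sketch; the sample text and output path are illustrative, and the parameter names are taken from the text_to_speech() signature changed later in this diff.

import os

from voice_assistant.text_to_speech import text_to_speech

# Resolve the key the same way config.py does; 'output.mp3' matches the
# format run_voice_assistant.py now picks for the 'gemini' engine.
text_to_speech(
    model="gemini",
    api_key=os.getenv("GOOGLE_API_KEY"),
    text="Hello from the voice assistant!",
    output_file_path="output.mp3",
)
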
3 changes: 2 additions & 1 deletion requirements.txt
@@ -34,4 +34,5 @@ sounddevice
cartesia
soundfile
ollama
pydub
pydub
google-generativeai
2 changes: 1 addition & 1 deletion run_voice_assistant.py
@@ -65,7 +65,7 @@ def main():
chat_history.append({"role": "assistant", "content": response_text})

# Determine the output file format based on the TTS model
if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia':
if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia' or Config.TTS_MODEL == 'gemini':
output_file = 'output.mp3'
else:
output_file = 'output.wav'
3 changes: 2 additions & 1 deletion voice_assistant/api_key_manager.py
@@ -15,7 +15,8 @@
"tts": {
"openai": Config.OPENAI_API_KEY,
"deepgram":Config.DEEPGRAM_API_KEY,
"elevenlabs": Config.ELEVENLABS_API_KEY
"elevenlabs": Config.ELEVENLABS_API_KEY,
"gemini": Config.GOOGLE_API_KEY
}
}

4 changes: 3 additions & 1 deletion voice_assistant/config.py
@@ -44,6 +44,7 @@ class Config:
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# for serving the MeloTTS model
TTS_PORT_LOCAL = 5150
@@ -64,7 +65,7 @@ def validate_config():
Config._validate_model('RESPONSE_MODEL', [
'openai', 'groq', 'ollama', 'local'])
Config._validate_model('TTS_MODEL', [
'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'gemini'])

Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
@@ -77,6 +78,7 @@ def validate_config():
Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
Config._validate_api_key('TTS_MODEL', 'elevenlabs', 'ELEVENLABS_API_KEY')
Config._validate_api_key('TTS_MODEL', 'cartesia', 'CARTESIA_API_KEY')
Config._validate_api_key('TTS_MODEL', 'gemini', 'GOOGLE_API_KEY')

@staticmethod
def _validate_model(attribute, valid_options):
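
In practice, the added validation line means a missing GOOGLE_API_KEY should be caught up front when the Gemini engine is selected. A minimal sketch, assuming the key is normally supplied via the environment (or a .env file) before Config is imported:

import os

# Must be set before Config is imported, since the class reads os.getenv at import time.
os.environ.setdefault("GOOGLE_API_KEY", "your-google-api-key")

from voice_assistant.config import Config

# With TTS_MODEL == 'gemini', validation now also covers GOOGLE_API_KEY;
# exactly how _validate_api_key reports a missing key is not shown in this diff.
Config.validate_config()
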
23 changes: 23 additions & 0 deletions voice_assistant/text_to_speech.py
@@ -5,6 +5,7 @@
import elevenlabs
import soundfile as sf
import requests
import google.generativeai as genai

from openai import OpenAI
from deepgram import DeepgramClient, SpeakOptions
@@ -58,6 +59,28 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
model="eleven_turbo_v2"
)
elevenlabs.save(audio, output_file_path)

elif model == 'gemini':
genai.configure(api_key=api_key)
model_instance = genai.GenerativeModel("gemini-2.0-flash-exp")

response = model_instance.generate_content(
text,
generation_config=genai.GenerationConfig(
response_modalities=["AUDIO"],
speech_config=genai.SpeechConfig(
voice_config=genai.VoiceConfig(
prebuilt_voice_config=genai.PrebuiltVoiceConfig(
voice_name="Aoede"
)
)
)
)
)

# Save the audio content to file
with open(output_file_path, "wb") as f:
f.write(response.candidates[0].content.parts[0].inline_data.data)

elif model == "cartesia":
client = Cartesia(api_key=api_key)
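
For comparison, here is a minimal sketch of the Gemini text-to-speech call written against the newer google-genai SDK rather than google-generativeai. The model name, channel count, and sample rate below are assumptions that are not part of this PR, and the inline audio bytes are treated as raw 16-bit PCM (also an assumption about the response format) and wrapped in a WAV container before saving.

import wave

from google import genai
from google.genai import types

def gemini_tts_sketch(api_key: str, text: str, output_file_path: str) -> None:
    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",  # assumed TTS-capable model name
        contents=text,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Aoede")
                )
            ),
        ),
    )
    pcm = response.candidates[0].content.parts[0].inline_data.data

    # Wrap the raw PCM so ordinary audio players can open the file.
    with wave.open(output_file_path, "wb") as wav:
        wav.setnchannels(1)       # mono (assumed)
        wav.setsampwidth(2)       # 16-bit samples
        wav.setframerate(24000)   # 24 kHz (assumed)
        wav.writeframes(pcm)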