1 change: 1 addition & 0 deletions README.md
@@ -232,6 +232,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
- **OpenAI**: Uses OpenAI's TTS model with the 'fable' voice.
- **Deepgram**: Uses Deepgram's TTS model with the 'aura-angus-en' voice.
- **ElevenLabs**: Uses ElevenLabs' TTS model with the 'Paul J.' voice.
- **Google Gemini**: Uses Google's Gemini 2.0 Flash model with the 'Aoede' voice for text-to-speech generation.
- **Local**: Placeholder for a local TTS model.

## Detailed Module Descriptions 📘
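
With the new engine listed above, invoking it through the project's own helper looks roughly like the following sketch; the sample text and output path are illustrative, and the parameter names are taken from the text_to_speech() signature changed later in this diff.

import os

from voice_assistant.text_to_speech import text_to_speech

# Resolve the key the same way config.py does; 'output.mp3' matches the
# format run_voice_assistant.py now picks for the 'gemini' engine.
text_to_speech(
    model="gemini",
    api_key=os.getenv("GOOGLE_API_KEY"),
    text="Hello from the voice assistant!",
    output_file_path="output.mp3",
)
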
3 changes: 2 additions & 1 deletion requirements.txt
@@ -34,4 +34,5 @@ sounddevice
cartesia
soundfile
ollama
pydub
pydub
google-generativeai
2 changes: 1 addition & 1 deletion run_voice_assistant.py
@@ -65,7 +65,7 @@ def main():
chat_history.append({"role": "assistant", "content": response_text})

# Determine the output file format based on the TTS model
if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia':
if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia' or Config.TTS_MODEL == 'gemini':
output_file = 'output.mp3'
else:
output_file = 'output.wav'
3 changes: 2 additions & 1 deletion voice_assistant/api_key_manager.py
@@ -15,7 +15,8 @@
"tts": {
"openai": Config.OPENAI_API_KEY,
"deepgram":Config.DEEPGRAM_API_KEY,
"elevenlabs": Config.ELEVENLABS_API_KEY
"elevenlabs": Config.ELEVENLABS_API_KEY,
"gemini": Config.GOOGLE_API_KEY
}
}

4 changes: 3 additions & 1 deletion voice_assistant/config.py
@@ -44,6 +44,7 @@ class Config:
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# for serving the MeloTTS model
TTS_PORT_LOCAL = 5150
@@ -64,7 +65,7 @@ def validate_config():
Config._validate_model('RESPONSE_MODEL', [
'openai', 'groq', 'ollama', 'local'])
Config._validate_model('TTS_MODEL', [
'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'gemini'])

Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
@@ -77,6 +78,7 @@ def validate_config():
Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
Config._validate_api_key('TTS_MODEL', 'elevenlabs', 'ELEVENLABS_API_KEY')
Config._validate_api_key('TTS_MODEL', 'cartesia', 'CARTESIA_API_KEY')
Config._validate_api_key('TTS_MODEL', 'gemini', 'GOOGLE_API_KEY')

@staticmethod
def _validate_model(attribute, valid_options):
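
In practice, the added validation line means a missing GOOGLE_API_KEY should be caught up front when the Gemini engine is selected. A minimal sketch, assuming the key is normally supplied via the environment (or a .env file) before Config is imported:

import os

# Must be set before Config is imported, since the class reads os.getenv at import time.
os.environ.setdefault("GOOGLE_API_KEY", "your-google-api-key")

from voice_assistant.config import Config

# With TTS_MODEL == 'gemini', validation now also covers GOOGLE_API_KEY;
# exactly how _validate_api_key reports a missing key is not shown in this diff.
Config.validate_config()
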
23 changes: 23 additions & 0 deletions voice_assistant/text_to_speech.py
@@ -5,6 +5,7 @@
import elevenlabs
import soundfile as sf
import requests
import google.generativeai as genai

from openai import OpenAI
from deepgram import DeepgramClient, SpeakOptions
@@ -58,6 +59,28 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
model="eleven_turbo_v2"
)
elevenlabs.save(audio, output_file_path)

elif model == 'gemini':
genai.configure(api_key=api_key)
model_instance = genai.GenerativeModel("gemini-2.0-flash-exp")

response = model_instance.generate_content(
text,
generation_config=genai.GenerationConfig(
response_modalities=["AUDIO"],
speech_config=genai.SpeechConfig(
voice_config=genai.VoiceConfig(
prebuilt_voice_config=genai.PrebuiltVoiceConfig(
voice_name="Aoede"
)
)
)
)
)

# Save the audio content to file
with open(output_file_path, "wb") as f:
f.write(response.candidates[0].content.parts[0].inline_data.data)

elif model == "cartesia":
client = Cartesia(api_key=api_key)
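
For comparison, here is a minimal sketch of the Gemini text-to-speech call written against the newer google-genai SDK rather than google-generativeai. The model name, channel count, and sample rate below are assumptions that are not part of this PR, and the inline audio bytes are treated as raw 16-bit PCM (also an assumption about the response format) and wrapped in a WAV container before saving.

import wave

from google import genai
from google.genai import types

def gemini_tts_sketch(api_key: str, text: str, output_file_path: str) -> None:
    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",  # assumed TTS-capable model name
        contents=text,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Aoede")
                )
            ),
        ),
    )
    pcm = response.candidates[0].content.parts[0].inline_data.data

    # Wrap the raw PCM so ordinary audio players can open the file.
    with wave.open(output_file_path, "wb") as wav:
        wav.setnchannels(1)       # mono (assumed)
        wav.setsampwidth(2)       # 16-bit samples
        wav.setframerate(24000)   # 24 kHz (assumed)
        wav.writeframes(pcm)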