import os
from datetime import datetime
from io import BytesIO

from elevenlabs import ElevenLabs, VoiceSettings
from pydub import AudioSegment

from utils.config import (
    ELEVENLABS_API_KEY,
    ELEVENLABS_GUEST_VOICE,
    ELEVENLABS_HOST_VOICE,
    OUTPUT_DIR,
)

# Import Supertonic TTS wrapper
from synthesis.supertonic_tts import (
    SupertonicWrapper,
    SUPERTONIC_VOICES,
    DEFAULT_HOST_VOICE as SUPERTONIC_DEFAULT_HOST,
    DEFAULT_GUEST_VOICE as SUPERTONIC_DEFAULT_GUEST,
    get_supertonic_engine,
)

# ElevenLabs Voice Options — display name -> ElevenLabs voice ID.
ELEVENLABS_VOICES = {
    # Male Voices
    "Antoni (Male - Well-rounded)": "ErXwobaYiN019PkySvjV",
    "Josh (Male - Deep)": "TxGEqnHWrfWFTfGW9XjX",
    "Arnold (Male - Crisp)": "VR6AewLTigWG4xSOukaG",
    "Callum (Male - Hoarse)": "N2lVS1w4EtoT3dr4eOWO",
    "Charlie (Male - Casual)": "IKne3meq5aSn9XLyUdCD",
    "Clyde (Male - War veteran)": "2EiwWnXFnvU5JabPnv8n",
    "Daniel (Male - Deep British)": "onwK4e9ZLuTAKqWW03F9",
    "Ethan (Male - Young American)": "g5CIjZEefAph4nQFvHAz",
    "Fin (Male - Irish)": "D38z5RcWu1voky8WS1ja",
    "George (Male - British)": "JBFqnCBsd6RMkjVDRZzb",
    # Female Voices
    "Bella (Female - Soft)": "EXAVITQu4vr4xnSDxMaL",
    "Rachel (Female - Calm)": "21m00Tcm4TlvDq8ikWAM",
    "Domi (Female - Strong)": "AZnzlk1XvdvUeBnXmlld",
    "Elli (Female - Emotional)": "MF3mGyEYCl7XYWbV9V6O",
    "Emily (Female - Calm British)": "LcfcDJNUP1GQjkzn1xUU",
    "Freya (Female - Young American)": "jsCqWAovK2LkecY7zXl4",
    "Gigi (Female - Young Expressive)": "jBpfuIE2acCO8z3wKNLl",
    "Grace (Female - Southern American)": "oWAxZDx7w5VEj9dCyTzz",
    "Lily (Female - Warm British)": "pFZP5JQG7iQjIQuC4Bku",
    "Matilda (Female - Warm)": "XrExE9yKIg1WjnnlVkGX",
}


def generate_unique_filename() -> str:
    """Generate unique filename using timestamp"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"podcast_{timestamp}.wav"


class TTSEngine:
    """Dual-backend text-to-speech engine.

    Synthesizes a two-speaker podcast script to a single WAV file using
    either the ElevenLabs API ("elevenlabs" mode) or the local CPU-based
    Supertonic engine ("supertonic" mode).
    """

    def __init__(self, tts_provider="elevenlabs", custom_api_key=None,
                 host_voice=None, guest_voice=None):
        """
        Initialize TTS Engine with either ElevenLabs or Supertonic.

        Args:
            tts_provider: "elevenlabs" or "supertonic"
            custom_api_key: API key for ElevenLabs (required if using elevenlabs)
            host_voice: Voice ID for Host (optional, uses default if not provided)
            guest_voice: Voice ID for Guest (optional, uses default if not provided)

        Raises:
            ValueError: if the provider is unknown, or ElevenLabs is selected
                with no API key available.
        """
        self.mode = tts_provider.lower()

        if self.mode == "elevenlabs":
            self._init_elevenlabs(custom_api_key, host_voice, guest_voice)
        elif self.mode == "supertonic":
            self._init_supertonic(host_voice, guest_voice)
        else:
            raise ValueError(f"Unknown TTS provider: {tts_provider}. Use 'elevenlabs' or 'supertonic'.")

    def _init_elevenlabs(self, custom_api_key, host_voice, guest_voice):
        """Set up the ElevenLabs client and resolve host/guest voice IDs."""
        print("Initializing ElevenLabs TTS API...")

        # Use custom key if provided, otherwise use default
        api_key = custom_api_key if custom_api_key else ELEVENLABS_API_KEY
        if not api_key:
            raise ValueError("ElevenLabs API key is required")
        self.client = ElevenLabs(api_key=api_key)

        # Use custom voices or defaults
        self.host_voice_id = host_voice if host_voice else ELEVENLABS_HOST_VOICE
        self.guest_voice_id = guest_voice if guest_voice else ELEVENLABS_GUEST_VOICE

        if custom_api_key:
            print("✓ ElevenLabs TTS ready (custom API key)")
        else:
            print("✓ ElevenLabs TTS ready")

        # Print selected voices (reverse-lookup display name from voice ID)
        host_name = next(
            (k for k, v in ELEVENLABS_VOICES.items() if v == self.host_voice_id), None
        )
        guest_name = next(
            (k for k, v in ELEVENLABS_VOICES.items() if v == self.guest_voice_id), None
        )
        print(f" Host: {host_name if host_name else 'Custom/Default'}")
        print(f" Guest: {guest_name if guest_name else 'Custom/Default'}")

    def _init_supertonic(self, host_voice, guest_voice):
        """Set up the local Supertonic engine and resolve host/guest voices."""
        print("Initializing Supertonic TTS (CPU mode)...")
        self.supertonic_engine = get_supertonic_engine()
        self.supertonic_engine.initialize()

        # Use custom voices or defaults
        # For Supertonic, voice is the display name, we'll convert to ID later
        self.host_voice_id = host_voice if host_voice else SUPERTONIC_DEFAULT_HOST
        self.guest_voice_id = guest_voice if guest_voice else SUPERTONIC_DEFAULT_GUEST

        print("✓ Supertonic TTS ready (CPU mode, no API key required)")
        print(f" Host: {self.host_voice_id}")
        print(f" Guest: {self.guest_voice_id}")

    def _build_speaker_mapping(self, script: list) -> dict:
        """
        Build a mapping from speaker names to voice IDs.

        First unique speaker gets host_voice, second gets guest_voice.
        This allows PPF personas to work with any character names.

        Args:
            script: List of dialogue items with 'speaker' keys

        Returns:
            dict: Mapping from speaker name to voice ID
        """
        unique_speakers = []
        for item in script:
            if isinstance(item, dict) and "speaker" in item:
                speaker = item["speaker"]
                if speaker not in unique_speakers:
                    unique_speakers.append(speaker)

        # Map first speaker to host_voice, second to guest_voice
        mapping = {}
        if len(unique_speakers) >= 1:
            mapping[unique_speakers[0]] = self.host_voice_id
            print(f" 🎙️ Speaker mapping: {unique_speakers[0]} → Host Voice")
        if len(unique_speakers) >= 2:
            mapping[unique_speakers[1]] = self.guest_voice_id
            print(f" 🎙️ Speaker mapping: {unique_speakers[1]} → Guest Voice")
        return mapping

    def synthesize_dialogue(self, script: list) -> str:
        """
        Synthesize the script to audio using the selected TTS provider.

        Args:
            script: List of dialogue items

        Returns:
            str: Path to the generated audio file ("" if nothing was synthesized)
        """
        if self.mode == "elevenlabs":
            return self._synthesize_elevenlabs(script)
        elif self.mode == "supertonic":
            return self._synthesize_supertonic(script)
        else:
            raise ValueError(f"Unknown TTS mode: {self.mode}")

    def _synthesize_script(self, script: list, synth_line) -> str:
        """
        Backend-agnostic synthesis loop shared by both providers.

        Validates each dialogue item, synthesizes it with the supplied
        per-line callable, inserts a 500ms pause after each line, then
        concatenates everything and exports a WAV file.

        Args:
            script: List of dialogue items (dicts with 'speaker'/'text',
                optional 'emotion')
            synth_line: callable(text, voice_id) -> AudioSegment doing the
                actual provider-specific synthesis for one line

        Returns:
            str: Path to the generated WAV file, or "" if no line succeeded
        """
        audio_segments = []

        # Build dynamic speaker-to-voice mapping
        # First unique speaker gets host_voice, second gets guest_voice
        speaker_to_voice = self._build_speaker_mapping(script)

        for i, item in enumerate(script):
            # Defensive checks for required keys
            if not isinstance(item, dict):
                print(f"⚠️ Skipping item {i + 1}: not a dictionary")
                continue
            if "text" not in item:
                print(f"⚠️ Skipping item {i + 1}: missing 'text' key")
                continue
            if "speaker" not in item:
                print(f"⚠️ Skipping item {i + 1}: missing 'speaker' key")
                continue

            text = item["text"]
            speaker = item["speaker"]
            # Emotion is logged for debugging only; it is conveyed through the
            # text content itself (punctuation, word choice) by the script
            # generator, so we never splice it into the spoken text.
            emotion = item.get("emotion", "neutral")

            # Select voice based on speaker using dynamic mapping;
            # unknown speakers fall back to the host voice.
            voice_id = speaker_to_voice.get(speaker, self.host_voice_id)

            try:
                print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
                audio_segments.append(synth_line(text, voice_id))
                # Add 500ms silence between speakers
                audio_segments.append(AudioSegment.silent(duration=500))
                print(f"✓ Synthesized line {i + 1}/{len(script)}")
            except Exception as e:
                print(f"Error synthesizing line '{text[:50]}...': {e}")
                # Continue with next line even if one fails

        if not audio_segments:
            print("No audio generated")
            return ""

        # Combine all segments (pydub AudioSegment supports sum())
        print("Combining audio segments...")
        combined = sum(audio_segments)

        # Export as WAV with unique filename
        filename = generate_unique_filename()
        output_path = os.path.join(OUTPUT_DIR, filename)
        combined.export(output_path, format="wav")

        print(f"✓ Podcast saved to: {output_path}")
        return output_path

    def _synthesize_elevenlabs(self, script: list) -> str:
        """Synthesize using ElevenLabs API"""
        print("Synthesizing audio via ElevenLabs API...")

        def synth_line(text, voice_id):
            # Generate audio using ElevenLabs with Turbo v2.5 for better quality and speed
            # Turbo v2.5: High quality, low latency (~250-300ms), 50% cheaper than v2
            audio_generator = self.client.text_to_speech.convert(
                voice_id=voice_id,
                text=text,
                model_id="eleven_turbo_v2_5",  # Upgraded from multilingual_v2 for better quality
                voice_settings=VoiceSettings(
                    stability=0.4,          # Lower = more expressiveness and variation (default: 0.5)
                    similarity_boost=0.8,   # Higher = better voice consistency (default: 0.75)
                    style=0.6,              # Higher = more dynamic, expressive delivery (default: 0.5)
                    use_speaker_boost=True, # Enhances similarity to original voice
                ),
            )
            # The API streams MP3 chunks; collect then decode.
            audio_bytes = b"".join(audio_generator)
            return AudioSegment.from_mp3(BytesIO(audio_bytes))

        return self._synthesize_script(script, synth_line)

    def _synthesize_supertonic(self, script: list) -> str:
        """Synthesize using Supertonic TTS (CPU-based)"""
        print("Synthesizing audio via Supertonic TTS (CPU mode)...")

        def synth_line(text, voice_id):
            # Generate audio using Supertonic
            # Parameters optimized for quality vs speed
            return self.supertonic_engine.synthesize_to_audio_segment(
                text=text,
                voice_id=voice_id,
                speed=1.0,
                steps=5,  # Balanced quality/speed (1-50, lower=faster)
            )

        return self._synthesize_script(script, synth_line)


# Global instance (lazily created, reused only for the default ElevenLabs config)
_tts_instance = None


def get_tts_engine(tts_provider="elevenlabs", custom_api_key=None,
                   host_voice=None, guest_voice=None):
    """
    Get TTS engine instance with ElevenLabs or Supertonic.

    Args:
        tts_provider: "elevenlabs" or "supertonic"
        custom_api_key: ElevenLabs API key (required if using elevenlabs,
            not needed for supertonic)
        host_voice: Voice ID for Host (optional)
        guest_voice: Voice ID for Guest (optional)

    Returns:
        TTSEngine instance
    """
    global _tts_instance

    # Always create new instance if custom settings provided or if using Supertonic
    if custom_api_key or tts_provider != "elevenlabs" or host_voice or guest_voice:
        return TTSEngine(
            tts_provider=tts_provider,
            custom_api_key=custom_api_key,
            host_voice=host_voice,
            guest_voice=guest_voice,
        )

    # Otherwise, reuse global instance (for default ElevenLabs)
    if _tts_instance is None:
        _tts_instance = TTSEngine(tts_provider="elevenlabs")
    return _tts_instance