import os
from datetime import datetime
from io import BytesIO

from elevenlabs import ElevenLabs, VoiceSettings
from pydub import AudioSegment

from utils.config import (
    ELEVENLABS_API_KEY,
    ELEVENLABS_GUEST_VOICE,
    ELEVENLABS_HOST_VOICE,
    OUTPUT_DIR,
)

# Import Supertonic TTS wrapper
from synthesis.supertonic_tts import (
    SupertonicWrapper,
    SUPERTONIC_VOICES,
    DEFAULT_HOST_VOICE as SUPERTONIC_DEFAULT_HOST,
    DEFAULT_GUEST_VOICE as SUPERTONIC_DEFAULT_GUEST,
    get_supertonic_engine,
)

# ElevenLabs Voice Options — display name -> ElevenLabs voice ID.
ELEVENLABS_VOICES = {
    # Male Voices
    "Antoni (Male - Well-rounded)": "ErXwobaYiN019PkySvjV",
    "Josh (Male - Deep)": "TxGEqnHWrfWFTfGW9XjX",
    "Arnold (Male - Crisp)": "VR6AewLTigWG4xSOukaG",
    "Callum (Male - Hoarse)": "N2lVS1w4EtoT3dr4eOWO",
    "Charlie (Male - Casual)": "IKne3meq5aSn9XLyUdCD",
    "Clyde (Male - War veteran)": "2EiwWnXFnvU5JabPnv8n",
    "Daniel (Male - Deep British)": "onwK4e9ZLuTAKqWW03F9",
    "Ethan (Male - Young American)": "g5CIjZEefAph4nQFvHAz",
    "Fin (Male - Irish)": "D38z5RcWu1voky8WS1ja",
    "George (Male - British)": "JBFqnCBsd6RMkjVDRZzb",
    # Female Voices
    "Bella (Female - Soft)": "EXAVITQu4vr4xnSDxMaL",
    "Rachel (Female - Calm)": "21m00Tcm4TlvDq8ikWAM",
    "Domi (Female - Strong)": "AZnzlk1XvdvUeBnXmlld",
    "Elli (Female - Emotional)": "MF3mGyEYCl7XYWbV9V6O",
    "Emily (Female - Calm British)": "LcfcDJNUP1GQjkzn1xUU",
    "Freya (Female - Young American)": "jsCqWAovK2LkecY7zXl4",
    "Gigi (Female - Young Expressive)": "jBpfuIE2acCO8z3wKNLl",
    "Grace (Female - Southern American)": "oWAxZDx7w5VEj9dCyTzz",
    "Lily (Female - Warm British)": "pFZP5JQG7iQjIQuC4Bku",
    "Matilda (Female - Warm)": "XrExE9yKIg1WjnnlVkGX",
}


def generate_unique_filename() -> str:
    """Generate unique filename using timestamp"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"podcast_{timestamp}.wav"


class TTSEngine:
    """Dual-backend text-to-speech engine.

    Synthesizes a two-speaker podcast script to a single WAV file using
    either the ElevenLabs API ("elevenlabs" mode) or the local CPU-based
    Supertonic engine ("supertonic" mode).
    """

    def __init__(self, tts_provider="elevenlabs", custom_api_key=None,
                 host_voice=None, guest_voice=None):
        """
        Initialize TTS Engine with either ElevenLabs or Supertonic.

        Args:
            tts_provider: "elevenlabs" or "supertonic"
            custom_api_key: API key for ElevenLabs (required if using elevenlabs)
            host_voice: Voice ID for Host (optional, uses default if not provided)
            guest_voice: Voice ID for Guest (optional, uses default if not provided)

        Raises:
            ValueError: if the provider is unknown, or ElevenLabs is selected
                with no API key available.
        """
        self.mode = tts_provider.lower()

        if self.mode == "elevenlabs":
            self._init_elevenlabs(custom_api_key, host_voice, guest_voice)
        elif self.mode == "supertonic":
            self._init_supertonic(host_voice, guest_voice)
        else:
            raise ValueError(f"Unknown TTS provider: {tts_provider}. Use 'elevenlabs' or 'supertonic'.")

    def _init_elevenlabs(self, custom_api_key, host_voice, guest_voice):
        """Set up the ElevenLabs client and resolve host/guest voice IDs."""
        print("Initializing ElevenLabs TTS API...")

        # Use custom key if provided, otherwise use default
        api_key = custom_api_key if custom_api_key else ELEVENLABS_API_KEY
        if not api_key:
            raise ValueError("ElevenLabs API key is required")
        self.client = ElevenLabs(api_key=api_key)

        # Use custom voices or defaults
        self.host_voice_id = host_voice if host_voice else ELEVENLABS_HOST_VOICE
        self.guest_voice_id = guest_voice if guest_voice else ELEVENLABS_GUEST_VOICE

        if custom_api_key:
            print("✓ ElevenLabs TTS ready (custom API key)")
        else:
            print("✓ ElevenLabs TTS ready")

        # Print selected voices (reverse-lookup display name from voice ID)
        host_name = next(
            (k for k, v in ELEVENLABS_VOICES.items() if v == self.host_voice_id), None
        )
        guest_name = next(
            (k for k, v in ELEVENLABS_VOICES.items() if v == self.guest_voice_id), None
        )
        print(f" Host: {host_name if host_name else 'Custom/Default'}")
        print(f" Guest: {guest_name if guest_name else 'Custom/Default'}")

    def _init_supertonic(self, host_voice, guest_voice):
        """Set up the local Supertonic engine and resolve host/guest voices."""
        print("Initializing Supertonic TTS (CPU mode)...")
        self.supertonic_engine = get_supertonic_engine()
        self.supertonic_engine.initialize()

        # Use custom voices or defaults
        # For Supertonic, voice is the display name, we'll convert to ID later
        self.host_voice_id = host_voice if host_voice else SUPERTONIC_DEFAULT_HOST
        self.guest_voice_id = guest_voice if guest_voice else SUPERTONIC_DEFAULT_GUEST

        print("✓ Supertonic TTS ready (CPU mode, no API key required)")
        print(f" Host: {self.host_voice_id}")
        print(f" Guest: {self.guest_voice_id}")

    def _build_speaker_mapping(self, script: list) -> dict:
        """
        Build a mapping from speaker names to voice IDs.

        First unique speaker gets host_voice, second gets guest_voice.
        This allows PPF personas to work with any character names.

        Args:
            script: List of dialogue items with 'speaker' keys

        Returns:
            dict: Mapping from speaker name to voice ID
        """
        unique_speakers = []
        for item in script:
            if isinstance(item, dict) and "speaker" in item:
                speaker = item["speaker"]
                if speaker not in unique_speakers:
                    unique_speakers.append(speaker)

        # Map first speaker to host_voice, second to guest_voice
        mapping = {}
        if len(unique_speakers) >= 1:
            mapping[unique_speakers[0]] = self.host_voice_id
            print(f" 🎙️ Speaker mapping: {unique_speakers[0]} → Host Voice")
        if len(unique_speakers) >= 2:
            mapping[unique_speakers[1]] = self.guest_voice_id
            print(f" 🎙️ Speaker mapping: {unique_speakers[1]} → Guest Voice")
        return mapping

    def synthesize_dialogue(self, script: list) -> str:
        """
        Synthesize the script to audio using the selected TTS provider.

        Args:
            script: List of dialogue items

        Returns:
            str: Path to the generated audio file ("" if nothing was synthesized)
        """
        if self.mode == "elevenlabs":
            return self._synthesize_elevenlabs(script)
        elif self.mode == "supertonic":
            return self._synthesize_supertonic(script)
        else:
            raise ValueError(f"Unknown TTS mode: {self.mode}")

    def _synthesize_script(self, script: list, synth_line) -> str:
        """
        Backend-agnostic synthesis loop shared by both providers.

        Validates each dialogue item, synthesizes it with the supplied
        per-line callable, inserts a 500ms pause after each line, then
        concatenates everything and exports a WAV file.

        Args:
            script: List of dialogue items (dicts with 'speaker'/'text',
                optional 'emotion')
            synth_line: callable(text, voice_id) -> AudioSegment doing the
                actual provider-specific synthesis for one line

        Returns:
            str: Path to the generated WAV file, or "" if no line succeeded
        """
        audio_segments = []

        # Build dynamic speaker-to-voice mapping
        # First unique speaker gets host_voice, second gets guest_voice
        speaker_to_voice = self._build_speaker_mapping(script)

        for i, item in enumerate(script):
            # Defensive checks for required keys
            if not isinstance(item, dict):
                print(f"⚠️ Skipping item {i + 1}: not a dictionary")
                continue
            if "text" not in item:
                print(f"⚠️ Skipping item {i + 1}: missing 'text' key")
                continue
            if "speaker" not in item:
                print(f"⚠️ Skipping item {i + 1}: missing 'speaker' key")
                continue

            text = item["text"]
            speaker = item["speaker"]
            # Emotion is logged for debugging only; it is conveyed through the
            # text content itself (punctuation, word choice) by the script
            # generator, so we never splice it into the spoken text.
            emotion = item.get("emotion", "neutral")

            # Select voice based on speaker using dynamic mapping;
            # unknown speakers fall back to the host voice.
            voice_id = speaker_to_voice.get(speaker, self.host_voice_id)

            try:
                print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
                audio_segments.append(synth_line(text, voice_id))
                # Add 500ms silence between speakers
                audio_segments.append(AudioSegment.silent(duration=500))
                print(f"✓ Synthesized line {i + 1}/{len(script)}")
            except Exception as e:
                print(f"Error synthesizing line '{text[:50]}...': {e}")
                # Continue with next line even if one fails

        if not audio_segments:
            print("No audio generated")
            return ""

        # Combine all segments (pydub AudioSegment supports sum())
        print("Combining audio segments...")
        combined = sum(audio_segments)

        # Export as WAV with unique filename
        filename = generate_unique_filename()
        output_path = os.path.join(OUTPUT_DIR, filename)
        combined.export(output_path, format="wav")

        print(f"✓ Podcast saved to: {output_path}")
        return output_path

    def _synthesize_elevenlabs(self, script: list) -> str:
        """Synthesize using ElevenLabs API"""
        print("Synthesizing audio via ElevenLabs API...")

        def synth_line(text, voice_id):
            # Generate audio using ElevenLabs with Turbo v2.5 for better quality and speed
            # Turbo v2.5: High quality, low latency (~250-300ms), 50% cheaper than v2
            audio_generator = self.client.text_to_speech.convert(
                voice_id=voice_id,
                text=text,
                model_id="eleven_turbo_v2_5",  # Upgraded from multilingual_v2 for better quality
                voice_settings=VoiceSettings(
                    stability=0.4,          # Lower = more expressiveness and variation (default: 0.5)
                    similarity_boost=0.8,   # Higher = better voice consistency (default: 0.75)
                    style=0.6,              # Higher = more dynamic, expressive delivery (default: 0.5)
                    use_speaker_boost=True, # Enhances similarity to original voice
                ),
            )
            # The API streams MP3 chunks; collect then decode.
            audio_bytes = b"".join(audio_generator)
            return AudioSegment.from_mp3(BytesIO(audio_bytes))

        return self._synthesize_script(script, synth_line)

    def _synthesize_supertonic(self, script: list) -> str:
        """Synthesize using Supertonic TTS (CPU-based)"""
        print("Synthesizing audio via Supertonic TTS (CPU mode)...")

        def synth_line(text, voice_id):
            # Generate audio using Supertonic
            # Parameters optimized for quality vs speed
            return self.supertonic_engine.synthesize_to_audio_segment(
                text=text,
                voice_id=voice_id,
                speed=1.0,
                steps=5,  # Balanced quality/speed (1-50, lower=faster)
            )

        return self._synthesize_script(script, synth_line)


# Global instance (lazily created, reused only for the default ElevenLabs config)
_tts_instance = None


def get_tts_engine(tts_provider="elevenlabs", custom_api_key=None,
                   host_voice=None, guest_voice=None):
    """
    Get TTS engine instance with ElevenLabs or Supertonic.

    Args:
        tts_provider: "elevenlabs" or "supertonic"
        custom_api_key: ElevenLabs API key (required if using elevenlabs,
            not needed for supertonic)
        host_voice: Voice ID for Host (optional)
        guest_voice: Voice ID for Guest (optional)

    Returns:
        TTSEngine instance
    """
    global _tts_instance

    # Always create new instance if custom settings provided or if using Supertonic
    if custom_api_key or tts_provider != "elevenlabs" or host_voice or guest_voice:
        return TTSEngine(
            tts_provider=tts_provider,
            custom_api_key=custom_api_key,
            host_voice=host_voice,
            guest_voice=guest_voice,
        )

    # Otherwise, reuse global instance (for default ElevenLabs)
    if _tts_instance is None:
        _tts_instance = TTSEngine(tts_provider="elevenlabs")
    return _tts_instance