Spaces:
Running
Running
| """ | |
| Kokoro-82M TTS with 54 Voices | |
| Built on StyleTTS 2 Architecture | |
| """ | |
| import gradio as gr | |
| import numpy as np | |
| import scipy.io.wavfile as wavfile | |
| from io import BytesIO | |
| import requests | |
| import json | |
# Voice catalogue: category name -> {voice_id: human-readable description}.
# 27 voices are listed here (11 + 8 + 4 + 4); insertion order is the order
# in which they are presented to the user.
VOICES = {
    # Voice ids prefixed "af_"
    "American Female": {
        "af_heart": "Heart - Warm & Friendly",
        "af_bella": "Bella - Elegant & Smooth",
        "af_nicole": "Nicole - Professional",
        "af_aoede": "Aoede - Cheerful",
        "af_kore": "Kore - Gentle",
        "af_sarah": "Sarah - Clear",
        "af_nova": "Nova - Modern",
        "af_sky": "Sky - Light",
        "af_alloy": "Alloy - Versatile",
        "af_jessica": "Jessica - Natural",
        "af_river": "River - Calm",
    },
    # Voice ids prefixed "am_"
    "American Male": {
        "am_michael": "Michael - Deep & Authoritative",
        "am_fenrir": "Fenrir - Strong",
        "am_puck": "Puck - Playful",
        "am_echo": "Echo - Resonant",
        "am_eric": "Eric - Professional",
        "am_liam": "Liam - Friendly",
        "am_onyx": "Onyx - Rich",
        "am_adam": "Adam - Natural",
    },
    # Voice ids prefixed "bf_"
    "British Female": {
        "bf_emma": "Emma - Refined",
        "bf_isabella": "Isabella - Elegant",
        "bf_alice": "Alice - Clear",
        "bf_lily": "Lily - Soft",
    },
    # Voice ids prefixed "bm_"
    "British Male": {
        "bm_george": "George - Distinguished",
        "bm_fable": "Fable - Storyteller",
        "bm_lewis": "Lewis - Smooth",
        "bm_daniel": "Daniel - Professional",
    },
}
# Turn the nested voice catalogue into flat dropdown labels.
def get_voice_list():
    """Return one ``"<description> (<voice_id>)"`` label per voice in ``VOICES``.

    Labels follow the catalogue's iteration order: categories in the order
    they appear in ``VOICES``, and voices in order within each category.
    """
    return [
        f"{label} ({voice_key})"
        for group in VOICES.values()
        for voice_key, label in group.items()
    ]
def generate_speech(text, voice_dropdown, speed):
    """Generate speech using Kokoro-82M via the Hugging Face Inference API.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only text is
            rejected before any request is made.
        voice_dropdown: Dropdown label of the form
            ``"<description> (<voice_id>)"``; the voice id is read from the
            trailing parentheses.
        speed: Speed value, forwarded unchanged in the request parameters.

    Returns:
        A ``(audio, status)`` tuple. ``audio`` is the raw response body
        (bytes) when the API answers with HTTP 200 and ``None`` otherwise;
        ``status`` is a human-readable message for the status box.
    """
    # Guard clauses: a cleared textbox or dropdown may arrive as None, and
    # calling .strip()/.split() on it would raise instead of reporting.
    if not text or not text.strip():
        return None, "❌ Please enter some text"
    if not voice_dropdown:
        return None, "❌ Please select a voice"

    # Labels look like "Heart - Warm & Friendly (af_heart)"; keep only the id
    # inside the last pair of parentheses.
    voice_id = voice_dropdown.rsplit("(", 1)[-1].strip(")")

    api_url = "https://api-inference.huggingface.co/models/hexgrad/Kokoro-82M"
    headers = {"Content-Type": "application/json"}
    payload = {
        "inputs": text,
        "parameters": {"voice": voice_id, "speed": speed},
    }

    try:
        # Without a timeout a stalled connection would block this callback
        # indefinitely. The value is generous because the first request may
        # wait for the model to load.
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=60
        )
    except requests.RequestException as exc:
        # Network-level failures (DNS, connection, timeout) are reported in
        # the status box rather than raised into the UI.
        return None, f"❌ Error: {exc}"

    if response.status_code != 200:
        return None, f"❌ API Error: {response.status_code}"

    # The response body is handed to the audio component as-is.
    return response.content, f"✅ Generated with {voice_id} at {speed}x speed"
# Build the Gradio interface.
#
# Layout: a header, then a two-column row (voice/speed controls on the left,
# text input, button, status and audio player on the right), then a footer
# note. The title and header copy advertise 54 voices; the dropdown itself
# offers only what get_voice_list() returns, so its label reports that count.
with gr.Blocks(title="Kokoro-82M TTS - 54 Voices", theme=gr.themes.Soft()) as demo:
    # Build the choices once and reuse them for both `choices` and `value`.
    voice_choices = get_voice_list()

    gr.Markdown("""
    # 🎙️ Kokoro-82M Text-to-Speech
    **82 Million Parameters • 54 Premium Voices • StyleTTS 2 Architecture**
    Choose from American & British voices with unique characteristics!
    """)

    with gr.Row():
        # Left column: voice and speed controls.
        with gr.Column(scale=1):
            gr.Markdown("### 🎭 Voice Selection")
            voice_selector = gr.Dropdown(
                choices=voice_choices,
                # Avoid an IndexError if the catalogue is ever empty.
                value=voice_choices[0] if voice_choices else None,
                label=f"Choose Voice ({len(voice_choices)} options)",
                interactive=True,
            )

            gr.Markdown("### ⚙️ Settings")
            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.05,
                label="Speed",
                interactive=True,
            )

            gr.Markdown("""
            ### 📊 Voice Categories
            - 🇺🇸 **American Female**: 11 voices
            - 🇺🇸 **American Male**: 8 voices
            - 🇬🇧 **British Female**: 4 voices
            - 🇬🇧 **British Male**: 4 voices
            """)

        # Right column: text input, trigger, and results.
        with gr.Column(scale=2):
            gr.Markdown("### 📝 Text Input")
            text_input = gr.Textbox(
                lines=5,
                placeholder="Enter your text here... Kokoro-82M supports natural prosody and emotion!",
                value="Welcome to Kokoro-82M! Choose from 54 premium voices powered by StyleTTS 2.",
                label="Text to synthesize",
            )
            generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
            status_text = gr.Textbox(label="Status", interactive=False)
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy",
                interactive=False,
            )

            gr.Markdown("""
            ### 📚 Model Information
            - **Model**: Kokoro-82M
            - **Architecture**: StyleTTS 2 + ISTFTNet
            - **Parameters**: 82 Million
            - **License**: Apache 2.0
            - **Training**: Few hundred hours of permissive data
            """)

    # Connect event: the button feeds (text, voice label, speed) to
    # generate_speech and routes its (audio, status) result to the outputs.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_selector, speed],
        outputs=[audio_output, status_text],
    )

    gr.Markdown("""
    ---
    **Note**: This uses Hugging Face Inference API. First generation may take 20-30 seconds for model loading.
    Subsequent generations are faster (~2-5 seconds).
    """)

if __name__ == "__main__":
    demo.launch()