"""
Kokoro-82M TTS demo with 27 English voices (the full release ships 54)
Built on the StyleTTS 2 architecture; synthesis runs through the Hugging Face Inference API
"""
import os
import tempfile
import gradio as gr
import numpy as np
import requests
# Voice database: 27 English voices (11 + 8 + 4 + 4 below; the full Kokoro-82M release ships 54)
VOICES = {
"American Female": {
"af_heart": "Heart - Warm & Friendly",
"af_bella": "Bella - Elegant & Smooth",
"af_nicole": "Nicole - Professional",
"af_aoede": "Aoede - Cheerful",
"af_kore": "Kore - Gentle",
"af_sarah": "Sarah - Clear",
"af_nova": "Nova - Modern",
"af_sky": "Sky - Light",
"af_alloy": "Alloy - Versatile",
"af_jessica": "Jessica - Natural",
"af_river": "River - Calm"
},
"American Male": {
"am_michael": "Michael - Deep & Authoritative",
"am_fenrir": "Fenrir - Strong",
"am_puck": "Puck - Playful",
"am_echo": "Echo - Resonant",
"am_eric": "Eric - Professional",
"am_liam": "Liam - Friendly",
"am_onyx": "Onyx - Rich",
"am_adam": "Adam - Natural"
},
"British Female": {
"bf_emma": "Emma - Refined",
"bf_isabella": "Isabella - Elegant",
"bf_alice": "Alice - Clear",
"bf_lily": "Lily - Soft"
},
"British Male": {
"bm_george": "George - Distinguished",
"bm_fable": "Fable - Storyteller",
"bm_lewis": "Lewis - Smooth",
"bm_daniel": "Daniel - Professional"
}
}
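# Voice ID prefixes mirror the categories above: af/am = American female/male, bf/bm = British female/male.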
# Flatten voice dict for dropdown
def get_voice_list():
voice_list = []
for category, voices in VOICES.items():
for voice_id, desc in voices.items():
voice_list.append(f"{desc} ({voice_id})")
return voice_list
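# Dropdown labels look like "Heart - Warm & Friendly (af_heart)"; generate_speech below
# parses the voice_id back out of the trailing parentheses.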
def generate_speech(text, voice_dropdown, speed):
"""Generate speech using Kokoro-82M via HF API"""
if not text.strip():
return None, "❌ Please enter some text"
# Extract voice_id from dropdown selection
voice_id = voice_dropdown.split("(")[-1].strip(")")
try:
# Use Hugging Face Inference API
API_URL = "https://api-inference.huggingface.co/models/hexgrad/Kokoro-82M"
headers = {
"Content-Type": "application/json"
}
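        # Assumption: the serverless Inference API generally expects an access token.
        # If an HF_TOKEN environment variable is set (e.g. as a Space secret), send it along.
        hf_token = os.environ.get("HF_TOKEN")
        if hf_token:
            headers["Authorization"] = f"Bearer {hf_token}"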
payload = {
"inputs": text,
"parameters": {
"voice": voice_id,
"speed": speed
}
}
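        # Caveat: whether the hosted endpoint honors "voice" and "speed" depends on the
        # model's Inference API schema; treat these parameter names as an assumption and
        # adjust them if requests are rejected.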
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
        if response.status_code == 200:
            # The API returns encoded audio bytes; write them to a temporary file so
            # the gr.Audio output can play them back from a filepath.
            content_type = response.headers.get("content-type", "")
            suffix = ".flac" if "flac" in content_type else ".wav"
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
                f.write(response.content)
                audio_path = f.name
            return audio_path, f"✅ Generated with {voice_id} at {speed}x speed"
        else:
            return None, f"❌ API Error: {response.status_code} - {response.text[:200]}"
except Exception as e:
return None, f"❌ Error: {str(e)}"
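# Optional alternative: run Kokoro locally instead of calling the Inference API.
# This is a minimal sketch and is not wired into the UI below. It assumes the `kokoro`
# pip package and the KPipeline usage shown on the hexgrad/Kokoro-82M model card
# (lang_code "a" = American English, "b" = British English, 24 kHz output); verify
# against the current kokoro README before relying on it.
def generate_speech_local(text, voice_id, speed):
    """Synthesize locally with the kokoro package; returns (sample_rate, waveform)."""
    from kokoro import KPipeline  # lazy import: only needed for local inference
    lang_code = voice_id[0]  # "a" for af_/am_ voices, "b" for bf_/bm_ voices
    pipeline = KPipeline(lang_code=lang_code)
    chunks = [audio for _, _, audio in pipeline(text, voice=voice_id, speed=speed)]
    return 24000, np.concatenate(chunks)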
# Build Gradio interface
with gr.Blocks(title="Kokoro-82M TTS - 54 Voices", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
    # 🎙️ Kokoro-82M Text-to-Speech
    **82 Million Parameters • 54 Premium Voices • StyleTTS 2 Architecture**
    Choose from American & British voices with unique characteristics!
""")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### 🎭 Voice Selection")
voice_selector = gr.Dropdown(
choices=get_voice_list(),
value=get_voice_list()[0],
label="Choose Voice (54 options)",
interactive=True
)
gr.Markdown("### βš™οΈ Settings")
speed = gr.Slider(
minimum=0.5,
maximum=2.0,
value=1.0,
step=0.05,
label="Speed",
interactive=True
)
gr.Markdown("""
### 🌟 Voice Categories
- πŸ‡ΊπŸ‡Έ **American Female**: 11 voices
- πŸ‡ΊπŸ‡Έ **American Male**: 8 voices
- πŸ‡¬πŸ‡§ **British Female**: 4 voices
- πŸ‡¬πŸ‡§ **British Male**: 4 voices
""")
with gr.Column(scale=2):
gr.Markdown("### πŸ“ Text Input")
text_input = gr.Textbox(
lines=5,
placeholder="Enter your text here... Kokoro-82M supports natural prosody and emotion!",
value="Welcome to Kokoro-82M! Choose from 54 premium voices powered by StyleTTS 2.",
label="Text to synthesize"
)
            generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
status_text = gr.Textbox(label="Status", interactive=False)
audio_output = gr.Audio(
label="Generated Audio",
type="numpy",
interactive=False
)
gr.Markdown("""
            ### 📊 Model Information
            - **Model**: Kokoro-82M
            - **Architecture**: StyleTTS 2 + ISTFTNet
            - **Parameters**: 82 Million
            - **License**: Apache 2.0
            - **Training data**: A few hundred hours of permissively licensed audio
""")
# Connect event
generate_btn.click(
fn=generate_speech,
inputs=[text_input, voice_selector, speed],
outputs=[audio_output, status_text]
)
gr.Markdown("""
    ---
    **Note**: This demo calls the Hugging Face Inference API. The first generation may take 20-30 seconds while the model loads;
    subsequent generations are faster (~2-5 seconds).
""")
if __name__ == "__main__":
demo.launch()
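# Assumed requirements for this demo: gradio, numpy, and requests; the kokoro package is
# only needed if you use the generate_speech_local sketch above.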