"""
Kokoro-82M TTS with 54 Voices
Built on StyleTTS 2 Architecture
"""
import os
import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from io import BytesIO
import requests
# Voice database - 27 voices (a subset of Kokoro-82M's 54)
VOICES = {
    "American Female": {
        "af_heart": "Heart - Warm & Friendly",
        "af_bella": "Bella - Elegant & Smooth",
        "af_nicole": "Nicole - Professional",
        "af_aoede": "Aoede - Cheerful",
        "af_kore": "Kore - Gentle",
        "af_sarah": "Sarah - Clear",
        "af_nova": "Nova - Modern",
        "af_sky": "Sky - Light",
        "af_alloy": "Alloy - Versatile",
        "af_jessica": "Jessica - Natural",
        "af_river": "River - Calm"
    },
    "American Male": {
        "am_michael": "Michael - Deep & Authoritative",
        "am_fenrir": "Fenrir - Strong",
        "am_puck": "Puck - Playful",
        "am_echo": "Echo - Resonant",
        "am_eric": "Eric - Professional",
        "am_liam": "Liam - Friendly",
        "am_onyx": "Onyx - Rich",
        "am_adam": "Adam - Natural"
    },
    "British Female": {
        "bf_emma": "Emma - Refined",
        "bf_isabella": "Isabella - Elegant",
        "bf_alice": "Alice - Clear",
        "bf_lily": "Lily - Soft"
    },
    "British Male": {
        "bm_george": "George - Distinguished",
        "bm_fable": "Fable - Storyteller",
        "bm_lewis": "Lewis - Smooth",
        "bm_daniel": "Daniel - Professional"
    }
}
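# VOICES maps category -> {voice_id: "Name - Description"}; the ids follow Kokoro's
# prefix convention (af/am = American female/male, bf/bm = British female/male).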
# Flatten voice dict for dropdown
def get_voice_list():
    voice_list = []
    for category, voices in VOICES.items():
        for voice_id, desc in voices.items():
            voice_list.append(f"{desc} ({voice_id})")
    return voice_list
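# Example of a label produced above (generate_speech parses the voice id back out of it):
#   "Heart - Warm & Friendly (af_heart)"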
def generate_speech(text, voice_dropdown, speed):
    """Generate speech using Kokoro-82M via the HF Inference API"""
    if not text.strip():
        return None, "❌ Please enter some text"
    # Extract voice_id from the dropdown label, e.g. "Heart - Warm & Friendly (af_heart)"
    voice_id = voice_dropdown.split("(")[-1].strip(")")
    try:
        # Use the Hugging Face Inference API
        API_URL = "https://api-inference.huggingface.co/models/hexgrad/Kokoro-82M"
        headers = {"Content-Type": "application/json"}
        # Optional: if the endpoint requires authentication, a token can be
        # supplied via the HF_TOKEN environment variable (not set by default here)
        hf_token = os.environ.get("HF_TOKEN")
        if hf_token:
            headers["Authorization"] = f"Bearer {hf_token}"
        payload = {
            "inputs": text,
            "parameters": {
                "voice": voice_id,
                "speed": speed
            }
        }
        response = requests.post(API_URL, headers=headers, json=payload)
        if response.status_code == 200:
            # Decode the returned WAV bytes into (sample_rate, samples) so that
            # gr.Audio(type="numpy") can play them
            sample_rate, audio_data = wavfile.read(BytesIO(response.content))
            return (sample_rate, audio_data), f"✅ Generated with {voice_id} at {speed}x speed"
        else:
            return None, f"❌ API Error: {response.status_code}"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"
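# Hypothetical quick check outside the UI (requires network access to the Inference API);
# on success it returns ((sample_rate, samples), status_message):
#   audio, status = generate_speech("Hello!", "Heart - Warm & Friendly (af_heart)", 1.0)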
# Build Gradio interface
with gr.Blocks(title="Kokoro-82M TTS - 54 Voices", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ Kokoro-82M Text-to-Speech
    **82 Million Parameters • 54 Premium Voices • StyleTTS 2 Architecture**

    Choose from American & British voices with unique characteristics!
    """)
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Voice Selection")
            voice_selector = gr.Dropdown(
                choices=get_voice_list(),
                value=get_voice_list()[0],
                label="Choose Voice",
                interactive=True
            )

            gr.Markdown("### ⚙️ Settings")
            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.05,
                label="Speed",
                interactive=True
            )

            gr.Markdown("""
            ### Voice Categories
            - 🇺🇸 **American Female**: 11 voices
            - 🇺🇸 **American Male**: 8 voices
            - 🇬🇧 **British Female**: 4 voices
            - 🇬🇧 **British Male**: 4 voices
            """)
        with gr.Column(scale=2):
            gr.Markdown("### Text Input")
            text_input = gr.Textbox(
                lines=5,
                placeholder="Enter your text here... Kokoro-82M supports natural prosody and emotion!",
                value="Welcome to Kokoro-82M! Choose from 54 premium voices powered by StyleTTS 2.",
                label="Text to synthesize"
            )

            generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
            status_text = gr.Textbox(label="Status", interactive=False)
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy",
                interactive=False
            )
gr.Markdown("""
### π Model Information
- **Model**: Kokoro-82M
- **Architecture**: StyleTTS 2 + ISTFTNet
- **Parameters**: 82 Million
- **License**: Apache 2.0
- **Training**: Few hundred hours of permissive data
""")
    # Connect event
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_selector, speed],
        outputs=[audio_output, status_text]
    )
gr.Markdown("""
---
**Note**: This uses Hugging Face Inference API. First generation may take 20-30 seconds for model loading.
Subsequent generations are faster (~2-5 seconds).
""")
if __name__ == "__main__":
    demo.launch()