File size: 5,510 Bytes
cda5546
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""
Kokoro-82M TTS with 54 Voices
Built on StyleTTS 2 Architecture
"""

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from io import BytesIO
import requests
import json

# Voice catalogue offered in the UI, grouped by accent/gender category.
# Outer keys are category labels; each inner dict maps a voice id (sent to the
# API as the "voice" parameter) to a human-readable description. Ids within a
# category share a two-letter prefix: af_, am_, bf_, bm_.
# NOTE(review): the UI text advertises 54 voices, but only 27 are listed here
# (11 + 8 + 4 + 4) - confirm whether the remaining voices should be added.
VOICES = {
    "American Female": {
        "af_heart": "Heart - Warm & Friendly",
        "af_bella": "Bella - Elegant & Smooth",
        "af_nicole": "Nicole - Professional",
        "af_aoede": "Aoede - Cheerful",
        "af_kore": "Kore - Gentle",
        "af_sarah": "Sarah - Clear",
        "af_nova": "Nova - Modern",
        "af_sky": "Sky - Light",
        "af_alloy": "Alloy - Versatile",
        "af_jessica": "Jessica - Natural",
        "af_river": "River - Calm"
    },
    "American Male": {
        "am_michael": "Michael - Deep & Authoritative",
        "am_fenrir": "Fenrir - Strong",
        "am_puck": "Puck - Playful",
        "am_echo": "Echo - Resonant",
        "am_eric": "Eric - Professional",
        "am_liam": "Liam - Friendly",
        "am_onyx": "Onyx - Rich",
        "am_adam": "Adam - Natural"
    },
    "British Female": {
        "bf_emma": "Emma - Refined",
        "bf_isabella": "Isabella - Elegant",
        "bf_alice": "Alice - Clear",
        "bf_lily": "Lily - Soft"
    },
    "British Male": {
        "bm_george": "George - Distinguished",
        "bm_fable": "Fable - Storyteller",
        "bm_lewis": "Lewis - Smooth",
        "bm_daniel": "Daniel - Professional"
    }
}

# Flatten voice dict for dropdown
def get_voice_list():
    """Return every voice as a ``"<description> (<voice_id>)"`` label.

    Labels follow the iteration order of ``VOICES``: category by category,
    then voice by voice within each category.
    """
    return [
        f"{label} ({vid})"
        for group in VOICES.values()
        for vid, label in group.items()
    ]

def generate_speech(text, voice_dropdown, speed):
    """Generate speech using Kokoro-82M via the Hugging Face Inference API.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected with a status message.
        voice_dropdown: Dropdown label of the form
            ``"<description> (<voice_id>)"``; the voice id is read from the
            trailing parentheses. ``None`` or empty is rejected.
        speed: Speed multiplier, forwarded to the API unchanged.

    Returns:
        A ``(audio, status)`` tuple. ``audio`` is the raw response body
        (bytes) when the API answers HTTP 200 and ``None`` otherwise;
        ``status`` is a human-readable message for the status box. Request
        failures are reported through ``status`` rather than raised.
    """
    # Guard against None as well as blank strings so the UI gets a message
    # instead of an AttributeError.
    if not text or not text.strip():
        return None, "❌ Please enter some text"

    if not voice_dropdown:
        return None, "❌ Please select a voice"

    # Extract voice_id from the dropdown selection: the text inside the last
    # pair of parentheses.
    voice_id = voice_dropdown.split("(")[-1].strip(")")

    try:
        # Use Hugging Face Inference API
        API_URL = "https://api-inference.huggingface.co/models/hexgrad/Kokoro-82M"

        headers = {
            "Content-Type": "application/json"
        }

        payload = {
            "inputs": text,
            "parameters": {
                "voice": voice_id,
                "speed": speed
            }
        }

        # requests has no default timeout; without one a stalled connection
        # would block this handler forever. The limit is generous because a
        # cold model can take tens of seconds to respond.
        response = requests.post(
            API_URL, headers=headers, json=payload, timeout=120
        )

        if response.status_code == 200:
            # Hand the raw audio bytes back for playback.
            audio_bytes = response.content
            return audio_bytes, f"✅ Generated with {voice_id} at {speed}x speed"

        return None, f"❌ API Error: {response.status_code}"

    except Exception as e:
        # UI boundary: surface any failure (network error, timeout, ...) as a
        # status message instead of crashing the event handler.
        return None, f"❌ Error: {str(e)}"

# Build the Gradio interface: voice and speed controls in the left column,
# text input, status and audio output in the right column.
with gr.Blocks(title="Kokoro-82M TTS - 54 Voices", theme=gr.themes.Soft()) as demo:

    # Build the label list once; it feeds the dropdown's choices, its default
    # value, and the option count shown in its label.
    voice_choices = get_voice_list()

    gr.Markdown("""
    # 🎙️ Kokoro-82M Text-to-Speech

    **82 Million Parameters • 54 Premium Voices • StyleTTS 2 Architecture**

    Choose from American & British voices with unique characteristics!
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🎭 Voice Selection")

            voice_selector = gr.Dropdown(
                choices=voice_choices,
                # Default to the first voice; fall back to no selection if
                # the catalogue is ever empty.
                value=voice_choices[0] if voice_choices else None,
                # Show the number of voices actually selectable here rather
                # than a hard-coded figure.
                label=f"Choose Voice ({len(voice_choices)} options)",
                interactive=True
            )

            gr.Markdown("### ⚙️ Settings")
            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.05,
                label="Speed",
                interactive=True
            )

            gr.Markdown("""
            ### 🌟 Voice Categories
            - 🇺🇸 **American Female**: 11 voices
            - 🇺🇸 **American Male**: 8 voices
            - 🇬🇧 **British Female**: 4 voices
            - 🇬🇧 **British Male**: 4 voices
            """)

        with gr.Column(scale=2):
            gr.Markdown("### 📝 Text Input")

            text_input = gr.Textbox(
                lines=5,
                placeholder="Enter your text here... Kokoro-82M supports natural prosody and emotion!",
                value="Welcome to Kokoro-82M! Choose from 54 premium voices powered by StyleTTS 2.",
                label="Text to synthesize"
            )

            generate_btn = gr.Button("🎀 Generate Speech", variant="primary", size="lg")

            status_text = gr.Textbox(label="Status", interactive=False)

            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy",
                interactive=False
            )

            gr.Markdown("""
            ### 📊 Model Information
            - **Model**: Kokoro-82M
            - **Architecture**: StyleTTS 2 + ISTFTNet
            - **Parameters**: 82 Million
            - **License**: Apache 2.0
            - **Training**: Few hundred hours of permissive data
            """)

    # Connect event: the button sends (text, voice label, speed) to
    # generate_speech, whose (audio, status) result fills the two outputs.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_selector, speed],
        outputs=[audio_output, status_text]
    )

    gr.Markdown("""
    ---
    **Note**: This uses Hugging Face Inference API. First generation may take 20-30 seconds for model loading.
    Subsequent generations are faster (~2-5 seconds).
    """)

# Start the Gradio app only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    demo.launch()