"""
Kokoro-82M TTS demo with 27 English voices (the full release ships 54)
Built on the StyleTTS 2 architecture; synthesis runs through the Hugging Face Inference API
"""
import os
import tempfile
import gradio as gr
import numpy as np
import requests
# Voice database: 27 English voices (11 + 8 + 4 + 4 below; the full Kokoro-82M release ships 54)
VOICES = {
"American Female": {
"af_heart": "Heart - Warm & Friendly",
"af_bella": "Bella - Elegant & Smooth",
"af_nicole": "Nicole - Professional",
"af_aoede": "Aoede - Cheerful",
"af_kore": "Kore - Gentle",
"af_sarah": "Sarah - Clear",
"af_nova": "Nova - Modern",
"af_sky": "Sky - Light",
"af_alloy": "Alloy - Versatile",
"af_jessica": "Jessica - Natural",
"af_river": "River - Calm"
},
"American Male": {
"am_michael": "Michael - Deep & Authoritative",
"am_fenrir": "Fenrir - Strong",
"am_puck": "Puck - Playful",
"am_echo": "Echo - Resonant",
"am_eric": "Eric - Professional",
"am_liam": "Liam - Friendly",
"am_onyx": "Onyx - Rich",
"am_adam": "Adam - Natural"
},
"British Female": {
"bf_emma": "Emma - Refined",
"bf_isabella": "Isabella - Elegant",
"bf_alice": "Alice - Clear",
"bf_lily": "Lily - Soft"
},
"British Male": {
"bm_george": "George - Distinguished",
"bm_fable": "Fable - Storyteller",
"bm_lewis": "Lewis - Smooth",
"bm_daniel": "Daniel - Professional"
}
}
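# Voice ID prefixes mirror the categories above: af/am = American female/male, bf/bm = British female/male.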
# Flatten voice dict for dropdown
def get_voice_list():
voice_list = []
for category, voices in VOICES.items():
for voice_id, desc in voices.items():
voice_list.append(f"{desc} ({voice_id})")
return voice_list
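# Dropdown labels look like "Heart - Warm & Friendly (af_heart)"; generate_speech below
# parses the voice_id back out of the trailing parentheses.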
def generate_speech(text, voice_dropdown, speed):
"""Generate speech using Kokoro-82M via HF API"""
if not text.strip():
return None, "❌ Please enter some text"
# Extract voice_id from dropdown selection
voice_id = voice_dropdown.split("(")[-1].strip(")")
try:
# Use Hugging Face Inference API
API_URL = "https://api-inference.huggingface.co/models/hexgrad/Kokoro-82M"
headers = {
"Content-Type": "application/json"
}
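        # Assumption: the serverless Inference API generally expects an access token.
        # If an HF_TOKEN environment variable is set (e.g. as a Space secret), send it along.
        hf_token = os.environ.get("HF_TOKEN")
        if hf_token:
            headers["Authorization"] = f"Bearer {hf_token}"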
payload = {
"inputs": text,
"parameters": {
"voice": voice_id,
"speed": speed
}
}
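        # Caveat: whether the hosted endpoint honors "voice" and "speed" depends on the
        # model's Inference API schema; treat these parameter names as an assumption and
        # adjust them if requests are rejected.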
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
        if response.status_code == 200:
            # The API returns encoded audio bytes; write them to a temporary file so
            # the gr.Audio output can play them back from a filepath.
            content_type = response.headers.get("content-type", "")
            suffix = ".flac" if "flac" in content_type else ".wav"
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as f:
                f.write(response.content)
                audio_path = f.name
            return audio_path, f"✅ Generated with {voice_id} at {speed}x speed"
        else:
            return None, f"❌ API Error: {response.status_code} - {response.text[:200]}"
except Exception as e:
return None, f"❌ Error: {str(e)}"
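# Optional alternative: run Kokoro locally instead of calling the Inference API.
# This is a minimal sketch and is not wired into the UI below. It assumes the `kokoro`
# pip package and the KPipeline usage shown on the hexgrad/Kokoro-82M model card
# (lang_code "a" = American English, "b" = British English, 24 kHz output); verify
# against the current kokoro README before relying on it.
def generate_speech_local(text, voice_id, speed):
    """Synthesize locally with the kokoro package; returns (sample_rate, waveform)."""
    from kokoro import KPipeline  # lazy import: only needed for local inference
    lang_code = voice_id[0]  # "a" for af_/am_ voices, "b" for bf_/bm_ voices
    pipeline = KPipeline(lang_code=lang_code)
    chunks = [audio for _, _, audio in pipeline(text, voice=voice_id, speed=speed)]
    return 24000, np.concatenate(chunks)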
# Build Gradio interface
with gr.Blocks(title="Kokoro-82M TTS - 54 Voices", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
    # 🎙️ Kokoro-82M Text-to-Speech
    **82 Million Parameters • 54 Premium Voices • StyleTTS 2 Architecture**
    Choose from American & British voices with unique characteristics!
""")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### 🎭 Voice Selection")
voice_selector = gr.Dropdown(
choices=get_voice_list(),
value=get_voice_list()[0],
label="Choose Voice (54 options)",
interactive=True
)
gr.Markdown("### βš™οΈ Settings")
speed = gr.Slider(
minimum=0.5,
maximum=2.0,
value=1.0,
step=0.05,
label="Speed",
interactive=True
)
gr.Markdown("""
### 🌟 Voice Categories
- πŸ‡ΊπŸ‡Έ **American Female**: 11 voices
- πŸ‡ΊπŸ‡Έ **American Male**: 8 voices
- πŸ‡¬πŸ‡§ **British Female**: 4 voices
- πŸ‡¬πŸ‡§ **British Male**: 4 voices
""")
with gr.Column(scale=2):
gr.Markdown("### πŸ“ Text Input")
text_input = gr.Textbox(
lines=5,
placeholder="Enter your text here... Kokoro-82M supports natural prosody and emotion!",
value="Welcome to Kokoro-82M! Choose from 54 premium voices powered by StyleTTS 2.",
label="Text to synthesize"
)
            generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
status_text = gr.Textbox(label="Status", interactive=False)
audio_output = gr.Audio(
label="Generated Audio",
type="numpy",
interactive=False
)
gr.Markdown("""
            ### 📊 Model Information
            - **Model**: Kokoro-82M
            - **Architecture**: StyleTTS 2 + ISTFTNet
            - **Parameters**: 82 Million
            - **License**: Apache 2.0
            - **Training data**: A few hundred hours of permissively licensed audio
""")
# Connect event
generate_btn.click(
fn=generate_speech,
inputs=[text_input, voice_selector, speed],
outputs=[audio_output, status_text]
)
gr.Markdown("""
    ---
    **Note**: This demo calls the Hugging Face Inference API. The first generation may take 20-30 seconds while the model loads;
    subsequent generations are faster (~2-5 seconds).
""")
if __name__ == "__main__":
demo.launch()
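# Assumed requirements for this demo: gradio, numpy, and requests; the kokoro package is
# only needed if you use the generate_speech_local sketch above.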