# Supertonic-66M / app.py
# Hugging Face Space by Nymbo -- commit 0ece291 (verified): "Update app.py"
import gradio as gr
import os
import io
import wave
import numpy as np
import soundfile as sf
from huggingface_hub import snapshot_download
from helper import load_text_to_speech, load_voice_style
_SUPERTONIC_STATE = {"initialized": False, "tts": None, "assets_dir": None}
def _init_supertonic() -> None:
    """Load the Supertonic engine once and cache it in ``_SUPERTONIC_STATE``.

    Downloads the ``Supertone/supertonic`` snapshot into an ``assets``
    directory next to this file when the model files are missing, loads the
    models from ``assets/onnx`` with ``use_gpu=False``, and stores the engine
    and the assets path in the module-level state. Subsequent calls return
    immediately.
    """
    if _SUPERTONIC_STATE["initialized"]:
        return

    print("Initializing Supertonic...")

    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    onnx_dir = os.path.join(assets_dir, "onnx")

    # Test for the ONNX directory rather than only the assets root: an
    # interrupted download can leave ``assets`` behind without the models, and
    # checking just the root would then skip the download on every later run.
    if not os.path.isdir(onnx_dir):
        print(f"Downloading Supertonic models to {assets_dir}...")
        snapshot_download(repo_id="Supertone/supertonic", local_dir=assets_dir)

    tts = load_text_to_speech(onnx_dir, use_gpu=False)
    _SUPERTONIC_STATE.update({"initialized": True, "tts": tts, "assets_dir": assets_dir})
    print("Supertonic initialized.")
def get_supertonic_voices():
    """Get list of available Supertonic voice styles.

    Returns:
        The sorted names of the ``*.json`` files found in
        ``assets/voice_styles`` with the extension removed, or an empty list
        when that directory does not exist.
    """
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        # Nothing on disk yet: run the full initialisation (which downloads
        # the assets) so there is something to list.
        _init_supertonic()
        assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_styles_dir = os.path.join(assets_dir, "voice_styles")
    if not os.path.isdir(voice_styles_dir):
        return []

    suffix = ".json"
    # Strip only the trailing extension. ``str.replace`` would also remove any
    # ".json" occurring inside the name, producing a voice name that no longer
    # maps back to its file when "<voice>.json" is rebuilt later.
    return sorted(
        name[: -len(suffix)]
        for name in os.listdir(voice_styles_dir)
        if name.endswith(suffix)
    )
def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
audio_clipped = np.clip(audio_np, -1.0, 1.0)
return (audio_clipped * 32767.0).astype(np.int16)
def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
buffer = io.BytesIO()
with wave.open(buffer, "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(sample_rate)
wf.writeframes(audio_int16.tobytes())
return buffer.getvalue()
def supertonic_tts(text: str, speed: float, voice: str, steps: int, silence_duration: float, max_len: int):
    """Stream synthesized speech for ``text`` as a series of WAV chunks.

    This is a generator, so validation errors surface when iteration starts.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        speed: Forwarded unchanged to ``tts.stream``.
        voice: Name of a voice style, i.e. the stem of a JSON file in
            ``assets/voice_styles``.
        steps: Forwarded unchanged to ``tts.stream``.
        silence_duration: Forwarded unchanged to ``tts.stream``.
        max_len: Forwarded unchanged to ``tts.stream``.

    Yields:
        bytes: One complete mono 16-bit WAV file per chunk produced by
        ``tts.stream``.

    Raises:
        gr.Error: If the text is empty, the voice style does not exist, or
            synthesis fails.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_supertonic()
    tts = _SUPERTONIC_STATE["tts"]
    assets_dir = _SUPERTONIC_STATE["assets_dir"]

    # The voice dropdown accepts custom values and this function is exposed as
    # an API endpoint, so ``voice`` is untrusted. Only accept a bare file stem:
    # anything with a directory component (e.g. "../../x") could otherwise
    # point the loader at a JSON file outside ``voice_styles``.
    if not isinstance(voice, str) or not voice or os.path.basename(voice) != voice:
        raise gr.Error(f"Voice style {voice} not found.")

    voice_path = os.path.join(assets_dir, "voice_styles", f"{voice}.json")
    if not os.path.exists(voice_path):
        raise gr.Error(f"Voice style {voice} not found.")

    style = load_voice_style([voice_path])

    try:
        sr = tts.sample_rate
        for audio_chunk in tts.stream(text, style, steps, speed, silence_duration, max_len):
            audio_int16 = _audio_np_to_int16(audio_chunk)
            yield _wav_bytes_from_int16(audio_int16, sr)
    except Exception as e:
        # Chain the original exception so the real traceback stays in the logs.
        raise gr.Error(f"Error during speech generation: {str(e)}") from e
# Gradio UI: synthesis controls, a text box, a trigger button and a streaming
# audio player, all wired to supertonic_tts.
# NOTE(review): the nesting below was reconstructed; confirm that exactly these
# five controls belong inside the Row.
with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>Supertonic-TTS</h1><p style='text-align: center;'>Powered by Supertone/Supertonic on CPU</p>")

    # Listing voices may trigger the model download/initialisation. This is
    # best-effort: on failure the dropdown starts empty (custom values are
    # still allowed), but the reason is logged instead of silently discarded.
    try:
        available_voices = get_supertonic_voices()
    except Exception as exc:
        print(f"Could not list Supertonic voices: {exc}")
        available_voices = []

    default_voice = available_voices[0] if available_voices else None

    with gr.Row(variant='panel'):
        speed_slider = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label='Speed'
        )
        steps_slider = gr.Slider(
            minimum=1,
            maximum=50,
            value=5,
            step=1,
            label='Steps (Quality vs Speed)'
        )
        voice_dropdown = gr.Dropdown(
            choices=available_voices,
            label='Voice',
            value=default_voice,
            allow_custom_value=True
        )
        silence_slider = gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=0.3,
            step=0.1,
            label='Silence Duration (s)'
        )
        maxlen_slider = gr.Slider(
            minimum=50,
            maximum=1000,
            value=300,
            step=10,
            label='Max Chunk Length'
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        lines=5,
        value="This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    )
    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )
    audio_output = gr.Audio(
        label="Generated Speech",
        streaming=True,
        autoplay=True
    )

    # Input order must match the positional parameters of supertonic_tts:
    # (text, speed, voice, steps, silence_duration, max_len).
    generate_inputs = [text_input, speed_slider, voice_dropdown, steps_slider, silence_slider, maxlen_slider]

    # Both the button click and submitting the text box run the same handler.
    generate_btn.click(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech"
    )
    text_input.submit(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter"
    )
if __name__ == "__main__":
    # Script entry point: call queue() on the app, then launch it with the
    # custom theme.
    queued_app = demo.queue()
    queued_app.launch(theme='Nymbo/Nymbo_Theme')