Spaces:
Running
Running
| import gradio as gr | |
| import os | |
| import io | |
| import wave | |
| import numpy as np | |
| import soundfile as sf | |
| from huggingface_hub import snapshot_download | |
| from helper import load_text_to_speech, load_voice_style | |
# Module-level cache for the lazily created Supertonic engine, so repeated
# calls reuse the already loaded model instead of loading it again.
_SUPERTONIC_STATE = {"initialized": False, "tts": None, "assets_dir": None}


def _init_supertonic() -> None:
    """Load the Supertonic TTS engine once and cache it in ``_SUPERTONIC_STATE``.

    Model assets are expected in an ``assets`` directory next to this file.
    When the ``onnx`` model directory is missing, the ``Supertone/supertonic``
    repository is downloaded into ``assets`` first. Calls made after a
    successful initialization return immediately.
    """
    if _SUPERTONIC_STATE["initialized"]:
        return

    print("Initializing Supertonic...")

    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    onnx_dir = os.path.join(assets_dir, "onnx")

    # Key the download on the model directory that is actually loaded below,
    # not on ``assets`` itself: an empty or partially populated ``assets``
    # folder (e.g. after an interrupted download) must not suppress the
    # download forever.
    if not os.path.isdir(onnx_dir):
        print(f"Downloading Supertonic models to {assets_dir}...")
        snapshot_download(repo_id="Supertone/supertonic", local_dir=assets_dir)

    tts = load_text_to_speech(onnx_dir, use_gpu=False)

    # The state is flagged as initialized only after loading succeeded, so a
    # failed attempt is retried on the next call.
    _SUPERTONIC_STATE.update({"initialized": True, "tts": tts, "assets_dir": assets_dir})
    print("Supertonic initialized.")
def get_supertonic_voices():
    """Get list of available Supertonic voice styles.

    Returns:
        Sorted names of the ``*.json`` files found in ``assets/voice_styles``
        with the ``.json`` extension removed, or an empty list when that
        directory does not exist.
    """
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        # Nothing on disk yet: initialize (which downloads the assets) and
        # use the directory recorded by the initializer.
        _init_supertonic()
        assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_styles_dir = os.path.join(assets_dir, "voice_styles")
    if not os.path.isdir(voice_styles_dir):
        return []

    suffix = ".json"
    # Strip only the trailing extension; ``str.replace`` would also remove a
    # ".json" that occurs in the middle of a file name.
    return sorted(
        name[: -len(suffix)]
        for name in os.listdir(voice_styles_dir)
        if name.endswith(suffix)
    )
def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
    """Convert floating-point audio samples to 16-bit integers.

    Samples are limited to the range [-1.0, 1.0], scaled by 32767 and cast
    to ``np.int16``.
    """
    limited = np.clip(audio_np, -1.0, 1.0)
    scaled = np.multiply(limited, 32767.0)
    return scaled.astype(np.int16)
def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
    """Wrap 16-bit samples in an in-memory, single-channel WAV container.

    Args:
        audio_int16: Samples whose raw bytes become the WAV frame data.
        sample_rate: Frame rate written to the WAV header.

    Returns:
        The complete WAV file content as bytes.
    """
    pcm_payload = audio_int16.tobytes()
    sink = io.BytesIO()
    with wave.open(sink, "wb") as writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname):
        # one channel, two bytes per sample, uncompressed.
        writer.setparams((1, 2, sample_rate, 0, "NONE", "not compressed"))
        writer.writeframes(pcm_payload)
    return sink.getvalue()
def supertonic_tts(text: str, speed: float, voice: str, steps: int, silence_duration: float, max_len: int):
    """Stream synthesized speech for ``text`` as a sequence of WAV chunks.

    This is a generator, so validation and synthesis only start once the
    first chunk is requested.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        speed: Forwarded unchanged to ``tts.stream``.
        voice: Name of a voice style, i.e. the stem of a JSON file inside
            the ``voice_styles`` directory of the assets folder.
        steps: Forwarded to ``tts.stream`` after conversion to ``int``.
        silence_duration: Forwarded unchanged to ``tts.stream``.
        max_len: Forwarded to ``tts.stream`` after conversion to ``int``.

    Yields:
        bytes: One complete WAV file per audio chunk produced by
        ``tts.stream``.

    Raises:
        gr.Error: If ``text`` is empty, the voice style cannot be found, or
            synthesis fails.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_supertonic()
    tts = _SUPERTONIC_STATE["tts"]
    assets_dir = _SUPERTONIC_STATE["assets_dir"]

    # ``voice`` is caller-controlled (the dropdown allows custom values), so
    # refuse anything containing a path component: a request must not be
    # able to point outside the voice_styles directory.
    voice_name = str(voice)
    voice_path = os.path.join(assets_dir, "voice_styles", f"{voice_name}.json")
    if os.path.basename(voice_name) != voice_name or not os.path.isfile(voice_path):
        raise gr.Error(f"Voice style {voice} not found.")

    style = load_voice_style([voice_path])

    try:
        sr = tts.sample_rate
        # Callers may send the numeric controls as floats; the signature
        # declares ``steps`` and ``max_len`` as integers, so coerce them.
        for audio_chunk in tts.stream(text, style, int(steps), speed, silence_duration, int(max_len)):
            audio_int16 = _audio_np_to_int16(audio_chunk)
            yield _wav_bytes_from_int16(audio_int16, sr)
    except Exception as e:
        # Surface the failure in the UI while keeping the original cause
        # attached for the server-side traceback.
        raise gr.Error(f"Error during speech generation: {str(e)}") from e
# Gradio UI: synthesis controls, a text box, and a streaming audio player
# wired to ``supertonic_tts``.
with gr.Blocks() as demo:
    gr.HTML("<h1 style='text-align: center;'>Supertonic-TTS</h1><p style='text-align: center;'>Powered by Supertone/Supertonic on CPU</p>")

    # The voice list is read while the UI is being built, which may trigger
    # the model download. If that fails, the app still starts with an empty
    # list; the dropdown accepts custom values, so a voice name can be typed.
    try:
        available_voices = get_supertonic_voices()
    except Exception as exc:
        # Best effort: keep the app bootable, but report why the list is empty.
        print(f"Could not list Supertonic voices: {exc}")
        available_voices = []
    default_voice = available_voices[0] if available_voices else None

    # NOTE(review): which controls sit inside this Row is inferred from the
    # statement order; confirm against the intended layout.
    with gr.Row(variant="panel"):
        speed_slider = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")
        steps_slider = gr.Slider(
            minimum=1, maximum=50, value=5, step=1, label="Steps (Quality vs Speed)"
        )
        voice_dropdown = gr.Dropdown(
            choices=available_voices,
            label="Voice",
            value=default_voice,
            allow_custom_value=True,
        )
        silence_slider = gr.Slider(
            minimum=0.0, maximum=2.0, value=0.3, step=0.1, label="Silence Duration (s)"
        )
        maxlen_slider = gr.Slider(
            minimum=50, maximum=1000, value=300, step=10, label="Max Chunk Length"
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        lines=5,
        value="This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen.",
    )
    generate_btn = gr.Button("Generate Speech", variant="primary")
    audio_output = gr.Audio(label="Generated Speech", streaming=True, autoplay=True)

    # Order must match the positional parameters of ``supertonic_tts``.
    generate_inputs = [text_input, speed_slider, voice_dropdown, steps_slider, silence_slider, maxlen_slider]

    # Both the button and pressing Enter in the text box start synthesis;
    # each is registered under its own API name.
    generate_btn.click(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech",
    )
    text_input.submit(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter",
    )
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    queued_app = demo.queue()
    queued_app.launch(theme='Nymbo/Nymbo_Theme')