Upload 36 files
- app.py +161 -0
- assets/.cache/huggingface/.gitignore +1 -0
- assets/.cache/huggingface/download/.gitattributes.metadata +3 -0
- assets/.cache/huggingface/download/.gitignore.metadata +3 -0
- assets/.cache/huggingface/download/LICENSE.metadata +3 -0
- assets/.cache/huggingface/download/README.md.metadata +3 -0
- assets/.cache/huggingface/download/config.json.metadata +3 -0
- assets/.cache/huggingface/download/onnx/duration_predictor.onnx.metadata +3 -0
- assets/.cache/huggingface/download/onnx/text_encoder.onnx.metadata +3 -0
- assets/.cache/huggingface/download/onnx/tts.json.metadata +3 -0
- assets/.cache/huggingface/download/onnx/tts.yml.metadata +3 -0
- assets/.cache/huggingface/download/onnx/unicode_indexer.json.metadata +3 -0
- assets/.cache/huggingface/download/onnx/vector_estimator.onnx.metadata +3 -0
- assets/.cache/huggingface/download/onnx/vocoder.onnx.metadata +3 -0
- assets/.cache/huggingface/download/voice_styles/F1.json.metadata +3 -0
- assets/.cache/huggingface/download/voice_styles/F2.json.metadata +3 -0
- assets/.cache/huggingface/download/voice_styles/M1.json.metadata +3 -0
- assets/.cache/huggingface/download/voice_styles/M2.json.metadata +3 -0
- assets/.gitattributes +35 -0
- assets/.gitignore +4 -0
- assets/LICENSE +209 -0
- assets/README.md +161 -0
- assets/config.json +5 -0
- assets/onnx/duration_predictor.onnx +3 -0
- assets/onnx/text_encoder.onnx +3 -0
- assets/onnx/tts.json +316 -0
- assets/onnx/tts.yml +223 -0
- assets/onnx/unicode_indexer.json +0 -0
- assets/onnx/vector_estimator.onnx +3 -0
- assets/onnx/vocoder.onnx +3 -0
- assets/voice_styles/F1.json +0 -0
- assets/voice_styles/F2.json +0 -0
- assets/voice_styles/M1.json +0 -0
- assets/voice_styles/M2.json +0 -0
- helper.py +349 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,161 @@
import gradio as gr
import os
import io
import wave
import numpy as np
import soundfile as sf
from huggingface_hub import snapshot_download
from helper import load_text_to_speech, load_voice_style

_SUPERTONIC_STATE = {"initialized": False, "tts": None, "assets_dir": None}

def _init_supertonic() -> None:
    if _SUPERTONIC_STATE["initialized"]:
        return

    print("Initializing Supertonic...")
    # Download models if not present
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        print(f"Downloading Supertonic models to {assets_dir}...")
        snapshot_download(repo_id="Supertone/supertonic", local_dir=assets_dir)

    onnx_dir = os.path.join(assets_dir, "onnx")
    tts = load_text_to_speech(onnx_dir, use_gpu=False)

    _SUPERTONIC_STATE.update({"initialized": True, "tts": tts, "assets_dir": assets_dir})
    print("Supertonic initialized.")

def get_supertonic_voices():
    """Get list of available Supertonic voice styles."""
    # Ensure assets are downloaded to list voices
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        # If not initialized/downloaded yet, we might not see voices.
        # But we can try to download just to list, or just init.
        _init_supertonic()
        assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_styles_dir = os.path.join(assets_dir, "voice_styles")
    if not os.path.exists(voice_styles_dir):
        return []

    files = os.listdir(voice_styles_dir)
    voices = [f.replace('.json', '') for f in files if f.endswith('.json')]
    return sorted(voices)

def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
    audio_clipped = np.clip(audio_np, -1.0, 1.0)
    return (audio_clipped * 32767.0).astype(np.int16)

def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio_int16.tobytes())
    return buffer.getvalue()

def supertonic_tts(text: str, speed: float, voice: str, steps: int):
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_supertonic()
    tts = _SUPERTONIC_STATE["tts"]
    assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_path = os.path.join(assets_dir, "voice_styles", f"{voice}.json")
    if not os.path.exists(voice_path):
        raise gr.Error(f"Voice style {voice} not found.")

    style = load_voice_style([voice_path])

    try:
        sr = tts.sample_rate
        for audio_chunk in tts.stream(text, style, steps, speed):
            audio_int16 = _audio_np_to_int16(audio_chunk)
            yield _wav_bytes_from_int16(audio_int16, sr)

    except Exception as e:
        raise gr.Error(f"Error during speech generation: {str(e)}")

with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
    gr.HTML("<h1 style='text-align: center;'>Supertonic-Hub</h1><p style='text-align: center;'>Powered by Supertone/supertonic</p>")

    # We need to initialize to get voices, but we don't want to block startup too long if download is needed.
    # For now, let's try to get voices, if empty, user might need to click generate to trigger download/init first?
    # Or we can just list a default if not found.
    try:
        available_voices = get_supertonic_voices()
    except Exception:
        available_voices = []

    default_voice = available_voices[0] if available_voices else None

    with gr.Row(variant='panel'):
        speed_slider = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label='Speed'
        )
        steps_slider = gr.Slider(
            minimum=1,
            maximum=50,
            value=5,
            step=1,
            label='Steps (Quality vs Speed)'
        )
        voice_dropdown = gr.Dropdown(
            choices=available_voices,
            label='Voice',
            value=default_voice,
            allow_custom_value=True
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        lines=5,
        value="This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    )

    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )

    audio_output = gr.Audio(
        label="Generated Speech",
        streaming=True,
        autoplay=True
    )

    def update_voices():
        voices = get_supertonic_voices()
        return gr.Dropdown(choices=voices, value=voices[0] if voices else None)

    # Add a refresh button for voices in case they weren't loaded initially
    refresh_btn = gr.Button("Refresh Voices (Downloads Model if needed)")
    refresh_btn.click(fn=update_voices, outputs=voice_dropdown)

    generate_inputs = [text_input, speed_slider, voice_dropdown, steps_slider]

    generate_btn.click(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech"
    )

    text_input.submit(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter"
    )

if __name__ == "__main__":
    demo.queue().launch()
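Since both handlers are registered with an `api_name`, the endpoint can also be called programmatically. A minimal sketch using `gradio_client` (the Space ID below is a placeholder, and the exact output form depends on the Gradio version; streamed audio typically comes back as file paths):

```python
# Minimal sketch: call the Space's named endpoint via gradio_client.
# "your-username/supertonic-hub" is a placeholder Space ID.
from gradio_client import Client

client = Client("your-username/supertonic-hub")
result = client.predict(
    "Hello from Supertonic!",   # text
    1.0,                        # speed
    "F1",                       # voice style name
    5,                          # inference steps
    api_name="/generate_speech",
)
print(result)  # typically a filepath (or list of filepaths) to generated audio
```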
assets/.cache/huggingface/.gitignore
ADDED
@@ -0,0 +1 @@
*
assets/.cache/huggingface/download/.gitattributes.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
a6344aac8c09253b3b630fb776ae94478aa0275b
1763671228.0332673
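Each of the sixteen `.metadata` files in this commit follows the same three-line layout that `huggingface_hub` appears to write when `snapshot_download` targets a `local_dir`: the repo commit hash, the file's etag (a git SHA-1 for regular files, a SHA-256 for LFS files), and a Unix download timestamp. A minimal parsing sketch, assuming that layout:

```python
# Sketch: read one huggingface_hub local-dir metadata file
# (assumed three lines: commit hash, file etag, Unix download timestamp).
from pathlib import Path

meta = Path("assets/.cache/huggingface/download/.gitattributes.metadata")
commit_hash, etag, timestamp = meta.read_text().splitlines()[:3]
print(f"commit={commit_hash} etag={etag} downloaded_at={float(timestamp)}")
```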
assets/.cache/huggingface/download/.gitignore.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
2f152fc303670993b3cd5f4089406fb87ef8821e
1763671228.1495774
assets/.cache/huggingface/download/LICENSE.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
1e2cbe6cef94c8cdf3ed1fcebc0f5317ca7ad5a1
1763671228.058201
assets/.cache/huggingface/download/README.md.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
20caed64917ab1c78e5122b3ee3aee22b9f644d6
1763671227.940027
assets/.cache/huggingface/download/config.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
163e41dac1144faedf93a23b333d728863b31ba1
1763671228.0821016
assets/.cache/huggingface/download/onnx/duration_predictor.onnx.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
e30b9d2dc2da1b12af5ff4a9b6e6ac00ca0bc900a2648ec0bf6e134e386c8133
1763671228.404064
assets/.cache/huggingface/download/onnx/text_encoder.onnx.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
6c857c504f84855a78e3506e882ddd4b414c7e6cc5c96a87d4e1fc741917aa4b
1763671228.9067116
assets/.cache/huggingface/download/onnx/tts.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
2129ec38aa31ad06ee171b8cd44e75a5a41b5da4
1763671228.0873225
assets/.cache/huggingface/download/onnx/tts.yml.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
d37684b0c9b7891bfaf2946921f6895ec924cc6d
1763671228.3533428
assets/.cache/huggingface/download/onnx/unicode_indexer.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
2d0dadf8d5d7388ff8614b33172a1c64ee3ca2ae
1763671228.4996374
assets/.cache/huggingface/download/onnx/vector_estimator.onnx.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
3eb36e8cc102e0db3171229a6ae87be3ff244d949997010c0edf0fd6b643483d
1763671231.884892
assets/.cache/huggingface/download/onnx/vocoder.onnx.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
6886622edc640d74d28c22e8282f0cf8d8eb5dc33d0ced67ed652ef6ea68d0c3
1763671231.4259956
assets/.cache/huggingface/download/voice_styles/F1.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
842522854be041eabfef70e97393ffb8cbc77d37
1763671228.7745795
assets/.cache/huggingface/download/voice_styles/F2.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
ade0e075c325a27d7ab1d19a8f5ab3f8b8f54bee
1763671228.9370666
assets/.cache/huggingface/download/voice_styles/M1.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
f6e6b25e6ee6aa603d19bb1fdcf3cd9f35f528c6
1763671228.946357
assets/.cache/huggingface/download/voice_styles/M2.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
a38b1a1327156f27310bcc55223e0914ddf8a615
1763671229.2795043
assets/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
assets/.gitignore
ADDED
@@ -0,0 +1,4 @@
window.json
filter_bank.json
style_extractor.onnx
*.npy
assets/LICENSE
ADDED
@@ -0,0 +1,209 @@
BigScience Open RAIL-M License
dated August 18, 2022

Section I: PREAMBLE

This Open RAIL-M License was created by BigScience, a collaborative open innovation project aimed at
the responsible development and use of large multilingual datasets and Large Language Models
(“LLMs”). While a similar license was originally designed for the BLOOM model, we decided to adapt it
and create this license in order to propose a general open and responsible license applicable to other
machine learning based AI models (e.g. multimodal generative models).
In short, this license strives for both the open and responsible downstream use of the accompanying
model. When it comes to the open character, we took inspiration from open source permissive licenses
regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based
restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be
able to enforce the license in case potential misuses of the Model may occur. Even though downstream
derivative versions of the model could be released under different licensing terms, the latter will always
have to include - at minimum - the same use-based restrictions as the ones in the original license (this
license).
The development and use of artificial intelligence (“AI”), does not come without concerns. The world has
witnessed how AI techniques may, in some instances, become risky for the public in general. These risks
come in many forms, from racial discrimination to the misuse of sensitive information.
BigScience believes in the intersection between open and responsible AI development; thus, this License
aims to strike a balance between both in order to enable responsible open-science in the field of AI.
This License governs the use of the model (and its derivatives) and is informed by the model card
associated with the model.

NOW THEREFORE, You and Licensor agree as follows:

1. Definitions
(a) "License" means the terms and conditions for use, reproduction, and Distribution as defined in
this document.
(b) “Data” means a collection of information and/or content extracted from the dataset used with the
Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under
this License.
(c) “Output” means the results of operating a Model as embodied in informational content resulting
therefrom.
(d) “Model” means any accompanying machine-learning based assemblies (including checkpoints),
consisting of learnt weights, parameters (including optimizer states), corresponding to the model
architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or
in part on the Data, using the Complementary Material.
(e) “Derivatives of the Model” means all modifications to the Model, works based on the Model, or any
other model which is created or initialized by transfer of patterns of the weights, parameters,
activations or output of the Model, to the other model, in order to cause the other model to perform
similarly to the Model, including - but not limited to - distillation methods entailing the use of
intermediate data representations or methods based on the generation of synthetic data by the Model
for training the other model.
(f) “Complementary Material” means the accompanying source code and scripts used to define,
run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if
any. This includes any accompanying documentation, tutorials, examples, etc, if any.
(g) “Distribution” means any transmission, reproduction, publication or other sharing of the Model or
Derivatives of the Model to a third party, including providing the Model as a hosted service made
available by electronic or other remote means - e.g. API-based or web access.
(h) “Licensor” means the copyright owner or entity authorized by the copyright owner that is
granting the License, including the persons or entities that may have rights in the Model and/or
distributing the Model.
(i) "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this
License and/or making use of the Model for whichever purpose and in any field of use, including
usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
(j) “Third Parties” means individuals or legal entities that are not under common control with
Licensor or You.
(k) "Contribution" means any work of authorship, including the original version of the Model and
any modifications or additions to that Model or Derivatives of the Model thereof, that is
intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an
individual or Legal Entity authorized to submit on behalf of the copyright owner. For the
purposes of this definition, “submitted” means any form of electronic, verbal, or written
communication sent to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and issue tracking
systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and
improving the Model, but excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
(l) "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a
Contribution has been received by Licensor and subsequently incorporated within the Model.

Section II: INTELLECTUAL PROPERTY RIGHTS

Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary
Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor
hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright
license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the
Complementary Material, the Model, and Derivatives of the Model.

3. Grant of Patent License. Subject to the terms and conditions of this License and where and as
applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge,
royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer
to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such
license applies only to those patent claims licensable by such Contributor that are necessarily infringed by
their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such
Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim
or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution
incorporated within the Model and/or Complementary Material constitutes direct or contributory patent
infringement, then any patent licenses granted to You under this License for the Model and/or Work shall
terminate as of the date such litigation is asserted or filed.

Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION

4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g.
software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof
in any medium, with or without modifications, provided that You meet the following conditions:

a. Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision
by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the
Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to,
that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply
to the use of Complementary Material.

b. You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this
License;

c. You must cause any modified files to carry prominent notices stating that You changed the files;

d. You must retain all copyright, patent, trademark, and attribution notices excluding those notices
that do not pertain to any part of the Model, Derivatives of the Model.
You may add Your own copyright statement to Your modifications and may provide additional or
different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution
of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use,
reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.

5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions.
Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You
may use the Model subject to this License, including only for lawful purposes and in accordance with the
License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or
reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model
to comply with the terms of this paragraph (paragraph 5).

6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You
generate using the Model. You are accountable for the Output you generate and its subsequent uses. No
use of the output can contravene any provision as stated in the License.

Section IV: OTHER PROVISIONS

7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the
right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model
through electronic means, or modify the Output of the Model based on updates. You shall undertake
reasonable efforts to use the latest version of the Model.

8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks,
trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the
parties; and any rights not expressly granted herein are reserved by the Licensors.

9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides
the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS
IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT,
MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for
determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the
Complementary Material and assume any risks associated with Your exercise of permissions under this
License.

10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or
agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect,
special, incidental, or consequential damages of any character arising as a result of this License or out of
the use or inability to use the Model and the Complementary Material (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other
commercial damages or losses), even if such Contributor has been advised of the possibility of such
damages.

11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the
Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance
of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License.
However, in accepting such obligations, You may act only on Your own behalf and on Your sole
responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and
hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor
by reason of your accepting any such warranty or additional liability.

12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining
provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.

END OF TERMS AND CONDITIONS

Attachment A

Use Restrictions

You agree not to use the Model or Derivatives of the Model:
(a) In any way that violates any applicable national, federal, state, local or international law
or regulation;
(b) For the purpose of exploiting, harming or attempting to exploit or harm minors in any
way;
(c) To generate or disseminate verifiably false information and/or content with the purpose of
harming others;
(d) To generate or disseminate personal identifiable information that can be used to harm an
individual;
(e) To generate or disseminate information and/or content (e.g. images, code, posts, articles),
and place the information and/or content in any context (e.g. bot generating tweets)
without expressly and intelligibly disclaiming that the information and/or content is
machine generated;
(f) To defame, disparage or otherwise harass others;
(g) To impersonate or attempt to impersonate (e.g. deepfakes) others without their consent;
(h) For fully automated decision making that adversely impacts an individual’s legal rights or
otherwise creates or modifies a binding, enforceable obligation;
(i) For any use intended to or which has the effect of discriminating against or harming
individuals or groups based on online or offline social behavior or known or predicted
personal or personality characteristics;
(j) To exploit any of the vulnerabilities of a specific group of persons based on their age,
social, physical or mental characteristics, in order to materially distort the behavior of a
person pertaining to that group in a manner that causes or is likely to cause that person or
another person physical or psychological harm;
(k) For any use intended to or which has the effect of discriminating against individuals or
groups based on legally protected characteristics or categories;
(l) To provide medical advice and medical results interpretation;
(m) To generate or disseminate information for the purpose to be used for administration of
justice, law enforcement, immigration or asylum processes, such as predicting an
individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal
relationships between assertions made in documents, indiscriminate and
arbitrarily-targeted use).
assets/README.md
ADDED
@@ -0,0 +1,161 @@
---
license: openrail
language:
- en
pipeline_tag: text-to-speech
library_name: transformers.js
---

# Supertonic — Lightning Fast, On-Device TTS

**Supertonic** is a lightning-fast, on-device text-to-speech system designed for **extreme performance** with minimal computational overhead. Powered by ONNX Runtime, it runs entirely on your device—no cloud, no API calls, no privacy concerns.

> 🎧 **Try it now**: Experience Supertonic in your browser with our [**Interactive Demo**](https://huggingface.co/spaces/Supertone/supertonic#interactive-demo) or the [**Hugging Face app**](https://huggingface.co/spaces/akhaliq/supertonic), or get started with pre-trained models from the [**Hugging Face Hub**](https://huggingface.co/Supertone/supertonic).

> 🛠 **GitHub Repository**
> The easiest way to use Supertonic is through the official GitHub repository:
> https://github.com/supertone-inc/supertonic
> You’ll find example code for multiple languages there.

### Table of Contents

- [Why Supertonic?](#why-supertonic)
- [Language Support](#language-support)
- [Getting Started](#getting-started)
- [Performance](#performance)
- [Citation](#citation)
- [License](#license)

## Why Supertonic?

- **⚡ Blazingly Fast**: Generates speech up to **167× faster than real-time** on consumer hardware (M4 Pro)—unmatched by any other TTS system
- **🪶 Ultra Lightweight**: Only **66M parameters**, optimized for efficient on-device performance with minimal footprint
- **📱 On-Device Capable**: **Complete privacy** and **zero latency**—all processing happens locally on your device
- **🎨 Natural Text Handling**: Seamlessly processes numbers, dates, currency, abbreviations, and complex expressions without pre-processing
- **⚙️ Highly Configurable**: Adjust inference steps, batch processing, and other parameters to match your specific needs
- **🧩 Flexible Deployment**: Deploy seamlessly across servers, browsers, and edge devices with multiple runtime backends

## Language Support

We provide ready-to-use TTS inference examples across multiple ecosystems:

| Language/Platform | Path | Description |
|-------------------|------|-------------|
| **Python** | `py/` | ONNX Runtime inference |
| **Node.js** | `nodejs/` | Server-side JavaScript |
| **Browser** | `web/` | WebGPU/WASM inference |
| **Java** | `java/` | Cross-platform JVM |
| **C++** | `cpp/` | High-performance C++ |
| **C#** | `csharp/` | .NET ecosystem |
| **Go** | `go/` | Go implementation |
| **Swift** | `swift/` | macOS applications |
| **iOS** | `ios/` | Native iOS apps |
| **Rust** | `rust/` | Memory-safe systems |

> For detailed usage instructions, please refer to the README.md in each language directory.

## Getting Started

First, clone the repository:

```bash
git clone https://github.com/supertone-inc/supertonic.git
cd supertonic
```

### Prerequisites

Before running the examples, download the ONNX models and preset voices, and place them in the `assets` directory:

```bash
git clone https://huggingface.co/Supertone/supertonic assets
```

> **Note:** The Hugging Face repository uses Git LFS. Please ensure Git LFS is installed and initialized before cloning or pulling large model files.
> - macOS: `brew install git-lfs && git lfs install`
> - Generic: see `https://git-lfs.com` for installers
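If Git LFS is not an option, the same assets can be fetched with `huggingface_hub` instead; this mirrors the `snapshot_download` call in the Space's `app.py` above:

```python
# Alternative to `git clone`: fetch the model repo with huggingface_hub.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="Supertone/supertonic", local_dir="assets")
```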
### Technical Details

- **Runtime**: ONNX Runtime for cross-platform inference (CPU-optimized; GPU mode is not tested)
- **Browser Support**: onnxruntime-web for client-side inference
- **Batch Processing**: Supports batch inference for improved throughput
- **Audio Output**: Outputs 16-bit WAV files

## Performance

We evaluated Supertonic's performance (with 2 inference steps) using two key metrics across input texts of varying lengths: Short (59 chars), Mid (152 chars), and Long (266 chars).

**Metrics:**
- **Characters per Second**: Measures throughput by dividing the number of input characters by the time required to generate audio. Higher is better.
- **Real-time Factor (RTF)**: Measures the time taken to synthesize audio relative to its duration. Lower is better (e.g., an RTF of 0.1 means it takes 0.1 seconds to generate one second of audio). A worked example follows below.
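As a concrete illustration of both metrics (the timings here are made up to roughly match the Mid/CPU row below):

```python
# Worked example: both metrics from one timed synthesis run (illustrative numbers).
n_chars = 152          # characters in the input text
gen_seconds = 0.145    # wall-clock time spent synthesizing
audio_seconds = 11.2   # duration of the generated audio

chars_per_second = n_chars / gen_seconds  # ~1048 (higher is better)
rtf = gen_seconds / audio_seconds         # ~0.013 (lower is better)
print(f"chars/s = {chars_per_second:.0f}, RTF = {rtf:.3f}")
```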
### Characters per Second
| System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
|--------|-----------------|----------------|-----------------|
| **Supertonic** (M4 Pro - CPU) | 912 | 1048 | 1263 |
| **Supertonic** (M4 Pro - WebGPU) | 996 | 1801 | 2509 |
| **Supertonic** (RTX4090) | 2615 | 6548 | 12164 |
| `API` [ElevenLabs Flash v2.5](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) | 144 | 209 | 287 |
| `API` [OpenAI TTS-1](https://platform.openai.com/docs/guides/text-to-speech) | 37 | 55 | 82 |
| `API` [Gemini 2.5 Flash TTS](https://ai.google.dev/gemini-api/docs/speech-generation) | 12 | 18 | 24 |
| `API` [Supertone Sona speech 1](https://docs.supertoneapi.com/en/api-reference/endpoints/text-to-speech) | 38 | 64 | 92 |
| `Open` [Kokoro](https://github.com/hexgrad/kokoro/) | 104 | 107 | 117 |
| `Open` [NeuTTS Air](https://github.com/neuphonic/neutts-air) | 37 | 42 | 47 |

> **Notes:**
> `API` = Cloud-based API services (measured from Seoul)
> `Open` = Open-source models
> Supertonic (M4 Pro - CPU) and (M4 Pro - WebGPU): Tested with ONNX
> Supertonic (RTX4090): Tested with PyTorch model
> Kokoro: Tested on M4 Pro CPU with ONNX
> NeuTTS Air: Tested on M4 Pro CPU with Q8-GGUF

### Real-time Factor

| System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
|--------|-----------------|----------------|-----------------|
| **Supertonic** (M4 Pro - CPU) | 0.015 | 0.013 | 0.012 |
| **Supertonic** (M4 Pro - WebGPU) | 0.014 | 0.007 | 0.006 |
| **Supertonic** (RTX4090) | 0.005 | 0.002 | 0.001 |
| `API` [ElevenLabs Flash v2.5](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) | 0.133 | 0.077 | 0.057 |
| `API` [OpenAI TTS-1](https://platform.openai.com/docs/guides/text-to-speech) | 0.471 | 0.302 | 0.201 |
| `API` [Gemini 2.5 Flash TTS](https://ai.google.dev/gemini-api/docs/speech-generation) | 1.060 | 0.673 | 0.541 |
| `API` [Supertone Sona speech 1](https://docs.supertoneapi.com/en/api-reference/endpoints/text-to-speech) | 0.372 | 0.206 | 0.163 |
| `Open` [Kokoro](https://github.com/hexgrad/kokoro/) | 0.144 | 0.124 | 0.126 |
| `Open` [NeuTTS Air](https://github.com/neuphonic/neutts-air) | 0.390 | 0.338 | 0.343 |

<details>
<summary><b>Additional Performance Data (5-step inference)</b></summary>

<br>

**Characters per Second (5-step)**

| System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
|--------|-----------------|----------------|-----------------|
| **Supertonic** (M4 Pro - CPU) | 596 | 691 | 850 |
| **Supertonic** (M4 Pro - WebGPU) | 570 | 1118 | 1546 |
| **Supertonic** (RTX4090) | 1286 | 3757 | 6242 |

**Real-time Factor (5-step)**

| System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
|--------|-----------------|----------------|-----------------|
| **Supertonic** (M4 Pro - CPU) | 0.023 | 0.019 | 0.018 |
| **Supertonic** (M4 Pro - WebGPU) | 0.024 | 0.012 | 0.010 |
| **Supertonic** (RTX4090) | 0.011 | 0.004 | 0.002 |

</details>

## License

This project’s sample code is released under the MIT License; see the [LICENSE](https://github.com/supertone-inc/supertonic?tab=MIT-1-ov-file) for details.

The accompanying model is released under the OpenRAIL-M License; see the [LICENSE](https://huggingface.co/Supertone/supertonic/blob/main/LICENSE) file for details.

This model was trained using PyTorch, which is licensed under the BSD 3-Clause License but is not redistributed with this project; see the [LICENSE](https://docs.pytorch.org/FBGEMM/general/License.html) for details.

Copyright (c) 2025 Supertone Inc.
assets/config.json
ADDED
@@ -0,0 +1,5 @@
{
  "model_name": "Supertonic",
  "model_type": "onnx",
  "description": "This is a stub config for Hugging Face download counting. The actual model is located at onnx/"
}
assets/onnx/duration_predictor.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e30b9d2dc2da1b12af5ff4a9b6e6ac00ca0bc900a2648ec0bf6e134e386c8133
size 1590703
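The `.onnx` entries in this commit are Git LFS pointer files: `oid` is the SHA-256 of the actual weights and `size` their byte count, so a downloaded file can be checked against its pointer. A minimal verification sketch (path assumed relative to the repo root):

```python
# Sketch: verify a downloaded ONNX file against its Git LFS pointer.
import hashlib
from pathlib import Path

expected = "e30b9d2dc2da1b12af5ff4a9b6e6ac00ca0bc900a2648ec0bf6e134e386c8133"
digest = hashlib.sha256(
    Path("assets/onnx/duration_predictor.onnx").read_bytes()
).hexdigest()
assert digest == expected, "duration_predictor.onnx does not match its LFS pointer"
```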
assets/onnx/text_encoder.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6c857c504f84855a78e3506e882ddd4b414c7e6cc5c96a87d4e1fc741917aa4b
size 27978387
assets/onnx/tts.json
ADDED
@@ -0,0 +1,316 @@
{
  "tts_version": "v1.5.0",
  "split": "opensource-en",
  "ttl_ckpt_path": "unknown.pt",
  "dp_ckpt_path": "unknown.pt",
  "ae_ckpt_path": "unknown.pt",
  "ttl_train": "unknown",
  "dp_train": "unknown",
  "ae_train": "unknown",
  "ttl": {
    "latent_dim": 24,
    "chunk_compress_factor": 6,
    "batch_expander": {
      "n_batch_expand": 6
    },
    "normalizer": {
      "scale": 0.25
    },
    "text_encoder": {
      "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
      "text_embedder": {
        "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
        "char_emb_dim": 256
      },
      "convnext": {
        "idim": 256,
        "ksz": 5,
        "intermediate_dim": 1024,
        "num_layers": 6,
        "dilation_lst": [1, 1, 1, 1, 1, 1]
      },
      "attn_encoder": {
        "hidden_channels": 256,
        "filter_channels": 1024,
        "n_heads": 4,
        "n_layers": 4,
        "p_dropout": 0.0
      },
      "proj_out": {
        "idim": 256,
        "odim": 256
      }
    },
    "flow_matching": {
      "sig_min": 0
    },
    "style_encoder": {
      "proj_in": {
        "ldim": 24,
        "chunk_compress_factor": 6,
        "odim": 256
      },
      "convnext": {
        "idim": 256,
        "ksz": 5,
        "intermediate_dim": 1024,
        "num_layers": 6,
        "dilation_lst": [1, 1, 1, 1, 1, 1]
      },
      "style_token_layer": {
        "input_dim": 256,
        "n_style": 50,
        "style_key_dim": 256,
        "style_value_dim": 256,
        "prototype_dim": 256,
        "n_units": 256,
        "n_heads": 2
      }
    },
    "speech_prompted_text_encoder": {
      "text_dim": 256,
      "style_dim": 256,
      "n_units": 256,
      "n_heads": 2
    },
    "uncond_masker": {
      "prob_both_uncond": 0.04,
      "prob_text_uncond": 0.01,
      "std": 0.1,
      "text_dim": 256,
      "n_style": 50,
      "style_key_dim": 256,
      "style_value_dim": 256
    },
    "vector_field": {
      "proj_in": {
        "ldim": 24,
        "chunk_compress_factor": 6,
        "odim": 512
      },
      "time_encoder": {
        "time_dim": 64,
        "hdim": 256
      },
      "main_blocks": {
        "n_blocks": 4,
        "time_cond_layer": {
          "idim": 512,
          "time_dim": 64
        },
        "style_cond_layer": {
          "idim": 512,
          "style_dim": 256
        },
        "text_cond_layer": {
          "idim": 512,
          "text_dim": 256,
          "n_heads": 4,
          "use_residual": true,
          "rotary_base": 10000,
          "rotary_scale": 10
        },
        "convnext_0": {
          "idim": 512,
          "ksz": 5,
          "intermediate_dim": 1024,
          "num_layers": 4,
          "dilation_lst": [1, 2, 4, 8]
        },
        "convnext_1": {
          "idim": 512,
          "ksz": 5,
          "intermediate_dim": 1024,
          "num_layers": 1,
          "dilation_lst": [1]
        },
        "convnext_2": {
          "idim": 512,
          "ksz": 5,
          "intermediate_dim": 1024,
          "num_layers": 1,
          "dilation_lst": [1]
        }
      },
      "last_convnext": {
        "idim": 512,
        "ksz": 5,
        "intermediate_dim": 1024,
        "num_layers": 4,
        "dilation_lst": [1, 1, 1, 1]
      },
      "proj_out": {
        "idim": 512,
        "chunk_compress_factor": 6,
        "ldim": 24
      }
    }
  },
  "ae": {
    "sample_rate": 44100,
    "n_delay": 0,
    "base_chunk_size": 512,
    "chunk_compress_factor": 1,
    "ldim": 24,
    "encoder": {
      "spec_processor": {
        "n_fft": 2048,
        "win_length": 2048,
        "hop_length": 512,
        "n_mels": 228,
        "sample_rate": 44100,
        "eps": 1e-05,
        "norm_mean": 0.0,
        "norm_std": 1.0
      },
      "ksz_init": 7,
      "ksz": 7,
      "num_layers": 10,
      "dilation_lst": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      "intermediate_dim": 2048,
      "idim": 1253,
      "hdim": 512,
      "odim": 24
    },
    "decoder": {
      "ksz_init": 7,
      "ksz": 7,
      "num_layers": 10,
      "dilation_lst": [1, 2, 4, 1, 2, 4, 1, 1, 1, 1],
      "intermediate_dim": 2048,
      "idim": 24,
      "hdim": 512,
      "head": {
        "idim": 512,
        "hdim": 2048,
        "odim": 512,
        "ksz": 3
      }
    }
  },
  "dp": {
    "latent_dim": 24,
    "chunk_compress_factor": 6,
    "normalizer": {
      "scale": 1.0
    },
    "sentence_encoder": {
      "char_emb_dim": 64,
      "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
      "text_embedder": {
        "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
        "char_emb_dim": 64
      },
      "convnext": {
        "idim": 64,
        "ksz": 5,
        "intermediate_dim": 256,
        "num_layers": 6,
        "dilation_lst": [1, 1, 1, 1, 1, 1]
      },
      "attn_encoder": {
        "hidden_channels": 64,
        "filter_channels": 256,
        "n_heads": 2,
        "n_layers": 2,
        "p_dropout": 0.0
      },
      "proj_out": {
        "idim": 64,
        "odim": 64
      }
    },
    "style_encoder": {
      "proj_in": {
        "ldim": 24,
        "chunk_compress_factor": 6,
        "odim": 64
      },
      "convnext": {
        "idim": 64,
        "ksz": 5,
        "intermediate_dim": 256,
        "num_layers": 4,
        "dilation_lst": [1, 1, 1, 1]
      },
      "style_token_layer": {
        "input_dim": 64,
        "n_style": 8,
        "style_key_dim": 0,
        "style_value_dim": 16,
        "prototype_dim": 64,
        "n_units": 64,
        "n_heads": 2
      }
    },
    "predictor": {
      "sentence_dim": 64,
      "n_style": 8,
      "style_dim": 16,
      "hdim": 128,
      "n_layer": 2
    }
  }
}
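A few of these fields matter downstream; for example, `ae.sample_rate` (44100) is presumably what `app.py` above reads back as `tts.sample_rate` when writing WAV headers, and `ttl.latent_dim` (24) is the latent width shared across the exported models. A quick inspection sketch:

```python
# Sketch: inspect a few architecture fields from the exported config.
import json

with open("assets/onnx/tts.json") as f:
    cfg = json.load(f)

print(cfg["tts_version"])        # v1.5.0
print(cfg["ae"]["sample_rate"])  # 44100
print(cfg["ttl"]["latent_dim"])  # 24
```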
assets/onnx/tts.yml
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tts_version: "v1.5.0"

split: "opensource-en"

ttl_ckpt_path: "unknown.pt"

dp_ckpt_path: "unknown.pt"

ae_ckpt_path: "unknown.pt"

ttl_train: "unknown"

dp_train: "unknown"

ae_train: "unknown"

ttl:
  latent_dim: 24
  chunk_compress_factor: 6
  batch_expander:
    n_batch_expand: 6
  normalizer:
    scale: 0.25
  text_encoder:
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 256
      filter_channels: 1024
      n_heads: 4
      n_layers: 4
      p_dropout: 0.0
    proj_out:
      idim: 256
      odim: 256
  flow_matching:
    sig_min: 0
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    style_token_layer:
      input_dim: 256
      n_style: 50
      style_key_dim: 256
      style_value_dim: 256
      prototype_dim: 256
      n_units: 256
      n_heads: 2
  speech_prompted_text_encoder:
    text_dim: 256
    style_dim: 256
    n_units: 256
    n_heads: 2
  uncond_masker:
    prob_both_uncond: 0.04
    prob_text_uncond: 0.01
    std: 0.1
    text_dim: 256
    n_style: 50
    style_key_dim: 256
    style_value_dim: 256
  vector_field:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 512
    time_encoder:
      time_dim: 64
      hdim: 256
    main_blocks:
      n_blocks: 4
      time_cond_layer:
        idim: 512
        time_dim: 64
      style_cond_layer:
        idim: 512
        style_dim: 256
      text_cond_layer:
        idim: 512
        text_dim: 256
        n_heads: 4
        use_residual: True
        rotary_base: 10000
        rotary_scale: 10
      convnext_0:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 4
        dilation_lst: [1, 2, 4, 8]
      convnext_1:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
      convnext_2:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
    last_convnext:
      idim: 512
      ksz: 5
      intermediate_dim: 1024
      num_layers: 4
      dilation_lst: [1, 1, 1, 1]
    proj_out:
      idim: 512
      chunk_compress_factor: 6
      ldim: 24

ae:
  sample_rate: 44100
  n_delay: 0
  base_chunk_size: 512
  chunk_compress_factor: 1
  ldim: 24
  encoder:
    spec_processor:
      n_fft: 2048
      win_length: 2048
      hop_length: 512
      n_mels: 228
      sample_rate: 44100
      eps: 1e-05
      norm_mean: 0.0
      norm_std: 1.0
    ksz_init: 7
    ksz: 7
    num_layers: 10
    dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    intermediate_dim: 2048
    idim: 1253
    hdim: 512
    odim: 24
  decoder:
    ksz_init: 7
    ksz: 7
    num_layers: 10
    dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
    intermediate_dim: 2048
    idim: 24
    hdim: 512
    head:
      idim: 512
      hdim: 2048
      odim: 512
      ksz: 3

dp:
  latent_dim: 24
  chunk_compress_factor: 6
  normalizer:
    scale: 1.0
  sentence_encoder:
    char_emb_dim: 64
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 64
      filter_channels: 256
      n_heads: 2
      n_layers: 2
      p_dropout: 0.0
    proj_out:
      idim: 64
      odim: 64
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 4
      dilation_lst: [1, 1, 1, 1]
    style_token_layer:
      input_dim: 64
      n_style: 8
      style_key_dim: 0
      style_value_dim: 16
      prototype_dim: 64
      n_units: 64
      n_heads: 2
  predictor:
    sentence_dim: 64
    n_style: 8
    style_dim: 16
    hdim: 128
    n_layer: 2

unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"
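tts.yml appears to be the training/export-side counterpart of tts.json: the ttl/ae/dp blocks mirror the JSON, with extra training-only fields (batch_expander, uncond_masker, the "unknown.pt" checkpoint paths) and absolute paths from the export machine. The Space reads tts.json, not this file, but PyYAML is pinned in requirements.txt, so a hedged loading sketch:

import yaml  # PyYAML

with open("assets/onnx/tts.yml", "r") as f:
    cfg = yaml.safe_load(f)

assert cfg["tts_version"] == "v1.5.0"
assert cfg["ae"]["sample_rate"] == 44100
# The trailing *_path entries point at /data/public/... on the export host
# and will not resolve inside the Space.
print(cfg["window_path"])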
assets/onnx/unicode_indexer.json
ADDED
The diff for this file is too large to render.
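The indexer itself is not rendered here, but UnicodeProcessor in helper.py (below) indexes it with raw Unicode code points (indexer[ord(char)]), so it is presumably a dense list mapping code point to character-embedding id; the raw file is not shown, so treat this probe as illustrative:

import json

with open("assets/onnx/unicode_indexer.json", "r") as f:
    indexer = json.load(f)

# Mirrors what helper.py does per character:
print([indexer[ord(ch)] for ch in "Hi!"])  # exact ids depend on the file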
assets/onnx/vector_estimator.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3eb36e8cc102e0db3171229a6ae87be3ff244d949997010c0edf0fd6b643483d
size 132517477
assets/onnx/vocoder.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6886622edc640d74d28c22e8282f0cf8d8eb5dc33d0ced67ed652ef6ea68d0c3
size 101424195
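Note that vector_estimator.onnx and vocoder.onnx are committed as Git LFS pointer files (the version/oid/size triples above), not as the weights themselves; huggingface_hub resolves them to the real binaries on download. A small, hedged sketch of reading such a pointer, should you encounter one on disk:

def parse_lfs_pointer(path: str) -> dict[str, str]:
    """Split each 'key value' line of a Git LFS pointer file."""
    fields = {}
    with open(path, "r") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# Only meaningful while the file is still a pointer, not the fetched weights:
ptr = parse_lfs_pointer("assets/onnx/vocoder.onnx")
print(ptr["oid"], int(ptr["size"]))  # sha256:6886..., 101424195 bytes (~97 MB)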
assets/voice_styles/F1.json
ADDED
The diff for this file is too large to render.
assets/voice_styles/F2.json
ADDED
The diff for this file is too large to render.
assets/voice_styles/M1.json
ADDED
The diff for this file is too large to render.
assets/voice_styles/M2.json
ADDED
The diff for this file is too large to render.
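Each voice style file carries two speaker-embedding tensors, one for the text-to-latent model and one for the duration predictor. The schema below is inferred from how load_voice_style in helper.py consumes these files (dims has a leading batch axis of 1; data flattens to dims[1] * dims[2] floats), so treat it as a hedged reader rather than a spec:

import json
import numpy as np

# Inferred schema: {"style_ttl": {"dims": [...], "data": [...]},
#                   "style_dp":  {"dims": [...], "data": [...]}}
with open("assets/voice_styles/F1.json", "r") as f:
    s = json.load(f)

ttl = np.asarray(s["style_ttl"]["data"], dtype=np.float32).reshape(s["style_ttl"]["dims"])
dp = np.asarray(s["style_dp"]["data"], dtype=np.float32).reshape(s["style_dp"]["dims"])
print(ttl.shape, dp.shape)  # leading dim expected to be 1 (single speaker)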
helper.py
ADDED
@@ -0,0 +1,349 @@
import json
import os
import time
from contextlib import contextmanager
from typing import Optional
from unicodedata import normalize
import re

import numpy as np
import onnxruntime as ort


class UnicodeProcessor:
    def __init__(self, unicode_indexer_path: str):
        with open(unicode_indexer_path, "r") as f:
            self.indexer = json.load(f)

    def _preprocess_text(self, text: str) -> str:
        # TODO: add more preprocessing
        text = normalize("NFKD", text)
        return text

    def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
        text_mask = length_to_mask(text_ids_lengths)
        return text_mask

    def _text_to_unicode_values(self, text: str) -> np.ndarray:
        unicode_values = np.array(
            [ord(char) for char in text], dtype=np.uint16
        )  # 2 bytes
        return unicode_values

    def __call__(self, text_list: list[str]) -> tuple[np.ndarray, np.ndarray]:
        text_list = [self._preprocess_text(t) for t in text_list]
        text_ids_lengths = np.array([len(text) for text in text_list], dtype=np.int64)
        text_ids = np.zeros((len(text_list), text_ids_lengths.max()), dtype=np.int64)
        for i, text in enumerate(text_list):
            unicode_vals = self._text_to_unicode_values(text)
            text_ids[i, : len(unicode_vals)] = np.array(
                [self.indexer[val] for val in unicode_vals], dtype=np.int64
            )
        text_mask = self._get_text_mask(text_ids_lengths)
        return text_ids, text_mask


class Style:
    def __init__(self, style_ttl_onnx: np.ndarray, style_dp_onnx: np.ndarray):
        self.ttl = style_ttl_onnx
        self.dp = style_dp_onnx


class TextToSpeech:
    def __init__(
        self,
        cfgs: dict,
        text_processor: UnicodeProcessor,
        dp_ort: ort.InferenceSession,
        text_enc_ort: ort.InferenceSession,
        vector_est_ort: ort.InferenceSession,
        vocoder_ort: ort.InferenceSession,
    ):
        self.cfgs = cfgs
        self.text_processor = text_processor
        self.dp_ort = dp_ort
        self.text_enc_ort = text_enc_ort
        self.vector_est_ort = vector_est_ort
        self.vocoder_ort = vocoder_ort
        self.sample_rate = cfgs["ae"]["sample_rate"]
        self.base_chunk_size = cfgs["ae"]["base_chunk_size"]
        self.chunk_compress_factor = cfgs["ttl"]["chunk_compress_factor"]
        self.ldim = cfgs["ttl"]["latent_dim"]

    def sample_noisy_latent(
        self, duration: np.ndarray
    ) -> tuple[np.ndarray, np.ndarray]:
        bsz = len(duration)
        wav_len_max = duration.max() * self.sample_rate
        wav_lengths = (duration * self.sample_rate).astype(np.int64)
        chunk_size = self.base_chunk_size * self.chunk_compress_factor
        latent_len = ((wav_len_max + chunk_size - 1) / chunk_size).astype(np.int32)
        latent_dim = self.ldim * self.chunk_compress_factor
        noisy_latent = np.random.randn(bsz, latent_dim, latent_len).astype(np.float32)
        latent_mask = get_latent_mask(
            wav_lengths, self.base_chunk_size, self.chunk_compress_factor
        )

        noisy_latent = noisy_latent * latent_mask
        return noisy_latent, latent_mask

    def _infer(
        self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
    ) -> tuple[np.ndarray, np.ndarray]:
        assert (
            len(text_list) == style.ttl.shape[0]
        ), "Number of texts must match number of style vectors"
        bsz = len(text_list)
        text_ids, text_mask = self.text_processor(text_list)
        dur_onnx, *_ = self.dp_ort.run(
            None, {"text_ids": text_ids, "style_dp": style.dp, "text_mask": text_mask}
        )
        dur_onnx = dur_onnx / speed
        text_emb_onnx, *_ = self.text_enc_ort.run(
            None,
            {"text_ids": text_ids, "style_ttl": style.ttl, "text_mask": text_mask},
        )  # dur_onnx: [bsz]
        xt, latent_mask = self.sample_noisy_latent(dur_onnx)
        total_step_np = np.array([total_step] * bsz, dtype=np.float32)
        for step in range(total_step):
            current_step = np.array([step] * bsz, dtype=np.float32)
            xt, *_ = self.vector_est_ort.run(
                None,
                {
                    "noisy_latent": xt,
                    "text_emb": text_emb_onnx,
                    "style_ttl": style.ttl,
                    "text_mask": text_mask,
                    "latent_mask": latent_mask,
                    "current_step": current_step,
                    "total_step": total_step_np,
                },
            )
        wav, *_ = self.vocoder_ort.run(None, {"latent": xt})
        return wav, dur_onnx

    def __call__(
        self,
        text: str,
        style: Style,
        total_step: int,
        speed: float = 1.05,
        silence_duration: float = 0.3,
    ) -> tuple[np.ndarray, np.ndarray]:
        assert (
            style.ttl.shape[0] == 1
        ), "Single speaker text to speech only supports single style"
        text_list = chunk_text(text)
        wav_cat = None
        dur_cat = None
        for text in text_list:
            wav, dur_onnx = self._infer([text], style, total_step, speed)
            if wav_cat is None:
                wav_cat = wav
                dur_cat = dur_onnx
            else:
                silence = np.zeros(
                    (1, int(silence_duration * self.sample_rate)), dtype=np.float32
                )
                wav_cat = np.concatenate([wav_cat, silence, wav], axis=1)
                dur_cat += dur_onnx + silence_duration
        return wav_cat, dur_cat

    def stream(
        self,
        text: str,
        style: Style,
        total_step: int,
        speed: float = 1.05,
        silence_duration: float = 0.3,
    ):
        assert (
            style.ttl.shape[0] == 1
        ), "Single speaker text to speech only supports single style"
        text_list = chunk_text(text)

        for i, text in enumerate(text_list):
            wav, _ = self._infer([text], style, total_step, speed)
            yield wav.flatten()

            if i < len(text_list) - 1:
                silence = np.zeros(
                    (int(silence_duration * self.sample_rate),), dtype=np.float32
                )
                yield silence

    def batch(
        self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
    ) -> tuple[np.ndarray, np.ndarray]:
        return self._infer(text_list, style, total_step, speed)


def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
    """
    Convert lengths to binary mask.

    Args:
        lengths: (B,)
        max_len: int

    Returns:
        mask: (B, 1, max_len)
    """
    max_len = max_len or lengths.max()
    ids = np.arange(0, max_len)
    mask = (ids < np.expand_dims(lengths, axis=1)).astype(np.float32)
    return mask.reshape(-1, 1, max_len)


def get_latent_mask(
    wav_lengths: np.ndarray, base_chunk_size: int, chunk_compress_factor: int
) -> np.ndarray:
    latent_size = base_chunk_size * chunk_compress_factor
    latent_lengths = (wav_lengths + latent_size - 1) // latent_size
    latent_mask = length_to_mask(latent_lengths)
    return latent_mask


def load_onnx(
    onnx_path: str, opts: ort.SessionOptions, providers: list[str]
) -> ort.InferenceSession:
    return ort.InferenceSession(onnx_path, sess_options=opts, providers=providers)


def load_onnx_all(
    onnx_dir: str, opts: ort.SessionOptions, providers: list[str]
) -> tuple[
    ort.InferenceSession,
    ort.InferenceSession,
    ort.InferenceSession,
    ort.InferenceSession,
]:
    dp_onnx_path = os.path.join(onnx_dir, "duration_predictor.onnx")
    text_enc_onnx_path = os.path.join(onnx_dir, "text_encoder.onnx")
    vector_est_onnx_path = os.path.join(onnx_dir, "vector_estimator.onnx")
    vocoder_onnx_path = os.path.join(onnx_dir, "vocoder.onnx")

    dp_ort = load_onnx(dp_onnx_path, opts, providers)
    text_enc_ort = load_onnx(text_enc_onnx_path, opts, providers)
    vector_est_ort = load_onnx(vector_est_onnx_path, opts, providers)
    vocoder_ort = load_onnx(vocoder_onnx_path, opts, providers)
    return dp_ort, text_enc_ort, vector_est_ort, vocoder_ort


def load_cfgs(onnx_dir: str) -> dict:
    cfg_path = os.path.join(onnx_dir, "tts.json")
    with open(cfg_path, "r") as f:
        cfgs = json.load(f)
    return cfgs


def load_text_processor(onnx_dir: str) -> UnicodeProcessor:
    unicode_indexer_path = os.path.join(onnx_dir, "unicode_indexer.json")
    text_processor = UnicodeProcessor(unicode_indexer_path)
    return text_processor


def load_text_to_speech(onnx_dir: str, use_gpu: bool = False) -> TextToSpeech:
    opts = ort.SessionOptions()
    if use_gpu:
        raise NotImplementedError("GPU mode is not fully tested")
    else:
        providers = ["CPUExecutionProvider"]
        print("Using CPU for inference")
    cfgs = load_cfgs(onnx_dir)
    dp_ort, text_enc_ort, vector_est_ort, vocoder_ort = load_onnx_all(
        onnx_dir, opts, providers
    )
    text_processor = load_text_processor(onnx_dir)
    return TextToSpeech(
        cfgs, text_processor, dp_ort, text_enc_ort, vector_est_ort, vocoder_ort
    )


def load_voice_style(voice_style_paths: list[str], verbose: bool = False) -> Style:
    bsz = len(voice_style_paths)

    # Read first file to get dimensions
    with open(voice_style_paths[0], "r") as f:
        first_style = json.load(f)
    ttl_dims = first_style["style_ttl"]["dims"]
    dp_dims = first_style["style_dp"]["dims"]

    # Pre-allocate arrays with full batch size
    ttl_style = np.zeros([bsz, ttl_dims[1], ttl_dims[2]], dtype=np.float32)
    dp_style = np.zeros([bsz, dp_dims[1], dp_dims[2]], dtype=np.float32)

    # Fill in the data
    for i, voice_style_path in enumerate(voice_style_paths):
        with open(voice_style_path, "r") as f:
            voice_style = json.load(f)

        ttl_data = np.array(
            voice_style["style_ttl"]["data"], dtype=np.float32
        ).flatten()
        ttl_style[i] = ttl_data.reshape(ttl_dims[1], ttl_dims[2])

        dp_data = np.array(
            voice_style["style_dp"]["data"], dtype=np.float32
        ).flatten()
        dp_style[i] = dp_data.reshape(dp_dims[1], dp_dims[2])

    if verbose:
        print(f"Loaded {bsz} voice styles")
    return Style(ttl_style, dp_style)


@contextmanager
def timer(name: str):
    start = time.time()
    print(f"{name}...")
    yield
    print(f" -> {name} completed in {time.time() - start:.2f} sec")


def sanitize_filename(text: str, max_len: int) -> str:
    """Sanitize filename by replacing non-alphanumeric characters with underscores"""
    prefix = text[:max_len]
    return re.sub(r"[^a-zA-Z0-9]", "_", prefix)


def chunk_text(text: str, max_len: int = 300) -> list[str]:
    """
    Split text into chunks by paragraphs and sentences.

    Args:
        text: Input text to chunk
        max_len: Maximum length of each chunk (default: 300)

    Returns:
        List of text chunks
    """
    # Split by paragraph (two or more newlines)
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n+", text.strip()) if p.strip()]

    chunks = []

    for paragraph in paragraphs:
        paragraph = paragraph.strip()
        if not paragraph:
            continue

        # Split by sentence boundaries (period, question mark, exclamation mark followed by space)
        # But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
        pattern = r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+"
        sentences = re.split(pattern, paragraph)

        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk) + len(sentence) + 1 <= max_len:
                current_chunk += (" " if current_chunk else "") + sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence

        if current_chunk:
            chunks.append(current_chunk.strip())

    return chunks
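Taken together, the public surface of helper.py is load_text_to_speech, load_voice_style, and the TextToSpeech __call__/stream/batch methods. A minimal end-to-end sketch, assuming the assets are already in place; total_step (the number of flow-matching refinement iterations) is an illustrative choice, not a value fixed by this repo:

import soundfile as sf
from helper import load_text_to_speech, load_voice_style, timer

tts = load_text_to_speech("assets/onnx", use_gpu=False)
style = load_voice_style(["assets/voice_styles/F1.json"], verbose=True)

with timer("Synthesis"):
    # total_step=8 is illustrative; more steps means more denoising iterations.
    wav, dur = tts("Hello from Supertonic!", style, total_step=8)

sf.write("output.wav", wav.flatten(), tts.sample_rate)  # 44.1 kHz mono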
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio
numpy>=1.26.0
onnxruntime==1.23.1
soundfile>=0.12.1
librosa>=0.10.0
PyYAML>=6.0
huggingface_hub