Nymbo committed
Commit 19f48a2 · verified · 1 Parent(s): ed01d74

Upload 36 files

Files changed (36)
  1. app.py +161 -0
  2. assets/.cache/huggingface/.gitignore +1 -0
  3. assets/.cache/huggingface/download/.gitattributes.metadata +3 -0
  4. assets/.cache/huggingface/download/.gitignore.metadata +3 -0
  5. assets/.cache/huggingface/download/LICENSE.metadata +3 -0
  6. assets/.cache/huggingface/download/README.md.metadata +3 -0
  7. assets/.cache/huggingface/download/config.json.metadata +3 -0
  8. assets/.cache/huggingface/download/onnx/duration_predictor.onnx.metadata +3 -0
  9. assets/.cache/huggingface/download/onnx/text_encoder.onnx.metadata +3 -0
  10. assets/.cache/huggingface/download/onnx/tts.json.metadata +3 -0
  11. assets/.cache/huggingface/download/onnx/tts.yml.metadata +3 -0
  12. assets/.cache/huggingface/download/onnx/unicode_indexer.json.metadata +3 -0
  13. assets/.cache/huggingface/download/onnx/vector_estimator.onnx.metadata +3 -0
  14. assets/.cache/huggingface/download/onnx/vocoder.onnx.metadata +3 -0
  15. assets/.cache/huggingface/download/voice_styles/F1.json.metadata +3 -0
  16. assets/.cache/huggingface/download/voice_styles/F2.json.metadata +3 -0
  17. assets/.cache/huggingface/download/voice_styles/M1.json.metadata +3 -0
  18. assets/.cache/huggingface/download/voice_styles/M2.json.metadata +3 -0
  19. assets/.gitattributes +35 -0
  20. assets/.gitignore +4 -0
  21. assets/LICENSE +209 -0
  22. assets/README.md +161 -0
  23. assets/config.json +5 -0
  24. assets/onnx/duration_predictor.onnx +3 -0
  25. assets/onnx/text_encoder.onnx +3 -0
  26. assets/onnx/tts.json +316 -0
  27. assets/onnx/tts.yml +223 -0
  28. assets/onnx/unicode_indexer.json +0 -0
  29. assets/onnx/vector_estimator.onnx +3 -0
  30. assets/onnx/vocoder.onnx +3 -0
  31. assets/voice_styles/F1.json +0 -0
  32. assets/voice_styles/F2.json +0 -0
  33. assets/voice_styles/M1.json +0 -0
  34. assets/voice_styles/M2.json +0 -0
  35. helper.py +349 -0
  36. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,161 @@
+ import gradio as gr
+ import os
+ import io
+ import wave
+ import numpy as np
+ import soundfile as sf
+ from huggingface_hub import snapshot_download
+ from helper import load_text_to_speech, load_voice_style
+
+ _SUPERTONIC_STATE = {"initialized": False, "tts": None, "assets_dir": None}
+
+ def _init_supertonic() -> None:
+     if _SUPERTONIC_STATE["initialized"]:
+         return
+
+     print("Initializing Supertonic...")
+     # Download models if not present
+     assets_dir = os.path.join(os.path.dirname(__file__), "assets")
+     if not os.path.exists(assets_dir):
+         print(f"Downloading Supertonic models to {assets_dir}...")
+         snapshot_download(repo_id="Supertone/supertonic", local_dir=assets_dir)
+
+     onnx_dir = os.path.join(assets_dir, "onnx")
+     tts = load_text_to_speech(onnx_dir, use_gpu=False)
+
+     _SUPERTONIC_STATE.update({"initialized": True, "tts": tts, "assets_dir": assets_dir})
+     print("Supertonic initialized.")
+
+ def get_supertonic_voices():
+     """Get list of available Supertonic voice styles."""
+     # Ensure assets are downloaded before listing voices
+     assets_dir = os.path.join(os.path.dirname(__file__), "assets")
+     if not os.path.exists(assets_dir):
+         # Assets are not on disk yet; initializing downloads them so the
+         # voice styles can be listed.
+         _init_supertonic()
+         assets_dir = _SUPERTONIC_STATE["assets_dir"]
+
+     voice_styles_dir = os.path.join(assets_dir, "voice_styles")
+     if not os.path.exists(voice_styles_dir):
+         return []
+
+     files = os.listdir(voice_styles_dir)
+     voices = [f.replace('.json', '') for f in files if f.endswith('.json')]
+     return sorted(voices)
+
+ def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
+     audio_clipped = np.clip(audio_np, -1.0, 1.0)
+     return (audio_clipped * 32767.0).astype(np.int16)
+
+ def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
+     buffer = io.BytesIO()
+     with wave.open(buffer, "wb") as wf:
+         wf.setnchannels(1)
+         wf.setsampwidth(2)
+         wf.setframerate(sample_rate)
+         wf.writeframes(audio_int16.tobytes())
+     return buffer.getvalue()
+
+ def supertonic_tts(text: str, speed: float, voice: str, steps: int):
+     if not text or not text.strip():
+         raise gr.Error("Please enter text to synthesize.")
+
+     _init_supertonic()
+     tts = _SUPERTONIC_STATE["tts"]
+     assets_dir = _SUPERTONIC_STATE["assets_dir"]
+
+     voice_path = os.path.join(assets_dir, "voice_styles", f"{voice}.json")
+     if not os.path.exists(voice_path):
+         raise gr.Error(f"Voice style {voice} not found.")
+
+     style = load_voice_style([voice_path])
+
+     try:
+         sr = tts.sample_rate
+         for audio_chunk in tts.stream(text, style, steps, speed):
+             audio_int16 = _audio_np_to_int16(audio_chunk)
+             yield _wav_bytes_from_int16(audio_int16, sr)
+
+     except Exception as e:
+         raise gr.Error(f"Error during speech generation: {str(e)}")
+
+ with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
+     gr.HTML("<h1 style='text-align: center;'>Supertonic-Hub</h1><p style='text-align: center;'>Powered by Supertone/supertonic</p>")
+
+     # Initializing here would block startup while the models download, so only
+     # list the voices already on disk; the refresh button below triggers the
+     # download and repopulates the dropdown if the list is empty.
+     try:
+         available_voices = get_supertonic_voices()
+     except Exception:
+         available_voices = []
+
+     default_voice = available_voices[0] if available_voices else None
+
+     with gr.Row(variant='panel'):
+         speed_slider = gr.Slider(
+             minimum=0.5,
+             maximum=2.0,
+             value=1.0,
+             step=0.1,
+             label='Speed'
+         )
+         steps_slider = gr.Slider(
+             minimum=1,
+             maximum=50,
+             value=5,
+             step=1,
+             label='Steps (Quality vs Speed)'
+         )
+         voice_dropdown = gr.Dropdown(
+             choices=available_voices,
+             label='Voice',
+             value=default_voice,
+             allow_custom_value=True
+         )
+
+     text_input = gr.Textbox(
+         label="Input Text",
+         placeholder="Enter the text you want to convert to speech here...",
+         lines=5,
+         value="This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
+     )
+
+     generate_btn = gr.Button(
+         "Generate Speech",
+         variant="primary",
+     )
+
+     audio_output = gr.Audio(
+         label="Generated Speech",
+         streaming=True,
+         autoplay=True
+     )
+
+     def update_voices():
+         voices = get_supertonic_voices()
+         return gr.Dropdown(choices=voices, value=voices[0] if voices else None)
+
+     # Refresh button for voices in case they weren't loaded at startup
+     refresh_btn = gr.Button("Refresh Voices (Downloads Model if needed)")
+     refresh_btn.click(fn=update_voices, outputs=voice_dropdown)
+
+     generate_inputs = [text_input, speed_slider, voice_dropdown, steps_slider]
+
+     generate_btn.click(
+         fn=supertonic_tts,
+         inputs=generate_inputs,
+         outputs=audio_output,
+         api_name="generate_speech"
+     )
+
+     text_input.submit(
+         fn=supertonic_tts,
+         inputs=generate_inputs,
+         outputs=audio_output,
+         api_name="generate_speech_enter"
+     )
+
+ if __name__ == "__main__":
+     demo.queue().launch()
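The two handlers above expose named API endpoints (`generate_speech` and `generate_speech_enter`). A minimal sketch of calling the deployed Space programmatically with `gradio_client`; the Space id `Nymbo/Supertonic-Hub` is only assumed from the demo title, and because `supertonic_tts` is a generator the job is submitted and its streamed outputs collected afterwards:

```python
from gradio_client import Client

client = Client("Nymbo/Supertonic-Hub")  # assumed Space id, taken from the demo title
job = client.submit(
    "Hello from Supertonic!",  # text
    1.0,                       # speed
    "F1",                      # voice style
    5,                         # inference steps
    api_name="/generate_speech",
)
final_chunk = job.result()     # blocks until the stream finishes
all_chunks = job.outputs()     # every streamed output received for this job
print(f"Received {len(all_chunks)} streamed chunk(s)")
```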
assets/.cache/huggingface/.gitignore ADDED
@@ -0,0 +1 @@
+ *
assets/.cache/huggingface/download/.gitattributes.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ a6344aac8c09253b3b630fb776ae94478aa0275b
+ 1763671228.0332673
assets/.cache/huggingface/download/.gitignore.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 2f152fc303670993b3cd5f4089406fb87ef8821e
+ 1763671228.1495774
assets/.cache/huggingface/download/LICENSE.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 1e2cbe6cef94c8cdf3ed1fcebc0f5317ca7ad5a1
+ 1763671228.058201
assets/.cache/huggingface/download/README.md.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 20caed64917ab1c78e5122b3ee3aee22b9f644d6
+ 1763671227.940027
assets/.cache/huggingface/download/config.json.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 163e41dac1144faedf93a23b333d728863b31ba1
+ 1763671228.0821016
assets/.cache/huggingface/download/onnx/duration_predictor.onnx.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ e30b9d2dc2da1b12af5ff4a9b6e6ac00ca0bc900a2648ec0bf6e134e386c8133
+ 1763671228.404064
assets/.cache/huggingface/download/onnx/text_encoder.onnx.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 6c857c504f84855a78e3506e882ddd4b414c7e6cc5c96a87d4e1fc741917aa4b
+ 1763671228.9067116
assets/.cache/huggingface/download/onnx/tts.json.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 2129ec38aa31ad06ee171b8cd44e75a5a41b5da4
+ 1763671228.0873225
assets/.cache/huggingface/download/onnx/tts.yml.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ d37684b0c9b7891bfaf2946921f6895ec924cc6d
+ 1763671228.3533428
assets/.cache/huggingface/download/onnx/unicode_indexer.json.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 2d0dadf8d5d7388ff8614b33172a1c64ee3ca2ae
+ 1763671228.4996374
assets/.cache/huggingface/download/onnx/vector_estimator.onnx.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 3eb36e8cc102e0db3171229a6ae87be3ff244d949997010c0edf0fd6b643483d
+ 1763671231.884892
assets/.cache/huggingface/download/onnx/vocoder.onnx.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 6886622edc640d74d28c22e8282f0cf8d8eb5dc33d0ced67ed652ef6ea68d0c3
+ 1763671231.4259956
assets/.cache/huggingface/download/voice_styles/F1.json.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ 842522854be041eabfef70e97393ffb8cbc77d37
+ 1763671228.7745795
assets/.cache/huggingface/download/voice_styles/F2.json.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ ade0e075c325a27d7ab1d19a8f5ab3f8b8f54bee
+ 1763671228.9370666
assets/.cache/huggingface/download/voice_styles/M1.json.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ f6e6b25e6ee6aa603d19bb1fdcf3cd9f35f528c6
+ 1763671228.946357
assets/.cache/huggingface/download/voice_styles/M2.json.metadata ADDED
@@ -0,0 +1,3 @@
+ 3110200b250c90179f67c387b3dcde326cc7ff43
+ a38b1a1327156f27310bcc55223e0914ddf8a615
+ 1763671229.2795043
assets/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
assets/.gitignore ADDED
@@ -0,0 +1,4 @@
+ window.json
+ filter_bank.json
+ style_extractor.onnx
+ *.npy
assets/LICENSE ADDED
@@ -0,0 +1,209 @@
1
+ BigScience Open RAIL-M License
2
+ dated August 18, 2022
3
+
4
+ Section I: PREAMBLE
5
+
6
+ This Open RAIL-M License was created by BigScience, a collaborative open innovation project aimed at
7
+ the responsible development and use of large multilingual datasets and Large Language Models
8
+ (“LLMs”). While a similar license was originally designed for the BLOOM model, we decided to adapt it
9
+ and create this license in order to propose a general open and responsible license applicable to other
10
+ machine learning based AI models (e.g. multimodal generative models).
11
+ In short, this license strives for both the open and responsible downstream use of the accompanying
12
+ model. When it comes to the open character, we took inspiration from open source permissive licenses
13
+ regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based
14
+ restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be
15
+ able to enforce the license in case potential misuses of the Model may occur. Even though downstream
16
+ derivative versions of the model could be released under different licensing terms, the latter will always
17
+ have to include - at minimum - the same use-based restrictions as the ones in the original license (this
18
+ license).
19
+ The development and use of artificial intelligence (“AI”), does not come without concerns. The world has
20
+ witnessed how AI techniques may, in some instances, become risky for the public in general. These risks
21
+ come in many forms, from racial discrimination to the misuse of sensitive information.
22
+ BigScience believes in the intersection between open and responsible AI development; thus, this License
23
+ aims to strike a balance between both in order to enable responsible open-science in the field of AI.
24
+ This License governs the use of the model (and its derivatives) and is informed by the model card
25
+ associated with the model.
26
+
27
+ NOW THEREFORE, You and Licensor agree as follows:
28
+
29
+ 1. Definitions
30
+ (a) "License" means the terms and conditions for use, reproduction, and Distribution as defined in
31
+ this document.
32
+ (b) “Data” means a collection of information and/or content extracted from the dataset used with the
33
+ Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under
34
+ this License.
35
+ (c)“Output” means the results of operating a Model as embodied in informational content resulting
36
+ therefrom.
37
+ (d)“Model” means any accompanying machine-learning based assemblies (including checkpoints),
38
+ consisting of learnt weights, parameters (including optimizer states), corresponding to the model
39
+ architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or
40
+ in part on the Data, using the Complementary Material.
41
+ (e) “Derivatives of the Model” means all modifications to the Model, works based on the Model, or any
42
+ other model which is created or initialized by transfer of patterns of the weights, parameters,
43
+ activations or output of the Model, to the other model, in order to cause the other model to perform
44
+ similarly to the Model, including - but not limited to - distillation methods entailing the use of
45
+ intermediate data representations or methods based on the generation of synthetic data by the Model
46
+ for training the other model.
47
+ (f)“Complementary Material” means the accompanying source code and scripts used to define,
48
+ run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if
49
+ any. This includes any accompanying documentation, tutorials, examples, etc, if any.
50
+ (g) “Distribution” means any transmission, reproduction, publication or other sharing of the Model or
51
+ Derivatives of the Model to a third party, including providing the Model as a hosted service made
52
+ available by electronic or other remote means - e.g. API-based or web access.
53
+ (h) “Licensor” means the copyright owner or entity authorized by the copyright owner that is
54
+ granting the License, including the persons or entities that may have rights in the Model and/or
55
+ distributing the Model.
56
+ (i) "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this
57
+ License and/or making use of the Model for whichever purpose and in any field of use, including
58
+ usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
59
+ (j) “Third Parties” means individuals or legal entities that are not under common control with
60
+ Licensor or You.
61
+ (k) "Contribution" means any work of authorship, including the original version of the Model and
62
+ any modifications or additions to that Model or Derivatives of the Model thereof, that is
63
+ intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an
64
+ individual or Legal Entity authorized to submit on behalf of the copyright owner. For the
65
+ purposes of this definition,
66
+ “submitted” means any form of electronic, verbal, or written
67
+ communication sent to the Licensor or its representatives, including but not limited to
68
+ communication on electronic mailing lists, source code control systems, and issue tracking
69
+ systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and
70
+ improving the Model, but excluding communication that is conspicuously marked or otherwise
71
+ designated in writing by the copyright owner as "Not a Contribution."
72
+ (l) "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a
73
+ Contribution has been received by Licensor and subsequently incorporated within the Model.
74
+
75
+
76
+ Section II: INTELLECTUAL PROPERTY RIGHTS
77
+
78
+ Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary
79
+ Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.
80
+
81
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor
82
+ hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the
83
+ Complementary Material, the Model, and Derivatives of the Model.
84
+
85
+ 3. Grant of Patent License. Subject to the terms and conditions of this License and where and as
86
+ applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge,
87
+ royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer
88
+ to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such
89
+ license applies only to those patent claims licensable by such Contributor that are necessarily infringed by
90
+ their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such
91
+ Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim
92
+ or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution
93
+ incorporated within the Model and/or Complementary Material constitutes direct or contributory patent
94
+ infringement, then any patent licenses granted to You under this License for the Model and/or Work shall
95
+ terminate as of the date such litigation is asserted or filed.
96
+ Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
97
+
98
+ 4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g.
99
+ software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof
100
+ in any medium, with or without modifications, provided that You meet the following conditions:
101
+
102
+ a. Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision
103
+ by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the
104
+ Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to,
105
+ that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply
106
+ to the use of Complementary Material.
107
+
108
+ b. You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this
109
+ License;
110
+
111
+ c. You must cause any modified files to carry prominent notices stating that You changed the files;
112
+
113
+ d. You must retain all copyright, patent, trademark, and attribution notices excluding those notices
114
+ that do not pertain to any part of the Model, Derivatives of the Model.
115
+ You may add Your own copyright statement to Your modifications and may provide additional or
116
+ different license terms and conditions - respecting paragraph 4.a.
117
+ - for use, reproduction, or Distribution
118
+ of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use,
119
+ reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
120
+
121
+ 5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions.
122
+ Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You
123
+ may use the Model subject to this License, including only for lawful purposes and in accordance with the
124
+ License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or
125
+ reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model
126
+ to comply with the terms of this paragraph (paragraph 5).
127
+
128
+ 6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You
129
+ generate using the Model. You are accountable for the Output you generate and its subsequent uses. No
130
+ use of the output can contravene any provision as stated in the License.
131
+
132
+ Section IV: OTHER PROVISIONS
133
+
134
+ 7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the
135
+ right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model
136
+ through electronic means, or modify the Output of the Model based on updates. You shall undertake
137
+ reasonable efforts to use the latest version of the Model.
138
+
139
+ 8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks,
140
+ trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the
141
+ parties; and any rights not expressly granted herein are reserved by the Licensors.
142
+
143
+ 9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides
144
+ the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS
145
+ IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
146
+ including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT,
147
+ MERCHANTABILITY , or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for
148
+ determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the
149
+ Complementary Material and assume any risks associated with Your exercise of permissions under this
150
+ License.
151
+
152
+ 10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence),
153
+ contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or
154
+ agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect,
155
+ special, incidental, or consequential damages of any character arising as a result of this License or out of
156
+ the use or inability to use the Model and the Complementary Material (including but not limited to
157
+ damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other
158
+ commercial damages or losses), even if such Contributor has been advised of the possibility of such
159
+ damages.
160
+
161
+ 11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the
162
+ Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance
163
+ of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License.
164
+ However, in accepting such obligations, You may act only on Your own behalf and on Your sole
165
+ responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and
166
+ hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor
167
+ by reason of your accepting any such warranty or additional liability.
168
+
169
+ 12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining
170
+ provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
171
+
172
+ END OF TERMS AND CONDITIONS
173
+
174
+ Attachment A
175
+
176
+ Use Restrictions
177
+
178
+ You agree not to use the Model or Derivatives of the Model:
179
+ (a) In any way that violates any applicable national, federal, state, local or international law
180
+ or regulation;
181
+ (b) For the purpose of exploiting, harming or attempting to exploit or harm minors in any
182
+ way;
183
+ (c) To generate or disseminate verifiably false information and/or content with the purpose of
184
+ harming others;
185
+ (d) To generate or disseminate personal identifiable information that can be used to harm an
186
+ individual;
187
+ (e) To generate or disseminate information and/or content (e.g. images, code, posts, articles),
188
+ and place the information and/or content in any context (e.g. bot generating tweets)
189
+ without expressly and intelligibly disclaiming that the information and/or content is
190
+ machine generated;
191
+ (f) To defame, disparage or otherwise harass others;
192
+ (g) To impersonate or attempt to impersonate (e.g. deepfakes) others without their consent;
193
+ (h) For fully automated decision making that adversely impacts an individual’s legal rights or
194
+ otherwise creates or modifies a binding, enforceable obligation;
195
+ (i) For any use intended to or which has the effect of discriminating against or harming
196
+ individuals or groups based on online or offline social behavior or known or predicted
197
+ personal or personality characteristics;
198
+ (j) To exploit any of the vulnerabilities of a specific group of persons based on their age,
199
+ social, physical or mental characteristics, in order to materially distort the behavior of a
200
+ person pertaining to that group in a manner that causes or is likely to cause that person or
201
+ another person physical or psychological harm;
202
+ (k) For any use intended to or which has the effect of discriminating against individuals or
203
+ groups based on legally protected characteristics or categories;
204
+ (l) To provide medical advice and medical results interpretation;
205
+ (m) To generate or disseminate information for the purpose to be used for administration of
206
+ justice, law enforcement, immigration or asylum processes, such as predicting an
207
+ individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal
208
+ relationships between assertions made in documents, indiscriminate and
209
+ arbitrarily-targeted use).
assets/README.md ADDED
@@ -0,0 +1,161 @@
+ ---
+ license: openrail
+ language:
+ - en
+ pipeline_tag: text-to-speech
+ library_name: transformers.js
+ ---
+
+ # Supertonic — Lightning Fast, On-Device TTS
+
+ **Supertonic** is a lightning-fast, on-device text-to-speech system designed for **extreme performance** with minimal computational overhead. Powered by ONNX Runtime, it runs entirely on your device—no cloud, no API calls, no privacy concerns.
+
+ > 🎧 **Try it now**: Experience Supertonic in your browser with the [**Interactive Demo**](https://huggingface.co/spaces/Supertone/supertonic#interactive-demo) or the [**Hugging Face app**](https://huggingface.co/spaces/akhaliq/supertonic), or get started with pre-trained models from the [**Hugging Face Hub**](https://huggingface.co/Supertone/supertonic).
+
+ > 🛠 **GitHub Repository**
+ > The easiest way to use Supertonic is through the official GitHub repository:
+ > https://github.com/supertone-inc/supertonic
+ > There you'll find example code for multiple languages.
+
+ ### Table of Contents
+
+ - [Why Supertonic?](#why-supertonic)
+ - [Language Support](#language-support)
+ - [Getting Started](#getting-started)
+ - [Performance](#performance)
+ - [Citation](#citation)
+ - [License](#license)
+
+ ## Why Supertonic?
+
+ - **⚡ Blazingly Fast**: Generates speech up to **167× faster than real-time** on consumer hardware (M4 Pro)—unmatched by any other TTS system
+ - **🪶 Ultra Lightweight**: Only **66M parameters**, optimized for efficient on-device performance with a minimal footprint
+ - **📱 On-Device Capable**: **Complete privacy** and **zero latency**—all processing happens locally on your device
+ - **🎨 Natural Text Handling**: Seamlessly processes numbers, dates, currency, abbreviations, and complex expressions without pre-processing
+ - **⚙️ Highly Configurable**: Adjust inference steps, batch processing, and other parameters to match your specific needs
+ - **🧩 Flexible Deployment**: Deploy seamlessly across servers, browsers, and edge devices with multiple runtime backends
+
+
+ ## Language Support
+
+ We provide ready-to-use TTS inference examples across multiple ecosystems:
+
+ | Language/Platform | Path | Description |
+ |-------------------|------|-------------|
+ | [**Python**] | `py/` | ONNX Runtime inference |
+ | [**Node.js**] | `nodejs/` | Server-side JavaScript |
+ | [**Browser**] | `web/` | WebGPU/WASM inference |
+ | [**Java**] | `java/` | Cross-platform JVM |
+ | [**C++**] | `cpp/` | High-performance C++ |
+ | [**C#**] | `csharp/` | .NET ecosystem |
+ | [**Go**] | `go/` | Go implementation |
+ | [**Swift**] | `swift/` | macOS applications |
+ | [**iOS**] | `ios/` | Native iOS apps |
+ | [**Rust**] | `rust/` | Memory-safe systems |
+
+ > For detailed usage instructions, please refer to the README.md in each language directory.
+
+ ## Getting Started
+
+ First, clone the repository:
+
+ ```bash
+ git clone https://github.com/supertone-inc/supertonic.git
+ cd supertonic
+ ```
+
+ ### Prerequisites
+
+ Before running the examples, download the ONNX models and preset voices, and place them in the `assets` directory:
+
+ ```bash
+ git clone https://huggingface.co/Supertone/supertonic assets
+ ```
+
+ > **Note:** The Hugging Face repository uses Git LFS. Please ensure Git LFS is installed and initialized before cloning or pulling large model files.
+ > - macOS: `brew install git-lfs && git lfs install`
+ > - Generic: see https://git-lfs.com for installers
+
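+ For a quick end-to-end check in Python, the demo Space bundled with this commit ships a small `helper.py` wrapper around the ONNX Runtime sessions. A minimal sketch, assuming that module is on your path and the assets were cloned as above (the loader and call signatures follow the Space's `app.py` and `helper.py`, not an official package API):
+
+ ```python
+ import soundfile as sf
+ from helper import load_text_to_speech, load_voice_style
+
+ tts = load_text_to_speech("assets/onnx", use_gpu=False)      # build the ONNX Runtime sessions
+ style = load_voice_style(["assets/voice_styles/F1.json"])    # one of F1 / F2 / M1 / M2
+ wav, duration = tts("Hello from Supertonic!", style, total_step=5, speed=1.0)
+ sf.write("hello.wav", wav.squeeze(), tts.sample_rate)         # 44.1 kHz float waveform
+ print(f"Wrote {duration[0]:.2f}s of audio at {tts.sample_rate} Hz")
+ ```
+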
+ ### Technical Details
+
+ - **Runtime**: ONNX Runtime for cross-platform inference (CPU-optimized; GPU mode is not tested)
+ - **Browser Support**: onnxruntime-web for client-side inference
+ - **Batch Processing**: Supports batch inference for improved throughput
+ - **Audio Output**: Outputs 16-bit WAV files (see the conversion sketch below)
+
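+ The 16-bit output is a plain linear-PCM quantization of the model's float waveform. A minimal sketch of that conversion, mirroring the `_audio_np_to_int16` helper used by the demo Space in this commit (the sine tone is just a stand-in for model output):
+
+ ```python
+ import numpy as np
+ import soundfile as sf
+
+ def float_to_int16(audio: np.ndarray) -> np.ndarray:
+     # Clip to [-1, 1] and scale to the int16 range.
+     return (np.clip(audio, -1.0, 1.0) * 32767.0).astype(np.int16)
+
+ sr = 44100                                              # matches ae.sample_rate in tts.json
+ t = np.linspace(0, 1, sr, endpoint=False)
+ audio = 0.5 * np.sin(2 * np.pi * 440 * t).astype(np.float32)
+ sf.write("tone.wav", float_to_int16(audio), sr)         # written as 16-bit PCM WAV
+ ```
+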
+ ## Performance
+
+ We evaluated Supertonic's performance (with 2 inference steps) using two key metrics across input texts of varying lengths: Short (59 chars), Mid (152 chars), and Long (266 chars).
+
+ **Metrics:**
+ - **Characters per Second**: Throughput, computed as the number of input characters divided by the time required to generate the audio. Higher is better.
+ - **Real-time Factor (RTF)**: The time taken to synthesize audio divided by the duration of that audio. Lower is better (e.g., an RTF of 0.1 means it takes 0.1 seconds to generate one second of audio). Both metrics are illustrated in the snippet below.
+
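+ As a quick illustration of how the two numbers relate (the timings here are made-up placeholders, not measurements):
+
+ ```python
+ # Hypothetical run: 152 characters synthesized into 9.5 s of audio in 0.12 s.
+ n_chars = 152
+ synthesis_time_s = 0.12
+ audio_duration_s = 9.5
+
+ chars_per_second = n_chars / synthesis_time_s    # ~1267 chars/s (higher is better)
+ rtf = synthesis_time_s / audio_duration_s        # ~0.013 (lower is better)
+ print(f"{chars_per_second:.0f} chars/s, RTF {rtf:.3f}")
+ ```
+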
+ ### Characters per Second
+
+ | System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
+ |--------|-----------------|----------------|-----------------|
+ | **Supertonic** (M4 Pro - CPU) | 912 | 1048 | 1263 |
+ | **Supertonic** (M4 Pro - WebGPU) | 996 | 1801 | 2509 |
+ | **Supertonic** (RTX 4090) | 2615 | 6548 | 12164 |
+ | `API` [ElevenLabs Flash v2.5](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) | 144 | 209 | 287 |
+ | `API` [OpenAI TTS-1](https://platform.openai.com/docs/guides/text-to-speech) | 37 | 55 | 82 |
+ | `API` [Gemini 2.5 Flash TTS](https://ai.google.dev/gemini-api/docs/speech-generation) | 12 | 18 | 24 |
+ | `API` [Supertone Sona speech 1](https://docs.supertoneapi.com/en/api-reference/endpoints/text-to-speech) | 38 | 64 | 92 |
+ | `Open` [Kokoro](https://github.com/hexgrad/kokoro/) | 104 | 107 | 117 |
+ | `Open` [NeuTTS Air](https://github.com/neuphonic/neutts-air) | 37 | 42 | 47 |
+
+ > **Notes:**
+ > `API` = Cloud-based API services (measured from Seoul)
+ > `Open` = Open-source models
+ > Supertonic (M4 Pro - CPU) and (M4 Pro - WebGPU): Tested with ONNX
+ > Supertonic (RTX 4090): Tested with the PyTorch model
+ > Kokoro: Tested on M4 Pro CPU with ONNX
+ > NeuTTS Air: Tested on M4 Pro CPU with Q8-GGUF
+
+ ### Real-time Factor
+
+ | System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
+ |--------|-----------------|----------------|-----------------|
+ | **Supertonic** (M4 Pro - CPU) | 0.015 | 0.013 | 0.012 |
+ | **Supertonic** (M4 Pro - WebGPU) | 0.014 | 0.007 | 0.006 |
+ | **Supertonic** (RTX 4090) | 0.005 | 0.002 | 0.001 |
+ | `API` [ElevenLabs Flash v2.5](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) | 0.133 | 0.077 | 0.057 |
+ | `API` [OpenAI TTS-1](https://platform.openai.com/docs/guides/text-to-speech) | 0.471 | 0.302 | 0.201 |
+ | `API` [Gemini 2.5 Flash TTS](https://ai.google.dev/gemini-api/docs/speech-generation) | 1.060 | 0.673 | 0.541 |
+ | `API` [Supertone Sona speech 1](https://docs.supertoneapi.com/en/api-reference/endpoints/text-to-speech) | 0.372 | 0.206 | 0.163 |
+ | `Open` [Kokoro](https://github.com/hexgrad/kokoro/) | 0.144 | 0.124 | 0.126 |
+ | `Open` [NeuTTS Air](https://github.com/neuphonic/neutts-air) | 0.390 | 0.338 | 0.343 |
+
+ <details>
+ <summary><b>Additional Performance Data (5-step inference)</b></summary>
+
+ <br>
+
+ **Characters per Second (5-step)**
+
+ | System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
+ |--------|-----------------|----------------|-----------------|
+ | **Supertonic** (M4 Pro - CPU) | 596 | 691 | 850 |
+ | **Supertonic** (M4 Pro - WebGPU) | 570 | 1118 | 1546 |
+ | **Supertonic** (RTX 4090) | 1286 | 3757 | 6242 |
+
+ **Real-time Factor (5-step)**
+
+ | System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
+ |--------|-----------------|----------------|-----------------|
+ | **Supertonic** (M4 Pro - CPU) | 0.023 | 0.019 | 0.018 |
+ | **Supertonic** (M4 Pro - WebGPU) | 0.024 | 0.012 | 0.010 |
+ | **Supertonic** (RTX 4090) | 0.011 | 0.004 | 0.002 |
+
+ </details>
+
+ ## License
+
+ This project's sample code is released under the MIT License; see the [LICENSE](https://github.com/supertone-inc/supertonic?tab=MIT-1-ov-file) for details.
+
+ The accompanying model is released under the OpenRAIL-M License; see the [LICENSE](https://huggingface.co/Supertone/supertonic/blob/main/LICENSE) file for details.
+
+ This model was trained using PyTorch, which is licensed under the BSD 3-Clause License but is not redistributed with this project; see the [LICENSE](https://docs.pytorch.org/FBGEMM/general/License.html) for details.
+
+ Copyright (c) 2025 Supertone Inc.
assets/config.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "model_name": "Supertonic",
+     "model_type": "onnx",
+     "description": "This is a stub config for Hugging Face download counting. The actual model is located at onnx/"
+ }
assets/onnx/duration_predictor.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e30b9d2dc2da1b12af5ff4a9b6e6ac00ca0bc900a2648ec0bf6e134e386c8133
+ size 1590703
assets/onnx/text_encoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c857c504f84855a78e3506e882ddd4b414c7e6cc5c96a87d4e1fc741917aa4b
+ size 27978387
assets/onnx/tts.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "tts_version": "v1.5.0",
3
+ "split": "opensource-en",
4
+ "ttl_ckpt_path": "unknown.pt",
5
+ "dp_ckpt_path": "unknown.pt",
6
+ "ae_ckpt_path": "unknown.pt",
7
+ "ttl_train": "unknown",
8
+ "dp_train": "unknown",
9
+ "ae_train": "unknown",
10
+ "ttl": {
11
+ "latent_dim": 24,
12
+ "chunk_compress_factor": 6,
13
+ "batch_expander": {
14
+ "n_batch_expand": 6
15
+ },
16
+ "normalizer": {
17
+ "scale": 0.25
18
+ },
19
+ "text_encoder": {
20
+ "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
21
+ "text_embedder": {
22
+ "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
23
+ "char_emb_dim": 256
24
+ },
25
+ "convnext": {
26
+ "idim": 256,
27
+ "ksz": 5,
28
+ "intermediate_dim": 1024,
29
+ "num_layers": 6,
30
+ "dilation_lst": [
31
+ 1,
32
+ 1,
33
+ 1,
34
+ 1,
35
+ 1,
36
+ 1
37
+ ]
38
+ },
39
+ "attn_encoder": {
40
+ "hidden_channels": 256,
41
+ "filter_channels": 1024,
42
+ "n_heads": 4,
43
+ "n_layers": 4,
44
+ "p_dropout": 0.0
45
+ },
46
+ "proj_out": {
47
+ "idim": 256,
48
+ "odim": 256
49
+ }
50
+ },
51
+ "flow_matching": {
52
+ "sig_min": 0
53
+ },
54
+ "style_encoder": {
55
+ "proj_in": {
56
+ "ldim": 24,
57
+ "chunk_compress_factor": 6,
58
+ "odim": 256
59
+ },
60
+ "convnext": {
61
+ "idim": 256,
62
+ "ksz": 5,
63
+ "intermediate_dim": 1024,
64
+ "num_layers": 6,
65
+ "dilation_lst": [
66
+ 1,
67
+ 1,
68
+ 1,
69
+ 1,
70
+ 1,
71
+ 1
72
+ ]
73
+ },
74
+ "style_token_layer": {
75
+ "input_dim": 256,
76
+ "n_style": 50,
77
+ "style_key_dim": 256,
78
+ "style_value_dim": 256,
79
+ "prototype_dim": 256,
80
+ "n_units": 256,
81
+ "n_heads": 2
82
+ }
83
+ },
84
+ "speech_prompted_text_encoder": {
85
+ "text_dim": 256,
86
+ "style_dim": 256,
87
+ "n_units": 256,
88
+ "n_heads": 2
89
+ },
90
+ "uncond_masker": {
91
+ "prob_both_uncond": 0.04,
92
+ "prob_text_uncond": 0.01,
93
+ "std": 0.1,
94
+ "text_dim": 256,
95
+ "n_style": 50,
96
+ "style_key_dim": 256,
97
+ "style_value_dim": 256
98
+ },
99
+ "vector_field": {
100
+ "proj_in": {
101
+ "ldim": 24,
102
+ "chunk_compress_factor": 6,
103
+ "odim": 512
104
+ },
105
+ "time_encoder": {
106
+ "time_dim": 64,
107
+ "hdim": 256
108
+ },
109
+ "main_blocks": {
110
+ "n_blocks": 4,
111
+ "time_cond_layer": {
112
+ "idim": 512,
113
+ "time_dim": 64
114
+ },
115
+ "style_cond_layer": {
116
+ "idim": 512,
117
+ "style_dim": 256
118
+ },
119
+ "text_cond_layer": {
120
+ "idim": 512,
121
+ "text_dim": 256,
122
+ "n_heads": 4,
123
+ "use_residual": true,
124
+ "rotary_base": 10000,
125
+ "rotary_scale": 10
126
+ },
127
+ "convnext_0": {
128
+ "idim": 512,
129
+ "ksz": 5,
130
+ "intermediate_dim": 1024,
131
+ "num_layers": 4,
132
+ "dilation_lst": [
133
+ 1,
134
+ 2,
135
+ 4,
136
+ 8
137
+ ]
138
+ },
139
+ "convnext_1": {
140
+ "idim": 512,
141
+ "ksz": 5,
142
+ "intermediate_dim": 1024,
143
+ "num_layers": 1,
144
+ "dilation_lst": [
145
+ 1
146
+ ]
147
+ },
148
+ "convnext_2": {
149
+ "idim": 512,
150
+ "ksz": 5,
151
+ "intermediate_dim": 1024,
152
+ "num_layers": 1,
153
+ "dilation_lst": [
154
+ 1
155
+ ]
156
+ }
157
+ },
158
+ "last_convnext": {
159
+ "idim": 512,
160
+ "ksz": 5,
161
+ "intermediate_dim": 1024,
162
+ "num_layers": 4,
163
+ "dilation_lst": [
164
+ 1,
165
+ 1,
166
+ 1,
167
+ 1
168
+ ]
169
+ },
170
+ "proj_out": {
171
+ "idim": 512,
172
+ "chunk_compress_factor": 6,
173
+ "ldim": 24
174
+ }
175
+ }
176
+ },
177
+ "ae": {
178
+ "sample_rate": 44100,
179
+ "n_delay": 0,
180
+ "base_chunk_size": 512,
181
+ "chunk_compress_factor": 1,
182
+ "ldim": 24,
183
+ "encoder": {
184
+ "spec_processor": {
185
+ "n_fft": 2048,
186
+ "win_length": 2048,
187
+ "hop_length": 512,
188
+ "n_mels": 228,
189
+ "sample_rate": 44100,
190
+ "eps": 1e-05,
191
+ "norm_mean": 0.0,
192
+ "norm_std": 1.0
193
+ },
194
+ "ksz_init": 7,
195
+ "ksz": 7,
196
+ "num_layers": 10,
197
+ "dilation_lst": [
198
+ 1,
199
+ 1,
200
+ 1,
201
+ 1,
202
+ 1,
203
+ 1,
204
+ 1,
205
+ 1,
206
+ 1,
207
+ 1
208
+ ],
209
+ "intermediate_dim": 2048,
210
+ "idim": 1253,
211
+ "hdim": 512,
212
+ "odim": 24
213
+ },
214
+ "decoder": {
215
+ "ksz_init": 7,
216
+ "ksz": 7,
217
+ "num_layers": 10,
218
+ "dilation_lst": [
219
+ 1,
220
+ 2,
221
+ 4,
222
+ 1,
223
+ 2,
224
+ 4,
225
+ 1,
226
+ 1,
227
+ 1,
228
+ 1
229
+ ],
230
+ "intermediate_dim": 2048,
231
+ "idim": 24,
232
+ "hdim": 512,
233
+ "head": {
234
+ "idim": 512,
235
+ "hdim": 2048,
236
+ "odim": 512,
237
+ "ksz": 3
238
+ }
239
+ }
240
+ },
241
+ "dp": {
242
+ "latent_dim": 24,
243
+ "chunk_compress_factor": 6,
244
+ "normalizer": {
245
+ "scale": 1.0
246
+ },
247
+ "sentence_encoder": {
248
+ "char_emb_dim": 64,
249
+ "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
250
+ "text_embedder": {
251
+ "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
252
+ "char_emb_dim": 64
253
+ },
254
+ "convnext": {
255
+ "idim": 64,
256
+ "ksz": 5,
257
+ "intermediate_dim": 256,
258
+ "num_layers": 6,
259
+ "dilation_lst": [
260
+ 1,
261
+ 1,
262
+ 1,
263
+ 1,
264
+ 1,
265
+ 1
266
+ ]
267
+ },
268
+ "attn_encoder": {
269
+ "hidden_channels": 64,
270
+ "filter_channels": 256,
271
+ "n_heads": 2,
272
+ "n_layers": 2,
273
+ "p_dropout": 0.0
274
+ },
275
+ "proj_out": {
276
+ "idim": 64,
277
+ "odim": 64
278
+ }
279
+ },
280
+ "style_encoder": {
281
+ "proj_in": {
282
+ "ldim": 24,
283
+ "chunk_compress_factor": 6,
284
+ "odim": 64
285
+ },
286
+ "convnext": {
287
+ "idim": 64,
288
+ "ksz": 5,
289
+ "intermediate_dim": 256,
290
+ "num_layers": 4,
291
+ "dilation_lst": [
292
+ 1,
293
+ 1,
294
+ 1,
295
+ 1
296
+ ]
297
+ },
298
+ "style_token_layer": {
299
+ "input_dim": 64,
300
+ "n_style": 8,
301
+ "style_key_dim": 0,
302
+ "style_value_dim": 16,
303
+ "prototype_dim": 64,
304
+ "n_units": 64,
305
+ "n_heads": 2
306
+ }
307
+ },
308
+ "predictor": {
309
+ "sentence_dim": 64,
310
+ "n_style": 8,
311
+ "style_dim": 16,
312
+ "hdim": 128,
313
+ "n_layer": 2
314
+ }
315
+ }
316
+ }
assets/onnx/tts.yml ADDED
@@ -0,0 +1,223 @@
1
+ tts_version: "v1.5.0"
2
+
3
+ split: "opensource-en"
4
+
5
+ ttl_ckpt_path: "unknown.pt"
6
+
7
+ dp_ckpt_path: "unknown.pt"
8
+
9
+ ae_ckpt_path: "unknown.pt"
10
+
11
+ ttl_train: "unknown"
12
+
13
+ dp_train: "unknown"
14
+
15
+ ae_train: "unknown"
16
+
17
+ ttl:
18
+ latent_dim: 24
19
+ chunk_compress_factor: 6
20
+ batch_expander:
21
+ n_batch_expand: 6
22
+ normalizer:
23
+ scale: 0.25
24
+ text_encoder:
25
+ char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
26
+ text_embedder:
27
+ char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
28
+ char_emb_dim: 256
29
+ convnext:
30
+ idim: 256
31
+ ksz: 5
32
+ intermediate_dim: 1024
33
+ num_layers: 6
34
+ dilation_lst: [1, 1, 1, 1, 1, 1]
35
+ attn_encoder:
36
+ hidden_channels: 256
37
+ filter_channels: 1024
38
+ n_heads: 4
39
+ n_layers: 4
40
+ p_dropout: 0.0
41
+ proj_out:
42
+ idim: 256
43
+ odim: 256
44
+ flow_matching:
45
+ sig_min: 0
46
+ style_encoder:
47
+ proj_in:
48
+ ldim: 24
49
+ chunk_compress_factor: 6
50
+ odim: 256
51
+ convnext:
52
+ idim: 256
53
+ ksz: 5
54
+ intermediate_dim: 1024
55
+ num_layers: 6
56
+ dilation_lst: [1, 1, 1, 1, 1, 1]
57
+ style_token_layer:
58
+ input_dim: 256
59
+ n_style: 50
60
+ style_key_dim: 256
61
+ style_value_dim: 256
62
+ prototype_dim: 256
63
+ n_units: 256
64
+ n_heads: 2
65
+ speech_prompted_text_encoder:
66
+ text_dim: 256
67
+ style_dim: 256
68
+ n_units: 256
69
+ n_heads: 2
70
+ uncond_masker:
71
+ prob_both_uncond: 0.04
72
+ prob_text_uncond: 0.01
73
+ std: 0.1
74
+ text_dim: 256
75
+ n_style: 50
76
+ style_key_dim: 256
77
+ style_value_dim: 256
78
+ vector_field:
79
+ proj_in:
80
+ ldim: 24
81
+ chunk_compress_factor: 6
82
+ odim: 512
83
+ time_encoder:
84
+ time_dim: 64
85
+ hdim: 256
86
+ main_blocks:
87
+ n_blocks: 4
88
+ time_cond_layer:
89
+ idim: 512
90
+ time_dim: 64
91
+ style_cond_layer:
92
+ idim: 512
93
+ style_dim: 256
94
+ text_cond_layer:
95
+ idim: 512
96
+ text_dim: 256
97
+ n_heads: 4
98
+ use_residual: True
99
+ rotary_base: 10000
100
+ rotary_scale: 10
101
+ convnext_0:
102
+ idim: 512
103
+ ksz: 5
104
+ intermediate_dim: 1024
105
+ num_layers: 4
106
+ dilation_lst: [1, 2, 4, 8]
107
+ convnext_1:
108
+ idim: 512
109
+ ksz: 5
110
+ intermediate_dim: 1024
111
+ num_layers: 1
112
+ dilation_lst: [1]
113
+ convnext_2:
114
+ idim: 512
115
+ ksz: 5
116
+ intermediate_dim: 1024
117
+ num_layers: 1
118
+ dilation_lst: [1]
119
+ last_convnext:
120
+ idim: 512
121
+ ksz: 5
122
+ intermediate_dim: 1024
123
+ num_layers: 4
124
+ dilation_lst: [1, 1, 1, 1]
125
+ proj_out:
126
+ idim: 512
127
+ chunk_compress_factor: 6
128
+ ldim: 24
129
+
130
+ ae:
131
+ sample_rate: 44100
132
+ n_delay: 0
133
+ base_chunk_size: 512
134
+ chunk_compress_factor: 1
135
+ ldim: 24
136
+ encoder:
137
+ spec_processor:
138
+ n_fft: 2048
139
+ win_length: 2048
140
+ hop_length: 512
141
+ n_mels: 228
142
+ sample_rate: 44100
143
+ eps: 1e-05
144
+ norm_mean: 0.0
145
+ norm_std: 1.0
146
+ ksz_init: 7
147
+ ksz: 7
148
+ num_layers: 10
149
+ dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
150
+ intermediate_dim: 2048
151
+ idim: 1253
152
+ hdim: 512
153
+ odim: 24
154
+ decoder:
155
+ ksz_init: 7
156
+ ksz: 7
157
+ num_layers: 10
158
+ dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
159
+ intermediate_dim: 2048
160
+ idim: 24
161
+ hdim: 512
162
+ head:
163
+ idim: 512
164
+ hdim: 2048
165
+ odim: 512
166
+ ksz: 3
167
+
168
+ dp:
169
+ latent_dim: 24
170
+ chunk_compress_factor: 6
171
+ normalizer:
172
+ scale: 1.0
173
+ sentence_encoder:
174
+ char_emb_dim: 64
175
+ char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
176
+ text_embedder:
177
+ char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
178
+ char_emb_dim: 64
179
+ convnext:
180
+ idim: 64
181
+ ksz: 5
182
+ intermediate_dim: 256
183
+ num_layers: 6
184
+ dilation_lst: [1, 1, 1, 1, 1, 1]
185
+ attn_encoder:
186
+ hidden_channels: 64
187
+ filter_channels: 256
188
+ n_heads: 2
189
+ n_layers: 2
190
+ p_dropout: 0.0
191
+ proj_out:
192
+ idim: 64
193
+ odim: 64
194
+ style_encoder:
195
+ proj_in:
196
+ ldim: 24
197
+ chunk_compress_factor: 6
198
+ odim: 64
199
+ convnext:
200
+ idim: 64
201
+ ksz: 5
202
+ intermediate_dim: 256
203
+ num_layers: 4
204
+ dilation_lst: [1, 1, 1, 1]
205
+ style_token_layer:
206
+ input_dim: 64
207
+ n_style: 8
208
+ style_key_dim: 0
209
+ style_value_dim: 16
210
+ prototype_dim: 64
211
+ n_units: 64
212
+ n_heads: 2
213
+ predictor:
214
+ sentence_dim: 64
215
+ n_style: 8
216
+ style_dim: 16
217
+ hdim: 128
218
+ n_layer: 2
219
+
220
+ unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
221
+ unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
222
+ window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
223
+ filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"
assets/onnx/unicode_indexer.json ADDED
The diff for this file is too large to render. See raw diff
 
assets/onnx/vector_estimator.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3eb36e8cc102e0db3171229a6ae87be3ff244d949997010c0edf0fd6b643483d
+ size 132517477
assets/onnx/vocoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6886622edc640d74d28c22e8282f0cf8d8eb5dc33d0ced67ed652ef6ea68d0c3
+ size 101424195
assets/voice_styles/F1.json ADDED
The diff for this file is too large to render. See raw diff
 
assets/voice_styles/F2.json ADDED
The diff for this file is too large to render. See raw diff
 
assets/voice_styles/M1.json ADDED
The diff for this file is too large to render. See raw diff
 
assets/voice_styles/M2.json ADDED
The diff for this file is too large to render. See raw diff
 
helper.py ADDED
@@ -0,0 +1,349 @@
1
+ import json
2
+ import os
3
+ import time
4
+ from contextlib import contextmanager
5
+ from typing import Optional
6
+ from unicodedata import normalize
7
+ import re
8
+
9
+ import numpy as np
10
+ import onnxruntime as ort
11
+
12
+
13
+ class UnicodeProcessor:
14
+ def __init__(self, unicode_indexer_path: str):
15
+ with open(unicode_indexer_path, "r") as f:
16
+ self.indexer = json.load(f)
17
+
18
+ def _preprocess_text(self, text: str) -> str:
19
+ # TODO: add more preprocessing
20
+ text = normalize("NFKD", text)
21
+ return text
22
+
23
+ def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
24
+ text_mask = length_to_mask(text_ids_lengths)
25
+ return text_mask
26
+
27
+ def _text_to_unicode_values(self, text: str) -> np.ndarray:
28
+ unicode_values = np.array(
29
+ [ord(char) for char in text], dtype=np.uint16
30
+ ) # 2 bytes
31
+ return unicode_values
32
+
33
+ def __call__(self, text_list: list[str]) -> tuple[np.ndarray, np.ndarray]:
34
+ text_list = [self._preprocess_text(t) for t in text_list]
35
+ text_ids_lengths = np.array([len(text) for text in text_list], dtype=np.int64)
36
+ text_ids = np.zeros((len(text_list), text_ids_lengths.max()), dtype=np.int64)
37
+ for i, text in enumerate(text_list):
38
+ unicode_vals = self._text_to_unicode_values(text)
39
+ text_ids[i, : len(unicode_vals)] = np.array(
40
+ [self.indexer[val] for val in unicode_vals], dtype=np.int64
41
+ )
42
+ text_mask = self._get_text_mask(text_ids_lengths)
43
+ return text_ids, text_mask
44
+
45
+
46
+ class Style:
47
+ def __init__(self, style_ttl_onnx: np.ndarray, style_dp_onnx: np.ndarray):
48
+ self.ttl = style_ttl_onnx
49
+ self.dp = style_dp_onnx
50
+
51
+
52
+ class TextToSpeech:
53
+ def __init__(
54
+ self,
55
+ cfgs: dict,
56
+ text_processor: UnicodeProcessor,
57
+ dp_ort: ort.InferenceSession,
58
+ text_enc_ort: ort.InferenceSession,
59
+ vector_est_ort: ort.InferenceSession,
60
+ vocoder_ort: ort.InferenceSession,
61
+ ):
62
+ self.cfgs = cfgs
63
+ self.text_processor = text_processor
64
+ self.dp_ort = dp_ort
65
+ self.text_enc_ort = text_enc_ort
66
+ self.vector_est_ort = vector_est_ort
67
+ self.vocoder_ort = vocoder_ort
68
+ self.sample_rate = cfgs["ae"]["sample_rate"]
69
+ self.base_chunk_size = cfgs["ae"]["base_chunk_size"]
70
+ self.chunk_compress_factor = cfgs["ttl"]["chunk_compress_factor"]
71
+ self.ldim = cfgs["ttl"]["latent_dim"]
72
+
73
+ def sample_noisy_latent(
74
+ self, duration: np.ndarray
75
+ ) -> tuple[np.ndarray, np.ndarray]:
76
+ bsz = len(duration)
77
+ wav_len_max = duration.max() * self.sample_rate
78
+ wav_lengths = (duration * self.sample_rate).astype(np.int64)
79
+ chunk_size = self.base_chunk_size * self.chunk_compress_factor
80
+ latent_len = ((wav_len_max + chunk_size - 1) / chunk_size).astype(np.int32)
81
+ latent_dim = self.ldim * self.chunk_compress_factor
82
+ noisy_latent = np.random.randn(bsz, latent_dim, latent_len).astype(np.float32)
83
+ latent_mask = get_latent_mask(
84
+ wav_lengths, self.base_chunk_size, self.chunk_compress_factor
85
+ )
86
+
87
+ noisy_latent = noisy_latent * latent_mask
88
+ return noisy_latent, latent_mask
89
+
90
+     def _infer(
+         self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
+     ) -> tuple[np.ndarray, np.ndarray]:
+         assert (
+             len(text_list) == style.ttl.shape[0]
+         ), "Number of texts must match number of style vectors"
+         bsz = len(text_list)
+         text_ids, text_mask = self.text_processor(text_list)
+         dur_onnx, *_ = self.dp_ort.run(
+             None, {"text_ids": text_ids, "style_dp": style.dp, "text_mask": text_mask}
+         )  # dur_onnx: [bsz]
+         dur_onnx = dur_onnx / speed
+         text_emb_onnx, *_ = self.text_enc_ort.run(
+             None,
+             {"text_ids": text_ids, "style_ttl": style.ttl, "text_mask": text_mask},
+         )
+         xt, latent_mask = self.sample_noisy_latent(dur_onnx)
+         total_step_np = np.array([total_step] * bsz, dtype=np.float32)
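+         # Iteratively refine the noisy latent for `total_step` steps, conditioned
+         # on the text embedding and style; each step passes the current and total
+         # step indices to the vector estimator.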
+         for step in range(total_step):
+             current_step = np.array([step] * bsz, dtype=np.float32)
+             xt, *_ = self.vector_est_ort.run(
+                 None,
+                 {
+                     "noisy_latent": xt,
+                     "text_emb": text_emb_onnx,
+                     "style_ttl": style.ttl,
+                     "text_mask": text_mask,
+                     "latent_mask": latent_mask,
+                     "current_step": current_step,
+                     "total_step": total_step_np,
+                 },
+             )
+         wav, *_ = self.vocoder_ort.run(None, {"latent": xt})
+         return wav, dur_onnx
+
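+     # Single-utterance synthesis: long input text is split into chunks, each
+     # chunk is synthesized separately, and the pieces are joined with
+     # `silence_duration` seconds of silence.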
+     def __call__(
+         self,
+         text: str,
+         style: Style,
+         total_step: int,
+         speed: float = 1.05,
+         silence_duration: float = 0.3,
+     ) -> tuple[np.ndarray, np.ndarray]:
+         assert (
+             style.ttl.shape[0] == 1
+         ), "Single speaker text to speech only supports single style"
+         text_list = chunk_text(text)
+         wav_cat = None
+         dur_cat = None
+         for text in text_list:
+             wav, dur_onnx = self._infer([text], style, total_step, speed)
+             if wav_cat is None:
+                 wav_cat = wav
+                 dur_cat = dur_onnx
+             else:
+                 silence = np.zeros(
+                     (1, int(silence_duration * self.sample_rate)), dtype=np.float32
+                 )
+                 wav_cat = np.concatenate([wav_cat, silence, wav], axis=1)
+                 dur_cat += dur_onnx + silence_duration
+         return wav_cat, dur_cat
+
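+     # Streaming variant: yields audio chunk by chunk so playback can begin
+     # before the whole text has been synthesized, with a short silence
+     # yielded between chunks.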
+     def stream(
+         self,
+         text: str,
+         style: Style,
+         total_step: int,
+         speed: float = 1.05,
+         silence_duration: float = 0.3,
+     ):
+         assert (
+             style.ttl.shape[0] == 1
+         ), "Single speaker text to speech only supports single style"
+         text_list = chunk_text(text)
+
+         for i, text in enumerate(text_list):
+             wav, _ = self._infer([text], style, total_step, speed)
+             yield wav.flatten()
+
+             if i < len(text_list) - 1:
+                 silence = np.zeros(
+                     (int(silence_duration * self.sample_rate),), dtype=np.float32
+                 )
+                 yield silence
+
+     def batch(
+         self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
+     ) -> tuple[np.ndarray, np.ndarray]:
+         return self._infer(text_list, style, total_step, speed)
+
+
+ def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
+     """
+     Convert lengths to binary mask.
+
+     Args:
+         lengths: (B,)
+         max_len: int
+
+     Returns:
+         mask: (B, 1, max_len)
+     """
+     max_len = max_len or lengths.max()
+     ids = np.arange(0, max_len)
+     mask = (ids < np.expand_dims(lengths, axis=1)).astype(np.float32)
+     return mask.reshape(-1, 1, max_len)
+
+
+ def get_latent_mask(
+     wav_lengths: np.ndarray, base_chunk_size: int, chunk_compress_factor: int
+ ) -> np.ndarray:
+     latent_size = base_chunk_size * chunk_compress_factor
+     latent_lengths = (wav_lengths + latent_size - 1) // latent_size
+     latent_mask = length_to_mask(latent_lengths)
+     return latent_mask
+
+
+ def load_onnx(
+     onnx_path: str, opts: ort.SessionOptions, providers: list[str]
+ ) -> ort.InferenceSession:
+     return ort.InferenceSession(onnx_path, sess_options=opts, providers=providers)
+
+
+ def load_onnx_all(
+     onnx_dir: str, opts: ort.SessionOptions, providers: list[str]
+ ) -> tuple[
+     ort.InferenceSession,
+     ort.InferenceSession,
+     ort.InferenceSession,
+     ort.InferenceSession,
+ ]:
+     dp_onnx_path = os.path.join(onnx_dir, "duration_predictor.onnx")
+     text_enc_onnx_path = os.path.join(onnx_dir, "text_encoder.onnx")
+     vector_est_onnx_path = os.path.join(onnx_dir, "vector_estimator.onnx")
+     vocoder_onnx_path = os.path.join(onnx_dir, "vocoder.onnx")
+
+     dp_ort = load_onnx(dp_onnx_path, opts, providers)
+     text_enc_ort = load_onnx(text_enc_onnx_path, opts, providers)
+     vector_est_ort = load_onnx(vector_est_onnx_path, opts, providers)
+     vocoder_ort = load_onnx(vocoder_onnx_path, opts, providers)
+     return dp_ort, text_enc_ort, vector_est_ort, vocoder_ort
+
+
+ def load_cfgs(onnx_dir: str) -> dict:
+     cfg_path = os.path.join(onnx_dir, "tts.json")
+     with open(cfg_path, "r") as f:
+         cfgs = json.load(f)
+     return cfgs
+
+
+ def load_text_processor(onnx_dir: str) -> UnicodeProcessor:
+     unicode_indexer_path = os.path.join(onnx_dir, "unicode_indexer.json")
+     text_processor = UnicodeProcessor(unicode_indexer_path)
+     return text_processor
+
+
+ def load_text_to_speech(onnx_dir: str, use_gpu: bool = False) -> TextToSpeech:
+     opts = ort.SessionOptions()
+     if use_gpu:
+         raise NotImplementedError("GPU mode is not fully tested")
+     else:
+         providers = ["CPUExecutionProvider"]
+         print("Using CPU for inference")
+     cfgs = load_cfgs(onnx_dir)
+     dp_ort, text_enc_ort, vector_est_ort, vocoder_ort = load_onnx_all(
+         onnx_dir, opts, providers
+     )
+     text_processor = load_text_processor(onnx_dir)
+     return TextToSpeech(
+         cfgs, text_processor, dp_ort, text_enc_ort, vector_est_ort, vocoder_ort
+     )
+
+
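+ # Each voice style JSON stores `style_ttl` and `style_dp` as a flattened `data`
+ # array plus its original `dims`; styles are stacked along the batch dimension.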
+ def load_voice_style(voice_style_paths: list[str], verbose: bool = False) -> Style:
+     bsz = len(voice_style_paths)
+
+     # Read first file to get dimensions
+     with open(voice_style_paths[0], "r") as f:
+         first_style = json.load(f)
+     ttl_dims = first_style["style_ttl"]["dims"]
+     dp_dims = first_style["style_dp"]["dims"]
+
+     # Pre-allocate arrays with full batch size
+     ttl_style = np.zeros([bsz, ttl_dims[1], ttl_dims[2]], dtype=np.float32)
+     dp_style = np.zeros([bsz, dp_dims[1], dp_dims[2]], dtype=np.float32)
+
+     # Fill in the data
+     for i, voice_style_path in enumerate(voice_style_paths):
+         with open(voice_style_path, "r") as f:
+             voice_style = json.load(f)
+
+         ttl_data = np.array(
+             voice_style["style_ttl"]["data"], dtype=np.float32
+         ).flatten()
+         ttl_style[i] = ttl_data.reshape(ttl_dims[1], ttl_dims[2])
+
+         dp_data = np.array(
+             voice_style["style_dp"]["data"], dtype=np.float32
+         ).flatten()
+         dp_style[i] = dp_data.reshape(dp_dims[1], dp_dims[2])
+
+     if verbose:
+         print(f"Loaded {bsz} voice styles")
+     return Style(ttl_style, dp_style)
+
+
+ @contextmanager
+ def timer(name: str):
+     start = time.time()
+     print(f"{name}...")
+     yield
+     print(f" -> {name} completed in {time.time() - start:.2f} sec")
+
+
+ def sanitize_filename(text: str, max_len: int) -> str:
+     """Sanitize filename by replacing non-alphanumeric characters with underscores"""
+     prefix = text[:max_len]
+     return re.sub(r"[^a-zA-Z0-9]", "_", prefix)
+
+
+ def chunk_text(text: str, max_len: int = 300) -> list[str]:
+     """
+     Split text into chunks by paragraphs and sentences.
+
+     Args:
+         text: Input text to chunk
+         max_len: Maximum length of each chunk (default: 300)
+
+     Returns:
+         List of text chunks
+     """
+     # Split by paragraph (two or more newlines)
+     paragraphs = [p.strip() for p in re.split(r"\n\s*\n+", text.strip()) if p.strip()]
+
+     chunks = []
+
+     for paragraph in paragraphs:
+         paragraph = paragraph.strip()
+         if not paragraph:
+             continue
+
+         # Split by sentence boundaries (period, question mark, exclamation mark followed by space)
+         # But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
+         pattern = r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+"
+         sentences = re.split(pattern, paragraph)
+
+         current_chunk = ""
+
+         for sentence in sentences:
+             if len(current_chunk) + len(sentence) + 1 <= max_len:
+                 current_chunk += (" " if current_chunk else "") + sentence
+             else:
+                 if current_chunk:
+                     chunks.append(current_chunk.strip())
+                 current_chunk = sentence
+
+         if current_chunk:
+             chunks.append(current_chunk.strip())
+
+     return chunks
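+
+
+ # Example usage (illustrative sketch; the asset paths and total_step value are
+ # placeholders, not values taken from this file):
+ #
+ #     tts = load_text_to_speech("assets/onnx", use_gpu=False)
+ #     style = load_voice_style(["assets/voice_styles/F1.json"])
+ #     wav, dur = tts("Hello from Supertonic.", style, total_step=16)
+ #     # wav has shape (1, num_samples) sampled at tts.sample_rate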
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
+ gradio
+ numpy>=1.26.0
+ onnxruntime==1.23.1
+ soundfile>=0.12.1
+ librosa>=0.10.0
+ PyYAML>=6.0
+ huggingface_hub