Upload 36 files
- app.py +161 -0
- assets/.cache/huggingface/.gitignore +1 -0
- assets/.cache/huggingface/download/.gitattributes.metadata +3 -0
- assets/.cache/huggingface/download/.gitignore.metadata +3 -0
- assets/.cache/huggingface/download/LICENSE.metadata +3 -0
- assets/.cache/huggingface/download/README.md.metadata +3 -0
- assets/.cache/huggingface/download/config.json.metadata +3 -0
- assets/.cache/huggingface/download/onnx/duration_predictor.onnx.metadata +3 -0
- assets/.cache/huggingface/download/onnx/text_encoder.onnx.metadata +3 -0
- assets/.cache/huggingface/download/onnx/tts.json.metadata +3 -0
- assets/.cache/huggingface/download/onnx/tts.yml.metadata +3 -0
- assets/.cache/huggingface/download/onnx/unicode_indexer.json.metadata +3 -0
- assets/.cache/huggingface/download/onnx/vector_estimator.onnx.metadata +3 -0
- assets/.cache/huggingface/download/onnx/vocoder.onnx.metadata +3 -0
- assets/.cache/huggingface/download/voice_styles/F1.json.metadata +3 -0
- assets/.cache/huggingface/download/voice_styles/F2.json.metadata +3 -0
- assets/.cache/huggingface/download/voice_styles/M1.json.metadata +3 -0
- assets/.cache/huggingface/download/voice_styles/M2.json.metadata +3 -0
- assets/.gitattributes +35 -0
- assets/.gitignore +4 -0
- assets/LICENSE +209 -0
- assets/README.md +161 -0
- assets/config.json +5 -0
- assets/onnx/duration_predictor.onnx +3 -0
- assets/onnx/text_encoder.onnx +3 -0
- assets/onnx/tts.json +316 -0
- assets/onnx/tts.yml +223 -0
- assets/onnx/unicode_indexer.json +0 -0
- assets/onnx/vector_estimator.onnx +3 -0
- assets/onnx/vocoder.onnx +3 -0
- assets/voice_styles/F1.json +0 -0
- assets/voice_styles/F2.json +0 -0
- assets/voice_styles/M1.json +0 -0
- assets/voice_styles/M2.json +0 -0
- helper.py +349 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,161 @@
import gradio as gr
import os
import io
import wave
import numpy as np
import soundfile as sf
from huggingface_hub import snapshot_download
from helper import load_text_to_speech, load_voice_style

_SUPERTONIC_STATE = {"initialized": False, "tts": None, "assets_dir": None}

def _init_supertonic() -> None:
    if _SUPERTONIC_STATE["initialized"]:
        return

    print("Initializing Supertonic...")
    # Download models if not present
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        print(f"Downloading Supertonic models to {assets_dir}...")
        snapshot_download(repo_id="Supertone/supertonic", local_dir=assets_dir)

    onnx_dir = os.path.join(assets_dir, "onnx")
    tts = load_text_to_speech(onnx_dir, use_gpu=False)

    _SUPERTONIC_STATE.update({"initialized": True, "tts": tts, "assets_dir": assets_dir})
    print("Supertonic initialized.")

def get_supertonic_voices():
    """Get list of available Supertonic voice styles."""
    # Ensure assets are downloaded to list voices
    assets_dir = os.path.join(os.path.dirname(__file__), "assets")
    if not os.path.exists(assets_dir):
        # If not initialized/downloaded yet, we might not see voices.
        # But we can try to download just to list, or just init.
        _init_supertonic()
        assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_styles_dir = os.path.join(assets_dir, "voice_styles")
    if not os.path.exists(voice_styles_dir):
        return []

    files = os.listdir(voice_styles_dir)
    voices = [f.replace('.json', '') for f in files if f.endswith('.json')]
    return sorted(voices)

def _audio_np_to_int16(audio_np: np.ndarray) -> np.ndarray:
    audio_clipped = np.clip(audio_np, -1.0, 1.0)
    return (audio_clipped * 32767.0).astype(np.int16)

def _wav_bytes_from_int16(audio_int16: np.ndarray, sample_rate: int) -> bytes:
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio_int16.tobytes())
    return buffer.getvalue()

def supertonic_tts(text: str, speed: float, voice: str, steps: int):
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_supertonic()
    tts = _SUPERTONIC_STATE["tts"]
    assets_dir = _SUPERTONIC_STATE["assets_dir"]

    voice_path = os.path.join(assets_dir, "voice_styles", f"{voice}.json")
    if not os.path.exists(voice_path):
        raise gr.Error(f"Voice style {voice} not found.")

    style = load_voice_style([voice_path])

    try:
        sr = tts.sample_rate
        for audio_chunk in tts.stream(text, style, steps, speed):
            audio_int16 = _audio_np_to_int16(audio_chunk)
            yield _wav_bytes_from_int16(audio_int16, sr)

    except Exception as e:
        raise gr.Error(f"Error during speech generation: {str(e)}")

with gr.Blocks(theme='Nymbo/Nymbo_Theme') as demo:
    gr.HTML("<h1 style='text-align: center;'>Supertonic-Hub</h1><p style='text-align: center;'>Powered by Supertone/supertonic</p>")

    # We need to initialize to get voices, but we don't want to block startup too long if download is needed.
    # For now, let's try to get voices, if empty, user might need to click generate to trigger download/init first?
    # Or we can just list a default if not found.
    try:
        available_voices = get_supertonic_voices()
    except Exception:
        available_voices = []

    default_voice = available_voices[0] if available_voices else None

    with gr.Row(variant='panel'):
        speed_slider = gr.Slider(
            minimum=0.5,
            maximum=2.0,
            value=1.0,
            step=0.1,
            label='Speed'
        )
        steps_slider = gr.Slider(
            minimum=1,
            maximum=50,
            value=5,
            step=1,
            label='Steps (Quality vs Speed)'
        )
        voice_dropdown = gr.Dropdown(
            choices=available_voices,
            label='Voice',
            value=default_voice,
            allow_custom_value=True
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        lines=5,
        value="This morning, I took a walk in the park, and the sound of the birds and the breeze was so pleasant that I stopped for a long time just to listen."
    )

    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )

    audio_output = gr.Audio(
        label="Generated Speech",
        streaming=True,
        autoplay=True
    )

    def update_voices():
        voices = get_supertonic_voices()
        return gr.Dropdown(choices=voices, value=voices[0] if voices else None)

    # Add a refresh button for voices in case they weren't loaded initially
    refresh_btn = gr.Button("Refresh Voices (Downloads Model if needed)")
    refresh_btn.click(fn=update_voices, outputs=voice_dropdown)

    generate_inputs = [text_input, speed_slider, voice_dropdown, steps_slider]

    generate_btn.click(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech"
    )

    text_input.submit(
        fn=supertonic_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter"
    )

if __name__ == "__main__":
    demo.queue().launch()
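Since both handlers are registered with an `api_name`, the endpoint can also be called programmatically. A minimal sketch using `gradio_client` (the Space ID below is a placeholder, and the exact output form depends on the Gradio version; streamed audio typically comes back as file paths):

```python
# Minimal sketch: call the Space's named endpoint via gradio_client.
# "your-username/supertonic-hub" is a placeholder Space ID.
from gradio_client import Client

client = Client("your-username/supertonic-hub")
result = client.predict(
    "Hello from Supertonic!",   # text
    1.0,                        # speed
    "F1",                       # voice style name
    5,                          # inference steps
    api_name="/generate_speech",
)
print(result)  # typically a filepath (or list of filepaths) to generated audio
```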
assets/.cache/huggingface/.gitignore
ADDED
@@ -0,0 +1 @@
*
assets/.cache/huggingface/download/.gitattributes.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
a6344aac8c09253b3b630fb776ae94478aa0275b
1763671228.0332673
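Each of the sixteen `.metadata` files in this commit follows the same three-line layout that `huggingface_hub` appears to write when `snapshot_download` targets a `local_dir`: the repo commit hash, the file's etag (a git SHA-1 for regular files, a SHA-256 for LFS files), and a Unix download timestamp. A minimal parsing sketch, assuming that layout:

```python
# Sketch: read one huggingface_hub local-dir metadata file
# (assumed three lines: commit hash, file etag, Unix download timestamp).
from pathlib import Path

meta = Path("assets/.cache/huggingface/download/.gitattributes.metadata")
commit_hash, etag, timestamp = meta.read_text().splitlines()[:3]
print(f"commit={commit_hash} etag={etag} downloaded_at={float(timestamp)}")
```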
assets/.cache/huggingface/download/.gitignore.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
2f152fc303670993b3cd5f4089406fb87ef8821e
1763671228.1495774
assets/.cache/huggingface/download/LICENSE.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
1e2cbe6cef94c8cdf3ed1fcebc0f5317ca7ad5a1
1763671228.058201
assets/.cache/huggingface/download/README.md.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
20caed64917ab1c78e5122b3ee3aee22b9f644d6
1763671227.940027
assets/.cache/huggingface/download/config.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
163e41dac1144faedf93a23b333d728863b31ba1
1763671228.0821016
assets/.cache/huggingface/download/onnx/duration_predictor.onnx.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
e30b9d2dc2da1b12af5ff4a9b6e6ac00ca0bc900a2648ec0bf6e134e386c8133
1763671228.404064
assets/.cache/huggingface/download/onnx/text_encoder.onnx.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
6c857c504f84855a78e3506e882ddd4b414c7e6cc5c96a87d4e1fc741917aa4b
1763671228.9067116
assets/.cache/huggingface/download/onnx/tts.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
2129ec38aa31ad06ee171b8cd44e75a5a41b5da4
1763671228.0873225
assets/.cache/huggingface/download/onnx/tts.yml.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
d37684b0c9b7891bfaf2946921f6895ec924cc6d
1763671228.3533428
assets/.cache/huggingface/download/onnx/unicode_indexer.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
2d0dadf8d5d7388ff8614b33172a1c64ee3ca2ae
1763671228.4996374
assets/.cache/huggingface/download/onnx/vector_estimator.onnx.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
3eb36e8cc102e0db3171229a6ae87be3ff244d949997010c0edf0fd6b643483d
1763671231.884892
assets/.cache/huggingface/download/onnx/vocoder.onnx.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
6886622edc640d74d28c22e8282f0cf8d8eb5dc33d0ced67ed652ef6ea68d0c3
1763671231.4259956
assets/.cache/huggingface/download/voice_styles/F1.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
842522854be041eabfef70e97393ffb8cbc77d37
1763671228.7745795
assets/.cache/huggingface/download/voice_styles/F2.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
ade0e075c325a27d7ab1d19a8f5ab3f8b8f54bee
1763671228.9370666
assets/.cache/huggingface/download/voice_styles/M1.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
f6e6b25e6ee6aa603d19bb1fdcf3cd9f35f528c6
1763671228.946357
assets/.cache/huggingface/download/voice_styles/M2.json.metadata
ADDED
@@ -0,0 +1,3 @@
3110200b250c90179f67c387b3dcde326cc7ff43
a38b1a1327156f27310bcc55223e0914ddf8a615
1763671229.2795043
assets/.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
assets/.gitignore
ADDED
@@ -0,0 +1,4 @@
window.json
filter_bank.json
style_extractor.onnx
*.npy
assets/LICENSE
ADDED
@@ -0,0 +1,209 @@
BigScience Open RAIL-M License
dated August 18, 2022

Section I: PREAMBLE

This Open RAIL-M License was created by BigScience, a collaborative open innovation project aimed at
the responsible development and use of large multilingual datasets and Large Language Models
(“LLMs”). While a similar license was originally designed for the BLOOM model, we decided to adapt it
and create this license in order to propose a general open and responsible license applicable to other
machine learning based AI models (e.g. multimodal generative models).
In short, this license strives for both the open and responsible downstream use of the accompanying
model. When it comes to the open character, we took inspiration from open source permissive licenses
regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based
restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be
able to enforce the license in case potential misuses of the Model may occur. Even though downstream
derivative versions of the model could be released under different licensing terms, the latter will always
have to include - at minimum - the same use-based restrictions as the ones in the original license (this
license).
The development and use of artificial intelligence (“AI”), does not come without concerns. The world has
witnessed how AI techniques may, in some instances, become risky for the public in general. These risks
come in many forms, from racial discrimination to the misuse of sensitive information.
BigScience believes in the intersection between open and responsible AI development; thus, this License
aims to strike a balance between both in order to enable responsible open-science in the field of AI.
This License governs the use of the model (and its derivatives) and is informed by the model card
associated with the model.

NOW THEREFORE, You and Licensor agree as follows:

1. Definitions
(a) "License" means the terms and conditions for use, reproduction, and Distribution as defined in
this document.
(b) “Data” means a collection of information and/or content extracted from the dataset used with the
Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under
this License.
(c) “Output” means the results of operating a Model as embodied in informational content resulting
therefrom.
(d) “Model” means any accompanying machine-learning based assemblies (including checkpoints),
consisting of learnt weights, parameters (including optimizer states), corresponding to the model
architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or
in part on the Data, using the Complementary Material.
(e) “Derivatives of the Model” means all modifications to the Model, works based on the Model, or any
other model which is created or initialized by transfer of patterns of the weights, parameters,
activations or output of the Model, to the other model, in order to cause the other model to perform
similarly to the Model, including - but not limited to - distillation methods entailing the use of
intermediate data representations or methods based on the generation of synthetic data by the Model
for training the other model.
(f) “Complementary Material” means the accompanying source code and scripts used to define,
run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if
any. This includes any accompanying documentation, tutorials, examples, etc, if any.
(g) “Distribution” means any transmission, reproduction, publication or other sharing of the Model or
Derivatives of the Model to a third party, including providing the Model as a hosted service made
available by electronic or other remote means - e.g. API-based or web access.
(h) “Licensor” means the copyright owner or entity authorized by the copyright owner that is
granting the License, including the persons or entities that may have rights in the Model and/or
distributing the Model.
(i) "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this
License and/or making use of the Model for whichever purpose and in any field of use, including
usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
(j) “Third Parties” means individuals or legal entities that are not under common control with
Licensor or You.
(k) "Contribution" means any work of authorship, including the original version of the Model and
any modifications or additions to that Model or Derivatives of the Model thereof, that is
intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an
individual or Legal Entity authorized to submit on behalf of the copyright owner. For the
purposes of this definition, “submitted” means any form of electronic, verbal, or written
communication sent to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and issue tracking
systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and
improving the Model, but excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
(l) "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a
Contribution has been received by Licensor and subsequently incorporated within the Model.

Section II: INTELLECTUAL PROPERTY RIGHTS

Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary
Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.

2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor
hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright
license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the
Complementary Material, the Model, and Derivatives of the Model.

3. Grant of Patent License. Subject to the terms and conditions of this License and where and as
applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge,
royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer
to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such
license applies only to those patent claims licensable by such Contributor that are necessarily infringed by
their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such
Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim
or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution
incorporated within the Model and/or Complementary Material constitutes direct or contributory patent
infringement, then any patent licenses granted to You under this License for the Model and/or Work shall
terminate as of the date such litigation is asserted or filed.

Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION

4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g.
software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof
in any medium, with or without modifications, provided that You meet the following conditions:

a. Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision
by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the
Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to,
that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply
to the use of Complementary Material.

b. You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this
License;

c. You must cause any modified files to carry prominent notices stating that You changed the files;

d. You must retain all copyright, patent, trademark, and attribution notices excluding those notices
that do not pertain to any part of the Model, Derivatives of the Model.
You may add Your own copyright statement to Your modifications and may provide additional or
different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution
of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use,
reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.

5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions.
Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You
may use the Model subject to this License, including only for lawful purposes and in accordance with the
License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or
reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model
to comply with the terms of this paragraph (paragraph 5).

6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You
generate using the Model. You are accountable for the Output you generate and its subsequent uses. No
use of the output can contravene any provision as stated in the License.

Section IV: OTHER PROVISIONS

7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the
right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model
through electronic means, or modify the Output of the Model based on updates. You shall undertake
reasonable efforts to use the latest version of the Model.

8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks,
trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the
parties; and any rights not expressly granted herein are reserved by the Licensors.

9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides
the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS
IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT,
MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for
determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the
Complementary Material and assume any risks associated with Your exercise of permissions under this
License.

10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or
agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect,
special, incidental, or consequential damages of any character arising as a result of this License or out of
the use or inability to use the Model and the Complementary Material (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other
commercial damages or losses), even if such Contributor has been advised of the possibility of such
damages.

11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the
Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance
of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License.
However, in accepting such obligations, You may act only on Your own behalf and on Your sole
responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and
hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor
by reason of your accepting any such warranty or additional liability.

12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining
provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.

END OF TERMS AND CONDITIONS

Attachment A

Use Restrictions

You agree not to use the Model or Derivatives of the Model:
(a) In any way that violates any applicable national, federal, state, local or international law
or regulation;
(b) For the purpose of exploiting, harming or attempting to exploit or harm minors in any
way;
(c) To generate or disseminate verifiably false information and/or content with the purpose of
harming others;
(d) To generate or disseminate personal identifiable information that can be used to harm an
individual;
(e) To generate or disseminate information and/or content (e.g. images, code, posts, articles),
and place the information and/or content in any context (e.g. bot generating tweets)
without expressly and intelligibly disclaiming that the information and/or content is
machine generated;
(f) To defame, disparage or otherwise harass others;
(g) To impersonate or attempt to impersonate (e.g. deepfakes) others without their consent;
(h) For fully automated decision making that adversely impacts an individual’s legal rights or
otherwise creates or modifies a binding, enforceable obligation;
(i) For any use intended to or which has the effect of discriminating against or harming
individuals or groups based on online or offline social behavior or known or predicted
personal or personality characteristics;
(j) To exploit any of the vulnerabilities of a specific group of persons based on their age,
social, physical or mental characteristics, in order to materially distort the behavior of a
person pertaining to that group in a manner that causes or is likely to cause that person or
another person physical or psychological harm;
(k) For any use intended to or which has the effect of discriminating against individuals or
groups based on legally protected characteristics or categories;
(l) To provide medical advice and medical results interpretation;
(m) To generate or disseminate information for the purpose to be used for administration of
justice, law enforcement, immigration or asylum processes, such as predicting an
individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal
relationships between assertions made in documents, indiscriminate and
arbitrarily-targeted use).
assets/README.md
ADDED
@@ -0,0 +1,161 @@
---
license: openrail
language:
- en
pipeline_tag: text-to-speech
library_name: transformers.js
---

# Supertonic — Lightning Fast, On-Device TTS

**Supertonic** is a lightning-fast, on-device text-to-speech system designed for **extreme performance** with minimal computational overhead. Powered by ONNX Runtime, it runs entirely on your device—no cloud, no API calls, no privacy concerns.

> 🎧 **Try it now**: Experience Supertonic in your browser with our [**Interactive Demo**](https://huggingface.co/spaces/Supertone/supertonic#interactive-demo) or the [**Hugging Face app**](https://huggingface.co/spaces/akhaliq/supertonic), or get started with pre-trained models from the [**Hugging Face Hub**](https://huggingface.co/Supertone/supertonic).

> 🛠 **GitHub Repository**
> The easiest way to use Supertonic is through the official GitHub repository:
> https://github.com/supertone-inc/supertonic
> You’ll find example code for multiple languages there.

### Table of Contents

- [Why Supertonic?](#why-supertonic)
- [Language Support](#language-support)
- [Getting Started](#getting-started)
- [Performance](#performance)
- [Citation](#citation)
- [License](#license)

## Why Supertonic?

- **⚡ Blazingly Fast**: Generates speech up to **167× faster than real-time** on consumer hardware (M4 Pro)—unmatched by any other TTS system
- **🪶 Ultra Lightweight**: Only **66M parameters**, optimized for efficient on-device performance with minimal footprint
- **📱 On-Device Capable**: **Complete privacy** and **zero latency**—all processing happens locally on your device
- **🎨 Natural Text Handling**: Seamlessly processes numbers, dates, currency, abbreviations, and complex expressions without pre-processing
- **⚙️ Highly Configurable**: Adjust inference steps, batch processing, and other parameters to match your specific needs
- **🧩 Flexible Deployment**: Deploy seamlessly across servers, browsers, and edge devices with multiple runtime backends

## Language Support

We provide ready-to-use TTS inference examples across multiple ecosystems:

| Language/Platform | Path | Description |
|-------------------|------|-------------|
| **Python** | `py/` | ONNX Runtime inference |
| **Node.js** | `nodejs/` | Server-side JavaScript |
| **Browser** | `web/` | WebGPU/WASM inference |
| **Java** | `java/` | Cross-platform JVM |
| **C++** | `cpp/` | High-performance C++ |
| **C#** | `csharp/` | .NET ecosystem |
| **Go** | `go/` | Go implementation |
| **Swift** | `swift/` | macOS applications |
| **iOS** | `ios/` | Native iOS apps |
| **Rust** | `rust/` | Memory-safe systems |

> For detailed usage instructions, please refer to the README.md in each language directory.

## Getting Started

First, clone the repository:

```bash
git clone https://github.com/supertone-inc/supertonic.git
cd supertonic
```

### Prerequisites

Before running the examples, download the ONNX models and preset voices, and place them in the `assets` directory:

```bash
git clone https://huggingface.co/Supertone/supertonic assets
```

> **Note:** The Hugging Face repository uses Git LFS. Please ensure Git LFS is installed and initialized before cloning or pulling large model files.
> - macOS: `brew install git-lfs && git lfs install`
> - Generic: see `https://git-lfs.com` for installers
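If Git LFS is not an option, the same assets can be fetched with `huggingface_hub` instead; this mirrors the `snapshot_download` call in the Space's `app.py` above:

```python
# Alternative to `git clone`: fetch the model repo with huggingface_hub.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="Supertone/supertonic", local_dir="assets")
```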
### Technical Details

- **Runtime**: ONNX Runtime for cross-platform inference (CPU-optimized; GPU mode is not tested)
- **Browser Support**: onnxruntime-web for client-side inference
- **Batch Processing**: Supports batch inference for improved throughput
- **Audio Output**: Outputs 16-bit WAV files

## Performance

We evaluated Supertonic's performance (with 2 inference steps) using two key metrics across input texts of varying lengths: Short (59 chars), Mid (152 chars), and Long (266 chars).

**Metrics:**
- **Characters per Second**: Measures throughput by dividing the number of input characters by the time required to generate audio. Higher is better.
- **Real-time Factor (RTF)**: Measures the time taken to synthesize audio relative to its duration. Lower is better (e.g., an RTF of 0.1 means it takes 0.1 seconds to generate one second of audio). A worked example follows below.
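As a concrete illustration of both metrics (the timings here are made up to roughly match the Mid/CPU row below):

```python
# Worked example: both metrics from one timed synthesis run (illustrative numbers).
n_chars = 152          # characters in the input text
gen_seconds = 0.145    # wall-clock time spent synthesizing
audio_seconds = 11.2   # duration of the generated audio

chars_per_second = n_chars / gen_seconds  # ~1048 (higher is better)
rtf = gen_seconds / audio_seconds         # ~0.013 (lower is better)
print(f"chars/s = {chars_per_second:.0f}, RTF = {rtf:.3f}")
```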
### Characters per Second
| System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
|--------|-----------------|----------------|-----------------|
| **Supertonic** (M4 Pro - CPU) | 912 | 1048 | 1263 |
| **Supertonic** (M4 Pro - WebGPU) | 996 | 1801 | 2509 |
| **Supertonic** (RTX4090) | 2615 | 6548 | 12164 |
| `API` [ElevenLabs Flash v2.5](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) | 144 | 209 | 287 |
| `API` [OpenAI TTS-1](https://platform.openai.com/docs/guides/text-to-speech) | 37 | 55 | 82 |
| `API` [Gemini 2.5 Flash TTS](https://ai.google.dev/gemini-api/docs/speech-generation) | 12 | 18 | 24 |
| `API` [Supertone Sona speech 1](https://docs.supertoneapi.com/en/api-reference/endpoints/text-to-speech) | 38 | 64 | 92 |
| `Open` [Kokoro](https://github.com/hexgrad/kokoro/) | 104 | 107 | 117 |
| `Open` [NeuTTS Air](https://github.com/neuphonic/neutts-air) | 37 | 42 | 47 |

> **Notes:**
> `API` = Cloud-based API services (measured from Seoul)
> `Open` = Open-source models
> Supertonic (M4 Pro - CPU) and (M4 Pro - WebGPU): Tested with ONNX
> Supertonic (RTX4090): Tested with PyTorch model
> Kokoro: Tested on M4 Pro CPU with ONNX
> NeuTTS Air: Tested on M4 Pro CPU with Q8-GGUF

### Real-time Factor

| System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
|--------|-----------------|----------------|-----------------|
| **Supertonic** (M4 Pro - CPU) | 0.015 | 0.013 | 0.012 |
| **Supertonic** (M4 Pro - WebGPU) | 0.014 | 0.007 | 0.006 |
| **Supertonic** (RTX4090) | 0.005 | 0.002 | 0.001 |
| `API` [ElevenLabs Flash v2.5](https://elevenlabs.io/docs/api-reference/text-to-speech/convert) | 0.133 | 0.077 | 0.057 |
| `API` [OpenAI TTS-1](https://platform.openai.com/docs/guides/text-to-speech) | 0.471 | 0.302 | 0.201 |
| `API` [Gemini 2.5 Flash TTS](https://ai.google.dev/gemini-api/docs/speech-generation) | 1.060 | 0.673 | 0.541 |
| `API` [Supertone Sona speech 1](https://docs.supertoneapi.com/en/api-reference/endpoints/text-to-speech) | 0.372 | 0.206 | 0.163 |
| `Open` [Kokoro](https://github.com/hexgrad/kokoro/) | 0.144 | 0.124 | 0.126 |
| `Open` [NeuTTS Air](https://github.com/neuphonic/neutts-air) | 0.390 | 0.338 | 0.343 |

<details>
<summary><b>Additional Performance Data (5-step inference)</b></summary>

<br>

**Characters per Second (5-step)**

| System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
|--------|-----------------|----------------|-----------------|
| **Supertonic** (M4 Pro - CPU) | 596 | 691 | 850 |
| **Supertonic** (M4 Pro - WebGPU) | 570 | 1118 | 1546 |
| **Supertonic** (RTX4090) | 1286 | 3757 | 6242 |

**Real-time Factor (5-step)**

| System | Short (59 chars) | Mid (152 chars) | Long (266 chars) |
|--------|-----------------|----------------|-----------------|
| **Supertonic** (M4 Pro - CPU) | 0.023 | 0.019 | 0.018 |
| **Supertonic** (M4 Pro - WebGPU) | 0.024 | 0.012 | 0.010 |
| **Supertonic** (RTX4090) | 0.011 | 0.004 | 0.002 |

</details>

## License

This project’s sample code is released under the MIT License; see the [LICENSE](https://github.com/supertone-inc/supertonic?tab=MIT-1-ov-file) for details.

The accompanying model is released under the OpenRAIL-M License; see the [LICENSE](https://huggingface.co/Supertone/supertonic/blob/main/LICENSE) file for details.

This model was trained using PyTorch, which is licensed under the BSD 3-Clause License but is not redistributed with this project; see the [LICENSE](https://docs.pytorch.org/FBGEMM/general/License.html) for details.

Copyright (c) 2025 Supertone Inc.
assets/config.json
ADDED
@@ -0,0 +1,5 @@
{
  "model_name": "Supertonic",
  "model_type": "onnx",
  "description": "This is a stub config for Hugging Face download counting. The actual model is located at onnx/"
}
assets/onnx/duration_predictor.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e30b9d2dc2da1b12af5ff4a9b6e6ac00ca0bc900a2648ec0bf6e134e386c8133
size 1590703
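The `.onnx` entries in this commit are Git LFS pointer files: `oid` is the SHA-256 of the actual weights and `size` their byte count, so a downloaded file can be checked against its pointer. A minimal verification sketch (path assumed relative to the repo root):

```python
# Sketch: verify a downloaded ONNX file against its Git LFS pointer.
import hashlib
from pathlib import Path

expected = "e30b9d2dc2da1b12af5ff4a9b6e6ac00ca0bc900a2648ec0bf6e134e386c8133"
digest = hashlib.sha256(
    Path("assets/onnx/duration_predictor.onnx").read_bytes()
).hexdigest()
assert digest == expected, "duration_predictor.onnx does not match its LFS pointer"
```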
assets/onnx/text_encoder.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6c857c504f84855a78e3506e882ddd4b414c7e6cc5c96a87d4e1fc741917aa4b
size 27978387
assets/onnx/tts.json
ADDED
@@ -0,0 +1,316 @@
{
  "tts_version": "v1.5.0",
  "split": "opensource-en",
  "ttl_ckpt_path": "unknown.pt",
  "dp_ckpt_path": "unknown.pt",
  "ae_ckpt_path": "unknown.pt",
  "ttl_train": "unknown",
  "dp_train": "unknown",
  "ae_train": "unknown",
  "ttl": {
    "latent_dim": 24,
    "chunk_compress_factor": 6,
    "batch_expander": {
      "n_batch_expand": 6
    },
    "normalizer": {
      "scale": 0.25
    },
    "text_encoder": {
      "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
      "text_embedder": {
        "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
        "char_emb_dim": 256
      },
      "convnext": {
        "idim": 256,
        "ksz": 5,
        "intermediate_dim": 1024,
        "num_layers": 6,
        "dilation_lst": [1, 1, 1, 1, 1, 1]
      },
      "attn_encoder": {
        "hidden_channels": 256,
        "filter_channels": 1024,
        "n_heads": 4,
        "n_layers": 4,
        "p_dropout": 0.0
      },
      "proj_out": {
        "idim": 256,
        "odim": 256
      }
    },
    "flow_matching": {
      "sig_min": 0
    },
    "style_encoder": {
      "proj_in": {
        "ldim": 24,
        "chunk_compress_factor": 6,
        "odim": 256
      },
      "convnext": {
        "idim": 256,
        "ksz": 5,
        "intermediate_dim": 1024,
        "num_layers": 6,
        "dilation_lst": [1, 1, 1, 1, 1, 1]
      },
      "style_token_layer": {
        "input_dim": 256,
        "n_style": 50,
        "style_key_dim": 256,
        "style_value_dim": 256,
        "prototype_dim": 256,
        "n_units": 256,
        "n_heads": 2
      }
    },
    "speech_prompted_text_encoder": {
      "text_dim": 256,
      "style_dim": 256,
      "n_units": 256,
      "n_heads": 2
    },
    "uncond_masker": {
      "prob_both_uncond": 0.04,
      "prob_text_uncond": 0.01,
      "std": 0.1,
      "text_dim": 256,
      "n_style": 50,
      "style_key_dim": 256,
      "style_value_dim": 256
    },
    "vector_field": {
      "proj_in": {
        "ldim": 24,
        "chunk_compress_factor": 6,
        "odim": 512
      },
      "time_encoder": {
        "time_dim": 64,
        "hdim": 256
      },
      "main_blocks": {
        "n_blocks": 4,
        "time_cond_layer": {
          "idim": 512,
          "time_dim": 64
        },
        "style_cond_layer": {
          "idim": 512,
          "style_dim": 256
        },
        "text_cond_layer": {
          "idim": 512,
          "text_dim": 256,
          "n_heads": 4,
          "use_residual": true,
          "rotary_base": 10000,
          "rotary_scale": 10
        },
        "convnext_0": {
          "idim": 512,
          "ksz": 5,
          "intermediate_dim": 1024,
          "num_layers": 4,
          "dilation_lst": [1, 2, 4, 8]
        },
        "convnext_1": {
          "idim": 512,
          "ksz": 5,
          "intermediate_dim": 1024,
          "num_layers": 1,
          "dilation_lst": [1]
        },
        "convnext_2": {
          "idim": 512,
          "ksz": 5,
          "intermediate_dim": 1024,
          "num_layers": 1,
          "dilation_lst": [1]
        }
      },
      "last_convnext": {
        "idim": 512,
        "ksz": 5,
        "intermediate_dim": 1024,
        "num_layers": 4,
        "dilation_lst": [1, 1, 1, 1]
      },
      "proj_out": {
        "idim": 512,
        "chunk_compress_factor": 6,
        "ldim": 24
      }
    }
  },
  "ae": {
    "sample_rate": 44100,
    "n_delay": 0,
    "base_chunk_size": 512,
    "chunk_compress_factor": 1,
    "ldim": 24,
    "encoder": {
      "spec_processor": {
        "n_fft": 2048,
        "win_length": 2048,
        "hop_length": 512,
        "n_mels": 228,
        "sample_rate": 44100,
        "eps": 1e-05,
        "norm_mean": 0.0,
        "norm_std": 1.0
      },
      "ksz_init": 7,
      "ksz": 7,
      "num_layers": 10,
      "dilation_lst": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      "intermediate_dim": 2048,
      "idim": 1253,
      "hdim": 512,
      "odim": 24
    },
    "decoder": {
      "ksz_init": 7,
      "ksz": 7,
      "num_layers": 10,
      "dilation_lst": [1, 2, 4, 1, 2, 4, 1, 1, 1, 1],
      "intermediate_dim": 2048,
      "idim": 24,
      "hdim": 512,
      "head": {
        "idim": 512,
        "hdim": 2048,
        "odim": 512,
        "ksz": 3
      }
    }
  },
  "dp": {
    "latent_dim": 24,
    "chunk_compress_factor": 6,
    "normalizer": {
      "scale": 1.0
    },
    "sentence_encoder": {
      "char_emb_dim": 64,
      "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
      "text_embedder": {
        "char_dict_path": "resources/metadata/char_dict/opensource-en/char_dict.json",
        "char_emb_dim": 64
      },
      "convnext": {
        "idim": 64,
        "ksz": 5,
        "intermediate_dim": 256,
        "num_layers": 6,
        "dilation_lst": [1, 1, 1, 1, 1, 1]
      },
      "attn_encoder": {
        "hidden_channels": 64,
        "filter_channels": 256,
        "n_heads": 2,
        "n_layers": 2,
        "p_dropout": 0.0
      },
      "proj_out": {
        "idim": 64,
        "odim": 64
      }
    },
    "style_encoder": {
      "proj_in": {
        "ldim": 24,
        "chunk_compress_factor": 6,
        "odim": 64
      },
      "convnext": {
        "idim": 64,
        "ksz": 5,
        "intermediate_dim": 256,
        "num_layers": 4,
        "dilation_lst": [1, 1, 1, 1]
      },
      "style_token_layer": {
        "input_dim": 64,
        "n_style": 8,
        "style_key_dim": 0,
        "style_value_dim": 16,
        "prototype_dim": 64,
        "n_units": 64,
        "n_heads": 2
      }
    },
    "predictor": {
      "sentence_dim": 64,
      "n_style": 8,
      "style_dim": 16,
      "hdim": 128,
      "n_layer": 2
    }
  }
}
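A few of these fields matter downstream; for example, `ae.sample_rate` (44100) is presumably what `app.py` above reads back as `tts.sample_rate` when writing WAV headers, and `ttl.latent_dim` (24) is the latent width shared across the exported models. A quick inspection sketch:

```python
# Sketch: inspect a few architecture fields from the exported config.
import json

with open("assets/onnx/tts.json") as f:
    cfg = json.load(f)

print(cfg["tts_version"])        # v1.5.0
print(cfg["ae"]["sample_rate"])  # 44100
print(cfg["ttl"]["latent_dim"])  # 24
```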
assets/onnx/tts.yml
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tts_version: "v1.5.0"

split: "opensource-en"

ttl_ckpt_path: "unknown.pt"

dp_ckpt_path: "unknown.pt"

ae_ckpt_path: "unknown.pt"

ttl_train: "unknown"

dp_train: "unknown"

ae_train: "unknown"

ttl:
  latent_dim: 24
  chunk_compress_factor: 6
  batch_expander:
    n_batch_expand: 6
  normalizer:
    scale: 0.25
  text_encoder:
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 256
      filter_channels: 1024
      n_heads: 4
      n_layers: 4
      p_dropout: 0.0
    proj_out:
      idim: 256
      odim: 256
  flow_matching:
    sig_min: 0
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 256
    convnext:
      idim: 256
      ksz: 5
      intermediate_dim: 1024
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    style_token_layer:
      input_dim: 256
      n_style: 50
      style_key_dim: 256
      style_value_dim: 256
      prototype_dim: 256
      n_units: 256
      n_heads: 2
  speech_prompted_text_encoder:
    text_dim: 256
    style_dim: 256
    n_units: 256
    n_heads: 2
  uncond_masker:
    prob_both_uncond: 0.04
    prob_text_uncond: 0.01
    std: 0.1
    text_dim: 256
    n_style: 50
    style_key_dim: 256
    style_value_dim: 256
  vector_field:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 512
    time_encoder:
      time_dim: 64
      hdim: 256
    main_blocks:
      n_blocks: 4
      time_cond_layer:
        idim: 512
        time_dim: 64
      style_cond_layer:
        idim: 512
        style_dim: 256
      text_cond_layer:
        idim: 512
        text_dim: 256
        n_heads: 4
        use_residual: True
        rotary_base: 10000
        rotary_scale: 10
      convnext_0:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 4
        dilation_lst: [1, 2, 4, 8]
      convnext_1:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
      convnext_2:
        idim: 512
        ksz: 5
        intermediate_dim: 1024
        num_layers: 1
        dilation_lst: [1]
    last_convnext:
      idim: 512
      ksz: 5
      intermediate_dim: 1024
      num_layers: 4
      dilation_lst: [1, 1, 1, 1]
    proj_out:
      idim: 512
      chunk_compress_factor: 6
      ldim: 24

ae:
  sample_rate: 44100
  n_delay: 0
  base_chunk_size: 512
  chunk_compress_factor: 1
  ldim: 24
  encoder:
    spec_processor:
      n_fft: 2048
      win_length: 2048
      hop_length: 512
      n_mels: 228
      sample_rate: 44100
      eps: 1e-05
      norm_mean: 0.0
      norm_std: 1.0
    ksz_init: 7
    ksz: 7
    num_layers: 10
    dilation_lst: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    intermediate_dim: 2048
    idim: 1253
    hdim: 512
    odim: 24
  decoder:
    ksz_init: 7
    ksz: 7
    num_layers: 10
    dilation_lst: [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]
    intermediate_dim: 2048
    idim: 24
    hdim: 512
    head:
      idim: 512
      hdim: 2048
      odim: 512
      ksz: 3

dp:
  latent_dim: 24
  chunk_compress_factor: 6
  normalizer:
    scale: 1.0
  sentence_encoder:
    char_emb_dim: 64
    char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
    text_embedder:
      char_dict_path: "resources/metadata/char_dict/opensource-en/char_dict.json"
      char_emb_dim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 6
      dilation_lst: [1, 1, 1, 1, 1, 1]
    attn_encoder:
      hidden_channels: 64
      filter_channels: 256
      n_heads: 2
      n_layers: 2
      p_dropout: 0.0
    proj_out:
      idim: 64
      odim: 64
  style_encoder:
    proj_in:
      ldim: 24
      chunk_compress_factor: 6
      odim: 64
    convnext:
      idim: 64
      ksz: 5
      intermediate_dim: 256
      num_layers: 4
      dilation_lst: [1, 1, 1, 1]
    style_token_layer:
      input_dim: 64
      n_style: 8
      style_key_dim: 0
      style_value_dim: 16
      prototype_dim: 64
      n_units: 64
      n_heads: 2
  predictor:
    sentence_dim: 64
    n_style: 8
    style_dim: 16
    hdim: 128
    n_layer: 2

unicode_indexer_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.npy"
unicode_indexer_json_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/unicode_indexer.json"
window_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/window.json"
filter_bank_path: "/data/public/model/supertonic/tts/v1.5.0/opensource-en/onnx/filter_bank.json"
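tts.yml appears to be the training/export-side counterpart of tts.json: the ttl/ae/dp blocks mirror the JSON, with extra training-only fields (batch_expander, uncond_masker, the "unknown.pt" checkpoint paths) and absolute paths from the export machine. The Space reads tts.json, not this file, but PyYAML is pinned in requirements.txt, so a hedged loading sketch:

import yaml  # PyYAML

with open("assets/onnx/tts.yml", "r") as f:
    cfg = yaml.safe_load(f)

assert cfg["tts_version"] == "v1.5.0"
assert cfg["ae"]["sample_rate"] == 44100
# The trailing *_path entries point at /data/public/... on the export host
# and will not resolve inside the Space.
print(cfg["window_path"])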
assets/onnx/unicode_indexer.json
ADDED
The diff for this file is too large to render.
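The indexer itself is not rendered here, but UnicodeProcessor in helper.py (below) indexes it with raw Unicode code points (indexer[ord(char)]), so it is presumably a dense list mapping code point to character-embedding id; the raw file is not shown, so treat this probe as illustrative:

import json

with open("assets/onnx/unicode_indexer.json", "r") as f:
    indexer = json.load(f)

# Mirrors what helper.py does per character:
print([indexer[ord(ch)] for ch in "Hi!"])  # exact ids depend on the file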
assets/onnx/vector_estimator.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3eb36e8cc102e0db3171229a6ae87be3ff244d949997010c0edf0fd6b643483d
size 132517477
assets/onnx/vocoder.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6886622edc640d74d28c22e8282f0cf8d8eb5dc33d0ced67ed652ef6ea68d0c3
size 101424195
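Note that vector_estimator.onnx and vocoder.onnx are committed as Git LFS pointer files (the version/oid/size triples above), not as the weights themselves; huggingface_hub resolves them to the real binaries on download. A small, hedged sketch of reading such a pointer, should you encounter one on disk:

def parse_lfs_pointer(path: str) -> dict[str, str]:
    """Split each 'key value' line of a Git LFS pointer file."""
    fields = {}
    with open(path, "r") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# Only meaningful while the file is still a pointer, not the fetched weights:
ptr = parse_lfs_pointer("assets/onnx/vocoder.onnx")
print(ptr["oid"], int(ptr["size"]))  # sha256:6886..., 101424195 bytes (~97 MB)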
assets/voice_styles/F1.json
ADDED
The diff for this file is too large to render.
assets/voice_styles/F2.json
ADDED
The diff for this file is too large to render.
assets/voice_styles/M1.json
ADDED
The diff for this file is too large to render.
assets/voice_styles/M2.json
ADDED
The diff for this file is too large to render.
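Each voice style file carries two speaker-embedding tensors, one for the text-to-latent model and one for the duration predictor. The schema below is inferred from how load_voice_style in helper.py consumes these files (dims has a leading batch axis of 1; data flattens to dims[1] * dims[2] floats), so treat it as a hedged reader rather than a spec:

import json
import numpy as np

# Inferred schema: {"style_ttl": {"dims": [...], "data": [...]},
#                   "style_dp":  {"dims": [...], "data": [...]}}
with open("assets/voice_styles/F1.json", "r") as f:
    s = json.load(f)

ttl = np.asarray(s["style_ttl"]["data"], dtype=np.float32).reshape(s["style_ttl"]["dims"])
dp = np.asarray(s["style_dp"]["data"], dtype=np.float32).reshape(s["style_dp"]["dims"])
print(ttl.shape, dp.shape)  # leading dim expected to be 1 (single speaker)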
helper.py
ADDED
@@ -0,0 +1,349 @@
import json
import os
import time
from contextlib import contextmanager
from typing import Optional
from unicodedata import normalize
import re

import numpy as np
import onnxruntime as ort


class UnicodeProcessor:
    def __init__(self, unicode_indexer_path: str):
        with open(unicode_indexer_path, "r") as f:
            self.indexer = json.load(f)

    def _preprocess_text(self, text: str) -> str:
        # TODO: add more preprocessing
        text = normalize("NFKD", text)
        return text

    def _get_text_mask(self, text_ids_lengths: np.ndarray) -> np.ndarray:
        text_mask = length_to_mask(text_ids_lengths)
        return text_mask

    def _text_to_unicode_values(self, text: str) -> np.ndarray:
        unicode_values = np.array(
            [ord(char) for char in text], dtype=np.uint16
        )  # 2 bytes
        return unicode_values

    def __call__(self, text_list: list[str]) -> tuple[np.ndarray, np.ndarray]:
        text_list = [self._preprocess_text(t) for t in text_list]
        text_ids_lengths = np.array([len(text) for text in text_list], dtype=np.int64)
        text_ids = np.zeros((len(text_list), text_ids_lengths.max()), dtype=np.int64)
        for i, text in enumerate(text_list):
            unicode_vals = self._text_to_unicode_values(text)
            text_ids[i, : len(unicode_vals)] = np.array(
                [self.indexer[val] for val in unicode_vals], dtype=np.int64
            )
        text_mask = self._get_text_mask(text_ids_lengths)
        return text_ids, text_mask


class Style:
    def __init__(self, style_ttl_onnx: np.ndarray, style_dp_onnx: np.ndarray):
        self.ttl = style_ttl_onnx
        self.dp = style_dp_onnx


class TextToSpeech:
    def __init__(
        self,
        cfgs: dict,
        text_processor: UnicodeProcessor,
        dp_ort: ort.InferenceSession,
        text_enc_ort: ort.InferenceSession,
        vector_est_ort: ort.InferenceSession,
        vocoder_ort: ort.InferenceSession,
    ):
        self.cfgs = cfgs
        self.text_processor = text_processor
        self.dp_ort = dp_ort
        self.text_enc_ort = text_enc_ort
        self.vector_est_ort = vector_est_ort
        self.vocoder_ort = vocoder_ort
        self.sample_rate = cfgs["ae"]["sample_rate"]
        self.base_chunk_size = cfgs["ae"]["base_chunk_size"]
        self.chunk_compress_factor = cfgs["ttl"]["chunk_compress_factor"]
        self.ldim = cfgs["ttl"]["latent_dim"]

    def sample_noisy_latent(
        self, duration: np.ndarray
    ) -> tuple[np.ndarray, np.ndarray]:
        bsz = len(duration)
        wav_len_max = duration.max() * self.sample_rate
        wav_lengths = (duration * self.sample_rate).astype(np.int64)
        chunk_size = self.base_chunk_size * self.chunk_compress_factor
        latent_len = ((wav_len_max + chunk_size - 1) / chunk_size).astype(np.int32)
        latent_dim = self.ldim * self.chunk_compress_factor
        noisy_latent = np.random.randn(bsz, latent_dim, latent_len).astype(np.float32)
        latent_mask = get_latent_mask(
            wav_lengths, self.base_chunk_size, self.chunk_compress_factor
        )

        noisy_latent = noisy_latent * latent_mask
        return noisy_latent, latent_mask

    def _infer(
        self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
    ) -> tuple[np.ndarray, np.ndarray]:
        assert (
            len(text_list) == style.ttl.shape[0]
        ), "Number of texts must match number of style vectors"
        bsz = len(text_list)
        text_ids, text_mask = self.text_processor(text_list)
        dur_onnx, *_ = self.dp_ort.run(
            None, {"text_ids": text_ids, "style_dp": style.dp, "text_mask": text_mask}
        )
        dur_onnx = dur_onnx / speed
        text_emb_onnx, *_ = self.text_enc_ort.run(
            None,
            {"text_ids": text_ids, "style_ttl": style.ttl, "text_mask": text_mask},
        )  # dur_onnx: [bsz]
        xt, latent_mask = self.sample_noisy_latent(dur_onnx)
        total_step_np = np.array([total_step] * bsz, dtype=np.float32)
        for step in range(total_step):
            current_step = np.array([step] * bsz, dtype=np.float32)
            xt, *_ = self.vector_est_ort.run(
                None,
                {
                    "noisy_latent": xt,
                    "text_emb": text_emb_onnx,
                    "style_ttl": style.ttl,
                    "text_mask": text_mask,
                    "latent_mask": latent_mask,
                    "current_step": current_step,
                    "total_step": total_step_np,
                },
            )
        wav, *_ = self.vocoder_ort.run(None, {"latent": xt})
        return wav, dur_onnx

    def __call__(
        self,
        text: str,
        style: Style,
        total_step: int,
        speed: float = 1.05,
        silence_duration: float = 0.3,
    ) -> tuple[np.ndarray, np.ndarray]:
        assert (
            style.ttl.shape[0] == 1
        ), "Single speaker text to speech only supports single style"
        text_list = chunk_text(text)
        wav_cat = None
        dur_cat = None
        for text in text_list:
            wav, dur_onnx = self._infer([text], style, total_step, speed)
            if wav_cat is None:
                wav_cat = wav
                dur_cat = dur_onnx
            else:
                silence = np.zeros(
                    (1, int(silence_duration * self.sample_rate)), dtype=np.float32
                )
                wav_cat = np.concatenate([wav_cat, silence, wav], axis=1)
                dur_cat += dur_onnx + silence_duration
        return wav_cat, dur_cat

    def stream(
        self,
        text: str,
        style: Style,
        total_step: int,
        speed: float = 1.05,
        silence_duration: float = 0.3,
    ):
        assert (
            style.ttl.shape[0] == 1
        ), "Single speaker text to speech only supports single style"
        text_list = chunk_text(text)

        for i, text in enumerate(text_list):
            wav, _ = self._infer([text], style, total_step, speed)
            yield wav.flatten()

            if i < len(text_list) - 1:
                silence = np.zeros(
                    (int(silence_duration * self.sample_rate),), dtype=np.float32
                )
                yield silence

    def batch(
        self, text_list: list[str], style: Style, total_step: int, speed: float = 1.05
    ) -> tuple[np.ndarray, np.ndarray]:
        return self._infer(text_list, style, total_step, speed)


def length_to_mask(lengths: np.ndarray, max_len: Optional[int] = None) -> np.ndarray:
    """
    Convert lengths to binary mask.

    Args:
        lengths: (B,)
        max_len: int

    Returns:
        mask: (B, 1, max_len)
    """
    max_len = max_len or lengths.max()
    ids = np.arange(0, max_len)
    mask = (ids < np.expand_dims(lengths, axis=1)).astype(np.float32)
    return mask.reshape(-1, 1, max_len)


def get_latent_mask(
    wav_lengths: np.ndarray, base_chunk_size: int, chunk_compress_factor: int
) -> np.ndarray:
    latent_size = base_chunk_size * chunk_compress_factor
    latent_lengths = (wav_lengths + latent_size - 1) // latent_size
    latent_mask = length_to_mask(latent_lengths)
    return latent_mask


def load_onnx(
    onnx_path: str, opts: ort.SessionOptions, providers: list[str]
) -> ort.InferenceSession:
    return ort.InferenceSession(onnx_path, sess_options=opts, providers=providers)


def load_onnx_all(
    onnx_dir: str, opts: ort.SessionOptions, providers: list[str]
) -> tuple[
    ort.InferenceSession,
    ort.InferenceSession,
    ort.InferenceSession,
    ort.InferenceSession,
]:
    dp_onnx_path = os.path.join(onnx_dir, "duration_predictor.onnx")
    text_enc_onnx_path = os.path.join(onnx_dir, "text_encoder.onnx")
    vector_est_onnx_path = os.path.join(onnx_dir, "vector_estimator.onnx")
    vocoder_onnx_path = os.path.join(onnx_dir, "vocoder.onnx")

    dp_ort = load_onnx(dp_onnx_path, opts, providers)
    text_enc_ort = load_onnx(text_enc_onnx_path, opts, providers)
    vector_est_ort = load_onnx(vector_est_onnx_path, opts, providers)
    vocoder_ort = load_onnx(vocoder_onnx_path, opts, providers)
    return dp_ort, text_enc_ort, vector_est_ort, vocoder_ort


def load_cfgs(onnx_dir: str) -> dict:
    cfg_path = os.path.join(onnx_dir, "tts.json")
    with open(cfg_path, "r") as f:
        cfgs = json.load(f)
    return cfgs


def load_text_processor(onnx_dir: str) -> UnicodeProcessor:
    unicode_indexer_path = os.path.join(onnx_dir, "unicode_indexer.json")
    text_processor = UnicodeProcessor(unicode_indexer_path)
    return text_processor


def load_text_to_speech(onnx_dir: str, use_gpu: bool = False) -> TextToSpeech:
    opts = ort.SessionOptions()
    if use_gpu:
        raise NotImplementedError("GPU mode is not fully tested")
    else:
        providers = ["CPUExecutionProvider"]
        print("Using CPU for inference")
    cfgs = load_cfgs(onnx_dir)
    dp_ort, text_enc_ort, vector_est_ort, vocoder_ort = load_onnx_all(
        onnx_dir, opts, providers
    )
    text_processor = load_text_processor(onnx_dir)
    return TextToSpeech(
        cfgs, text_processor, dp_ort, text_enc_ort, vector_est_ort, vocoder_ort
    )


def load_voice_style(voice_style_paths: list[str], verbose: bool = False) -> Style:
    bsz = len(voice_style_paths)

    # Read first file to get dimensions
    with open(voice_style_paths[0], "r") as f:
        first_style = json.load(f)
    ttl_dims = first_style["style_ttl"]["dims"]
    dp_dims = first_style["style_dp"]["dims"]

    # Pre-allocate arrays with full batch size
    ttl_style = np.zeros([bsz, ttl_dims[1], ttl_dims[2]], dtype=np.float32)
    dp_style = np.zeros([bsz, dp_dims[1], dp_dims[2]], dtype=np.float32)

    # Fill in the data
    for i, voice_style_path in enumerate(voice_style_paths):
        with open(voice_style_path, "r") as f:
            voice_style = json.load(f)

        ttl_data = np.array(
            voice_style["style_ttl"]["data"], dtype=np.float32
        ).flatten()
        ttl_style[i] = ttl_data.reshape(ttl_dims[1], ttl_dims[2])

        dp_data = np.array(
            voice_style["style_dp"]["data"], dtype=np.float32
        ).flatten()
        dp_style[i] = dp_data.reshape(dp_dims[1], dp_dims[2])

    if verbose:
        print(f"Loaded {bsz} voice styles")
    return Style(ttl_style, dp_style)


@contextmanager
def timer(name: str):
    start = time.time()
    print(f"{name}...")
    yield
    print(f" -> {name} completed in {time.time() - start:.2f} sec")


def sanitize_filename(text: str, max_len: int) -> str:
    """Sanitize filename by replacing non-alphanumeric characters with underscores"""
    prefix = text[:max_len]
    return re.sub(r"[^a-zA-Z0-9]", "_", prefix)


def chunk_text(text: str, max_len: int = 300) -> list[str]:
    """
    Split text into chunks by paragraphs and sentences.

    Args:
        text: Input text to chunk
        max_len: Maximum length of each chunk (default: 300)

    Returns:
        List of text chunks
    """
    # Split by paragraph (two or more newlines)
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n+", text.strip()) if p.strip()]

    chunks = []

    for paragraph in paragraphs:
        paragraph = paragraph.strip()
        if not paragraph:
            continue

        # Split by sentence boundaries (period, question mark, exclamation mark followed by space)
        # But exclude common abbreviations like Mr., Mrs., Dr., etc. and single capital letters like F.
        pattern = r"(?<!Mr\.)(?<!Mrs\.)(?<!Ms\.)(?<!Dr\.)(?<!Prof\.)(?<!Sr\.)(?<!Jr\.)(?<!Ph\.D\.)(?<!etc\.)(?<!e\.g\.)(?<!i\.e\.)(?<!vs\.)(?<!Inc\.)(?<!Ltd\.)(?<!Co\.)(?<!Corp\.)(?<!St\.)(?<!Ave\.)(?<!Blvd\.)(?<!\b[A-Z]\.)(?<=[.!?])\s+"
        sentences = re.split(pattern, paragraph)

        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk) + len(sentence) + 1 <= max_len:
                current_chunk += (" " if current_chunk else "") + sentence
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence

        if current_chunk:
            chunks.append(current_chunk.strip())

    return chunks
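Taken together, the public surface of helper.py is load_text_to_speech, load_voice_style, and the TextToSpeech __call__/stream/batch methods. A minimal end-to-end sketch, assuming the assets are already in place; total_step (the number of flow-matching refinement iterations) is an illustrative choice, not a value fixed by this repo:

import soundfile as sf
from helper import load_text_to_speech, load_voice_style, timer

tts = load_text_to_speech("assets/onnx", use_gpu=False)
style = load_voice_style(["assets/voice_styles/F1.json"], verbose=True)

with timer("Synthesis"):
    # total_step=8 is illustrative; more steps means more denoising iterations.
    wav, dur = tts("Hello from Supertonic!", style, total_step=8)

sf.write("output.wav", wav.flatten(), tts.sample_rate)  # 44.1 kHz mono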
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio
numpy>=1.26.0
onnxruntime==1.23.1
soundfile>=0.12.1
librosa>=0.10.0
PyYAML>=6.0
huggingface_hub