Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# ruff: noqa: E402
|
| 2 |
import gc
|
| 3 |
import json
|
| 4 |
import re
|
|
@@ -58,10 +57,10 @@ DEFAULT_TTS_MODEL_CFG = [
|
|
| 58 |
json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
|
| 59 |
]
|
| 60 |
|
| 61 |
-
# Конфигурация для F5-
|
| 62 |
RUSSIAN_TTS_MODEL_CFG = [
|
| 63 |
-
"hf://
|
| 64 |
-
"hf://
|
| 65 |
json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
|
| 66 |
]
|
| 67 |
|
|
@@ -102,9 +101,9 @@ except Exception as e:
|
|
| 102 |
|
| 103 |
try:
|
| 104 |
F5TTS_russian_model = load_f5tts_russian()
|
| 105 |
-
print("F5-
|
| 106 |
except Exception as e:
|
| 107 |
-
print(f"Failed to load F5-
|
| 108 |
F5TTS_russian_model = None
|
| 109 |
|
| 110 |
E2TTS_ema_model = load_e2tts() if USING_SPACES else None
|
|
@@ -159,7 +158,7 @@ def infer(
|
|
| 159 |
# Выбор модели в зависимости от языка
|
| 160 |
if language == "ru":
|
| 161 |
if F5TTS_russian_model is None:
|
| 162 |
-
gr.Warning("F5-
|
| 163 |
return None, None, ref_text
|
| 164 |
ema_model = F5TTS_russian_model
|
| 165 |
else:
|
|
@@ -215,7 +214,7 @@ with gr.Blocks() as app_credits:
|
|
| 215 |
* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
|
| 216 |
* [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
|
| 217 |
* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
|
| 218 |
-
* [
|
| 219 |
""")
|
| 220 |
|
| 221 |
with gr.Blocks() as app_tts:
|
|
@@ -710,8 +709,8 @@ with gr.Blocks() as app:
|
|
| 710 |
This is {"a local web UI for [F5 TTS](https://github.com/SWivid/F5-TTS)" if not USING_SPACES else "an online demo for [F5-TTS](https://github.com/SWivid/F5-TTS)"} with advanced batch processing support. This app supports the following TTS models:
|
| 711 |
* [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching) for English and Chinese
|
| 712 |
* [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS) for English and Chinese
|
| 713 |
-
* [F5-
|
| 714 |
-
The checkpoints support English, Chinese, and Russian (via F5-
|
| 715 |
If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
|
| 716 |
**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
|
| 717 |
"""
|
|
|
|
|
|
|
| 1 |
import gc
|
| 2 |
import json
|
| 3 |
import re
|
|
|
|
| 57 |
json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
|
| 58 |
]
|
| 59 |
|
| 60 |
+
# Конфигурация для F5-TTS_RUSSIAN (русский)
|
| 61 |
RUSSIAN_TTS_MODEL_CFG = [
|
| 62 |
+
"hf://Misha24-10/F5-TTS_RUSSIAN/F5TTS_v1_Base/model.safetensors",
|
| 63 |
+
"hf://Misha24-10/F5-TTS_RUSSIAN/vocab.txt",
|
| 64 |
json.dumps(dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)),
|
| 65 |
]
|
| 66 |
|
|
|
|
| 101 |
|
| 102 |
try:
|
| 103 |
F5TTS_russian_model = load_f5tts_russian()
|
| 104 |
+
print("F5-TTS_RUSSIAN loaded successfully.")
|
| 105 |
except Exception as e:
|
| 106 |
+
print(f"Failed to load F5-TTS_RUSSIAN: {str(e)}")
|
| 107 |
F5TTS_russian_model = None
|
| 108 |
|
| 109 |
E2TTS_ema_model = load_e2tts() if USING_SPACES else None
|
|
|
|
| 158 |
# Выбор модели в зависимости от языка
|
| 159 |
if language == "ru":
|
| 160 |
if F5TTS_russian_model is None:
|
| 161 |
+
gr.Warning("F5-TTS_RUSSIAN model failed to load. Cannot generate Russian audio.")
|
| 162 |
return None, None, ref_text
|
| 163 |
ema_model = F5TTS_russian_model
|
| 164 |
else:
|
|
|
|
| 214 |
* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
|
| 215 |
* [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
|
| 216 |
* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
|
| 217 |
+
* [Misha24-10](https://huggingface.co/Misha24-10) for the [F5-TTS_RUSSIAN](https://huggingface.co/Misha24-10/F5-TTS_RUSSIAN) model
|
| 218 |
""")
|
| 219 |
|
| 220 |
with gr.Blocks() as app_tts:
|
|
|
|
| 709 |
This is {"a local web UI for [F5 TTS](https://github.com/SWivid/F5-TTS)" if not USING_SPACES else "an online demo for [F5-TTS](https://github.com/SWivid/F5-TTS)"} with advanced batch processing support. This app supports the following TTS models:
|
| 710 |
* [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching) for English and Chinese
|
| 711 |
* [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS) for English and Chinese
|
| 712 |
+
* [F5-TTS_RUSSIAN](https://huggingface.co/Misha24-10/F5-TTS_RUSSIAN) by [Misha24-10](https://huggingface.co/Misha24-10) for Russian
|
| 713 |
+
The checkpoints support English, Chinese, and Russian (via F5-TTS_RUSSIAN, licensed under CC-BY-NC-SA-4.0).
|
| 714 |
If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 12s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
|
| 715 |
**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<12s). Ensure the audio is fully uploaded before generating.**
|
| 716 |
"""
|