import gradio as gr

LAST_UPDATED = "06/12/2024"

####################################
# Static leaderboard data
####################################
leaderboard_data = [
    {'name': 'StyleTTS 2', 'PESQ': 3.921, 'WER': 0.162, 'UTMOS': 2.42, 'SpeechBERT': 0, 'Logf0': 0},
    {'name': 'Matxa-TTS', 'PESQ': 3.539, 'WER': 0.179, 'UTMOS': 3.50, 'SpeechBERT': 0, 'Logf0': 0},
    {'name': 'Matxa-TTS-multiaccent', 'PESQ': 3.415, 'WER': 0.242, 'UTMOS': 2.98, 'SpeechBERT': 0, 'Logf0': 0},
    {'name': 'StableTTS', 'PESQ': 3.643, 'WER': 0.164, 'UTMOS': 2.62, 'SpeechBERT': 0.7837, 'Logf0': 0.3831},
    {'name': 'Vits 2', 'PESQ': 0, 'WER': 0, 'UTMOS': 3.61, 'SpeechBERT': 0, 'Logf0': 0},
]

# Text for the metrics tab
METRICS_TAB_TEXT = """
## Metrics

Models in the leaderboard are evaluated using several key metrics:

* **UTMOS** (UTokyo-SaruLab Mean Opinion Score),
* **WER** (Word Error Rate),
* **PESQ** (Perceptual Evaluation of Speech Quality).

These metrics help evaluate both the accuracy and the quality of the models.

### UTMOS (UTokyo-SaruLab Mean Opinion Score) [[Paper](https://arxiv.org/abs/2204.02152)]
UTMOS is a MOS prediction system. **A higher UTMOS indicates better quality** of the generated voice.

### WER (Word Error Rate)
WER is a common metric for evaluating speech recognition systems. It measures the percentage of words in the generated transcript that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.

Example:

| Reference  | the | cat | sat     | on  | the | mat |
|------------|-----|-----|---------|-----|-----|-----|
| Prediction | the | cat | **sit** | on  | the |     |
| Label      | ✅  | ✅  | S       | ✅  | ✅  | D   |

The WER is calculated as follows:
```
WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
```
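
For illustration, the WER above can be reproduced with the open-source `jiwer` package (a minimal sketch; the leaderboard uses its own evaluation pipeline):

```python
# Hypothetical example: recompute the WER from the table above with jiwer.
import jiwer

reference = "the cat sat on the mat"
prediction = "the cat sit on the"  # one substitution ("sit") and one deletion ("mat")

print(f"WER = {jiwer.wer(reference, prediction):.3f}")  # 0.333
```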

Moreover, we calculate the WER using the STT_Ca_Citrinet_512 model. [[Link](https://langtech-bsc.gitbook.io/aina-kit/aina-hack/automatic-speech-recognition)]

### PESQ (Perceptual Evaluation of Speech Quality) [[Paper](https://ieeexplore.ieee.org/abstract/document/941023?casa_token=jdtHy84_KhQAAAAA:qHN3WbT6cNdufj6OOn_fn0Je0RedMv-WJCmhQ_3CWy4nMTuDvFMF3KstAmKqLx5suQwdPgGByoY)]
PESQ is a perceptual metric that evaluates the quality of speech in a similar manner to how a human listener would. **A higher PESQ indicates better voice quality**.
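
As a rough sketch (not necessarily the exact pipeline used for this leaderboard), wideband PESQ can be computed with the `pesq` PyPI package, given a reference recording and the corresponding synthesized audio, both mono and resampled to 16 kHz:

```python
# Hypothetical example: the file paths below are placeholders.
import soundfile as sf
from pesq import pesq

reference, sr = sf.read("reference_16k.wav")     # ground-truth recording (16 kHz mono)
synthesized, _ = sf.read("synthesized_16k.wav")  # TTS output for the same sentence (16 kHz mono)

score = pesq(sr, reference, synthesized, 'wb')   # 'wb' selects wideband PESQ
print(f"PESQ (wb) = {score:.3f}")
```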

## Benchmark Datasets
Model performance is evaluated using [our test datasets](https://huggingface.co/spaces/projecte-aina/catalan_tts_arena/blob/main/catalan_benchmark_v1.txt). These datasets cover a variety of domains and acoustic conditions, ensuring a robust evaluation.
"""

CITATION_TEXT = """@misc{catalan-tts-arena,
    title = {Catalan Text-to-Speech Leaderboard},
    author = {Rodolfo Zevallos and José Giraldo and Alex Peiró-Lilja and Carme Armentano-Oller},
    year = 2024,
    publisher = {Hugging Face},
    howpublished = "\\url{https://huggingface.co/spaces/projecte-aina/catalan_tts_arena}"
}
"""

DESCR = """
# 🏆 Catalan TTS Arena: Benchmarking TTS Models
\nThe Catalan TTS Leaderboard ranks and evaluates TTS models in Catalan.
\nThe leaderboard currently focuses on Catalan TTS, and will be expanded to multilingual evaluation in later versions.
""".strip()

####################################
# Functions (static version)
####################################
def get_leaderboard():
    """
    Returns the leaderboard rows sorted in descending order by UTMOS.
    """
    # Sort by UTMOS (predicted perceptual quality), highest first
    sorted_leaderboard = sorted(leaderboard_data, key=lambda x: x['UTMOS'], reverse=True)
    # Assign ranks based on the sorted order
    for rank, model in enumerate(sorted_leaderboard):
        model['rank'] = rank + 1  # rank is the 1-indexed position in the list
    return [
        [model['rank'], model['name'], model['UTMOS'], model['WER'], model['PESQ'], model['SpeechBERT'], model['Logf0']]
        for model in sorted_leaderboard
    ]

####################################
# Gradio interface
####################################
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

with gr.Blocks(theme=theme) as demo:
    gr.Markdown(DESCR, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.DataFrame(
                headers=["Rank", "Model", "UTMOS ⬆️", "WER ⬇️", "PESQ", "SpeechBERT ⬆️", "Logf0 ⬆️"],
                datatype=["str", "str", "str", "str", "str", "str", "str"],
                value=get_leaderboard(),  # load the initial table data
            )
| with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1): | |
| gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text") | |
| gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text") | |
| with gr.Row(): | |
| with gr.Accordion("📙 Citation", open=False): | |
| gr.Textbox( | |
| value=CITATION_TEXT, lines=7, | |
| label="Copy the BibTeX snippet to cite this source", | |
| elem_id="citation-button", | |
| show_copy_button=True, | |
| ) | |
# Launch the application
demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False)