Spaces:
Sleeping
Sleeping
| # ui/manual_tab.py | |
| """ | |
| Builds the manual-evaluation tab (single pair metrics). | |
| """ | |
| import gradio as gr | |
| from metrics import ( | |
| compute_bleu_single, | |
| compute_bleurt_single, | |
| compute_rouge_single, # kept for backwards-compat / if user only wants a single number | |
| compute_bertscore_single, # kept for backwards-compat (F1) | |
| BERT_FRIENDLY_TO_MODEL, | |
| ) | |
| # Import triplet helpers (P, R, F1) | |
| from metrics.rouge import rougeL_prec_rec_f1 | |
| from metrics.bertscore import bertscore_prec_rec_f1 | |
| from ui.common import toggle_manual_visibility | |
| from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup | |
def _format_prf(label, precision, recall, f1):
    """Render a precision/recall/F1 triplet as three labelled lines."""
    return (
        f"{label} Precision: {precision:.4f}\n"
        f"{label} Recall: {recall:.4f}\n"
        f"{label} F1: {f1:.4f}"
    )


def build_manual_tab():
    """Build the manual-evaluation tab (metrics for a single text pair).

    Creates two input textboxes (reference / generated), the metric and
    BERT-model checkbox groups, "Run" and "Clear" buttons, and one read-only
    output textbox per metric, then wires up three events:

    * Run   -> computes the selected metrics and fills the output boxes.
    * metric selector change -> delegates to ``toggle_manual_visibility``
      (defined in ``ui.common``; presumably shows/hides the outputs and the
      BERT selector -- confirm against that helper).
    * Clear -> empties both inputs and all four outputs and resets the
      selectors to their defaults.

    Returns:
        The ``gr.Blocks`` container holding the tab's components.
    """
    # Styling for the "Run Evaluation" button (targeted through its elem_id).
    gr.HTML("""
    <style>
    #run-eval-btn button {
    background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%) !important;
    color: #fff !important;
    border: none !important;
    box-shadow: 0 6px 16px rgba(0,0,0,.25);
    }
    #run-eval-btn button:hover { filter: brightness(1.08); transform: translateY(-1px); }
    </style>
    """)

    with gr.Blocks() as tab:
        gr.Markdown("## Manual Evaluation")
        gr.Markdown("Compute selected metrics (BLEU, BLEURT, ROUGE, BERTScore) for a single pair of texts.")

        with gr.Row():
            reference_input = gr.Textbox(label="Reference Text", lines=3)
            generated_input = gr.Textbox(label="Generated Text", lines=3)

        metric_selector = MetricCheckboxGroup()
        bert_model_selector = BertCheckboxGroup()

        with gr.Row():
            run_btn = gr.Button("🚀 Run Evaluation", variant="primary", elem_id="run-eval-btn")
            clear_btn = gr.Button("Clear")

        bleu_out = gr.Textbox(label="BLEU Score", interactive=False)
        bleurt_out = gr.Textbox(label="BLEURT Score", interactive=False)
        rouge_out = gr.Textbox(label="ROUGE Scores ", interactive=False)
        bert_out = gr.Textbox(label="BERTScore Results ", interactive=False)

        def compute_manual(reference, generated, metrics, berts):
            """Compute the selected metrics for one reference/generated pair.

            Args:
                reference: Reference text.
                generated: Generated (candidate) text.
                metrics: Selected metric names ("BLEU", "BLEURT", "ROUGE",
                    "BERTSCORE"); ``None`` is treated as no selection.
                berts: Selected friendly BERT model names (keys of
                    ``BERT_FRIENDLY_TO_MODEL``).

            Returns:
                A 4-tuple ``(bleu, bleurt, rouge, bertscore)``. Each entry is
                an empty string when its metric was not selected.
            """
            metrics = metrics or []

            # BLEU / BLEURT: single scalar each.
            bleu = compute_bleu_single(reference, generated) if "BLEU" in metrics else ""
            bleurt = compute_bleurt_single(reference, generated) if "BLEURT" in metrics else ""

            # ROUGE: P/R/F1, each on its own line.
            rouge = ""
            if "ROUGE" in metrics:
                try:
                    # Argument order for this helper is (prediction, reference).
                    p, r, f1 = rougeL_prec_rec_f1(generated, reference)
                    rouge = _format_prf("ROUGE-L", p, r, f1)
                except Exception:
                    # Best-effort fallback to the legacy single-score helper.
                    rouge = compute_rouge_single(reference, generated)

            # BERTScore: one P/R/F1 section per selected model, blank-line separated.
            bertscore = ""
            if "BERTSCORE" in metrics and berts:
                parts = []
                for friendly in berts:
                    model_id = BERT_FRIENDLY_TO_MODEL[friendly]
                    try:
                        p, r, f1 = bertscore_prec_rec_f1(reference, generated, model_id)
                        parts.append(_format_prf(friendly, p, r, f1))
                    except Exception:
                        # Fallback: F1 only, via the existing single-score helper.
                        f1_only = compute_bertscore_single(reference, generated, model_id, per_section=False)
                        if f1_only is None:
                            parts.append(f"{friendly}: error")
                        else:
                            parts.append(f"{friendly} F1: {f1_only:.4f}")
                bertscore = "\n\n".join(parts)

            return bleu, bleurt, rouge, bertscore

        def reset_manual():
            """Return cleared values for every component the Clear button resets.

            The tuple order must match the ``outputs`` list of
            ``clear_btn.click`` below, one value per component.
            """
            # First configured BERT model, or an empty selection if none exist.
            default_berts = list(BERT_FRIENDLY_TO_MODEL)[:1]
            return "", "", "", "", "", "", ["BLEU"], default_berts

        run_btn.click(
            fn=compute_manual,
            inputs=[reference_input, generated_input, metric_selector, bert_model_selector],
            outputs=[bleu_out, bleurt_out, rouge_out, bert_out],
        )

        metric_selector.change(
            fn=toggle_manual_visibility,
            inputs=[metric_selector],
            outputs=[bleu_out, bleurt_out, rouge_out, bert_out, bert_model_selector],
        )

        # Eight outputs <-> eight values from reset_manual (bert_out included so
        # the BERTScore box is cleared along with the other results).
        clear_btn.click(
            fn=reset_manual,
            inputs=[],
            outputs=[
                reference_input,
                generated_input,
                bleu_out,
                bleurt_out,
                rouge_out,
                bert_out,
                metric_selector,
                bert_model_selector,
            ],
        )

    return tab