# ui/manual_tab.py
"""
Builds the manual-evaluation tab: BLEU, BLEURT, ROUGE-L, and BERTScore
for a single reference/generated text pair.
"""
import gradio as gr

from metrics import (
    compute_bleu_single,
    compute_bleurt_single,
    compute_rouge_single,      # kept for backwards-compat / if the user only wants a single number
    compute_bertscore_single,  # kept for backwards-compat (F1)
    BERT_FRIENDLY_TO_MODEL,
)
# Import triplet helpers (P, R, F1)
from metrics.rouge import rougeL_prec_rec_f1
from metrics.bertscore import bertscore_prec_rec_f1
from ui.common import toggle_manual_visibility
from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
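
# MetricCheckboxGroup and BertCheckboxGroup are assumed to be thin wrappers around
# gr.CheckboxGroup: the former exposes the metric names checked in compute_manual
# ("BLEU", "BLEURT", "ROUGE", "BERTSCORE"), the latter the friendly BERT model
# names that key BERT_FRIENDLY_TO_MODEL.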


def build_manual_tab():
    gr.HTML("""
    <style>
      #run-eval-btn button {
        background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%) !important;
        color: #fff !important;
        border: none !important;
        box-shadow: 0 6px 16px rgba(0,0,0,.25);
      }
      #run-eval-btn button:hover { filter: brightness(1.08); transform: translateY(-1px); }
    </style>
    """)

    with gr.Blocks() as tab:
        gr.Markdown("## Manual Evaluation")
        gr.Markdown("Compute selected metrics (BLEU, BLEURT, ROUGE, BERTScore) for a single pair of texts.")

        with gr.Row():
            reference_input = gr.Textbox(label="Reference Text", lines=3)
            generated_input = gr.Textbox(label="Generated Text", lines=3)

        metric_selector = MetricCheckboxGroup()
        bert_model_selector = BertCheckboxGroup()

        with gr.Row():
            run_btn = gr.Button("🚀 Run Evaluation", variant="primary", elem_id="run-eval-btn")
            clear_btn = gr.Button("Clear")

        bleu_out = gr.Textbox(label="BLEU Score", interactive=False)
        bleurt_out = gr.Textbox(label="BLEURT Score", interactive=False)
        rouge_out = gr.Textbox(label="ROUGE Scores", interactive=False)
        bert_out = gr.Textbox(label="BERTScore Results", interactive=False)

        def compute_manual(reference, generated, metrics, berts):
            # BLEU (single scalar, as before)
            bleu = compute_bleu_single(reference, generated) if "BLEU" in metrics else ""

            # BLEURT (single scalar, as before)
            bleurt = compute_bleurt_single(reference, generated) if "BLEURT" in metrics else ""

            # ROUGE — show P/R/F1 each on its own line (no special styling)
            rouge = ""
            if "ROUGE" in metrics:
                try:
                    # order: pred, ref
                    p, r, f1 = rougeL_prec_rec_f1(generated, reference)
                    rouge = f"ROUGE-L Precision: {p:.4f}\nROUGE-L Recall: {r:.4f}\nROUGE-L F1: {f1:.4f}"
                except Exception:
                    # fallback to legacy single score, in case the triplet helper isn't available
                    rouge = compute_rouge_single(reference, generated)

            # BERTScore — for each selected model, show P/R/F1 each on its own line
            bertscore = ""
            if "BERTSCORE" in metrics and berts:
                parts = []
                for friendly in berts:
                    model_id = BERT_FRIENDLY_TO_MODEL[friendly]
                    try:
                        p, r, f1 = bertscore_prec_rec_f1(reference, generated, model_id)
                        parts.append(
                            f"{friendly} Precision: {p:.4f}\n"
                            f"{friendly} Recall: {r:.4f}\n"
                            f"{friendly} F1: {f1:.4f}"
                        )
                    except Exception:
                        # fallback: only F1 via the existing single-score helper
                        f1_only = compute_bertscore_single(reference, generated, model_id, per_section=False)
                        if f1_only is None:
                            parts.append(f"{friendly}: error")
                        else:
                            parts.append(f"{friendly} F1: {f1_only:.4f}")
                bertscore = "\n\n".join(parts)

            return bleu, bleurt, rouge, bertscore

        run_btn.click(
            fn=compute_manual,
            inputs=[reference_input, generated_input, metric_selector, bert_model_selector],
            outputs=[bleu_out, bleurt_out, rouge_out, bert_out],
        )

        metric_selector.change(
            fn=toggle_manual_visibility,
            inputs=[metric_selector],
            outputs=[bleu_out, bleurt_out, rouge_out, bert_out, bert_model_selector],
        )

        clear_btn.click(
            # Returned values must line up one-to-one with `outputs`: clear both
            # inputs and all four score boxes, then reset the two selectors.
            fn=lambda: ("", "", "", "", "", "", ["BLEU"], [list(BERT_FRIENDLY_TO_MODEL.keys())[0]]),
            inputs=[],
            outputs=[reference_input, generated_input, bleu_out, bleurt_out, rouge_out, bert_out, metric_selector, bert_model_selector],
        )

    return tab
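

# Usage sketch (illustrative only; the actual wiring lives in the app's entry
# point, which this module does not define). build_manual_tab() is meant to be
# called from inside an outer Gradio context, for example:
#
#     import gradio as gr
#     from ui.manual_tab import build_manual_tab
#
#     with gr.Blocks() as demo:
#         with gr.Tab("Manual Evaluation"):
#             build_manual_tab()
#
#     demo.launch()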