# ui/manual_tab.py
"""

Builds the manual-evaluation tab (single pair metrics).

"""
import gradio as gr
from metrics import (
    compute_bleu_single,
    compute_bleurt_single,
    compute_rouge_single,       # kept for backwards-compat / if user only wants a single number
    compute_bertscore_single,   # kept for backwards-compat (F1)
    BERT_FRIENDLY_TO_MODEL,
)
# Import triplet helpers (P, R, F1)
from metrics.rouge import rougeL_prec_rec_f1
from metrics.bertscore import bertscore_prec_rec_f1
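# The triplet helpers above are assumed to return (precision, recall, f1) as
# plain floats; the signatures sketched below are inferred from how they are
# called in compute_manual, not taken from their actual definitions:
#
#     rougeL_prec_rec_f1(pred: str, ref: str) -> tuple[float, float, float]
#     bertscore_prec_rec_f1(ref: str, pred: str, model_id: str) -> tuple[float, float, float]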
from ui.common import toggle_manual_visibility
from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup


def build_manual_tab():
    """Create and return the manual-evaluation Blocks layout."""
    with gr.Blocks() as tab:
        # Inline CSS for the primary "Run Evaluation" button; placed inside the
        # Blocks so the styling is rendered together with the tab.
        gr.HTML("""
            <style>
              #run-eval-btn button {
                background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%) !important;
                color: #fff !important;
                border: none !important;
                box-shadow: 0 6px 16px rgba(0,0,0,.25);
              }
              #run-eval-btn button:hover { filter: brightness(1.08); transform: translateY(-1px); }
            </style>
            """)

        gr.Markdown("## Manual Evaluation")
        gr.Markdown("Compute the selected metrics (BLEU, BLEURT, ROUGE, BERTScore) for a single pair of texts.")

        with gr.Row():
            reference_input = gr.Textbox(label="Reference Text", lines=3)
            generated_input = gr.Textbox(label="Generated Text", lines=3)

        metric_selector = MetricCheckboxGroup()
        bert_model_selector = BertCheckboxGroup()

        with gr.Row():
            run_btn = gr.Button("🚀 Run Evaluation", variant="primary", elem_id="run-eval-btn")
            clear_btn = gr.Button("Clear")

        bleu_out = gr.Textbox(label="BLEU Score", interactive=False)
        bleurt_out = gr.Textbox(label="BLEURT Score", interactive=False)
        rouge_out = gr.Textbox(label="ROUGE Scores", interactive=False)
        bert_out = gr.Textbox(label="BERTScore Results", interactive=False)

        def compute_manual(reference, generated, metrics, berts):
            # BLEU (single scalar, as before)
            bleu = compute_bleu_single(reference, generated) if "BLEU" in metrics else ""

            # BLEURT (single scalar, as before)
            bleurt = compute_bleurt_single(reference, generated) if "BLEURT" in metrics else ""

            # ROUGE — show P/R/F1 each on its own line (no special styling)
            rouge = ""
            if "ROUGE" in metrics:
                try:
                    # order: pred, ref
                    p, r, f1 = rougeL_prec_rec_f1(generated, reference)
                    rouge = f"ROUGE-L Precision: {p:.4f}\nROUGE-L Recall: {r:.4f}\nROUGE-L F1: {f1:.4f}"
                except Exception:
                    # fallback to legacy single score, in case helper isn't available
                    rouge = compute_rouge_single(reference, generated)

            # BERTScore — for each selected model, show P/R/F1 each on its own line
            bertscore = ""
            if "BERTSCORE" in metrics and berts:
                parts = []
                for friendly in berts:
                    model_id = BERT_FRIENDLY_TO_MODEL[friendly]
                    try:
                        p, r, f1 = bertscore_prec_rec_f1(reference, generated, model_id)
                        parts.append(
                            f"{friendly} Precision: {p:.4f}\n"
                            f"{friendly} Recall: {r:.4f}\n"
                            f"{friendly} F1: {f1:.4f}"
                        )
                    except Exception:
                        # fallback: only F1 via the existing single helper
                        f1_only = compute_bertscore_single(reference, generated, model_id, per_section=False)
                        if f1_only is None:
                            parts.append(f"{friendly}: error")
                        else:
                            parts.append(f"{friendly} F1: {f1_only:.4f}")
                bertscore = "\n\n".join(parts)

            return bleu, bleurt, rouge, bertscore
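
        # Example of the returned tuple (illustrative values, not real output):
        # with metrics=["BLEU", "ROUGE"] and no BERT models selected, the call
        # yields roughly (bleu_value, "", "ROUGE-L Precision: ...\n...", "").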

        run_btn.click(
            fn=compute_manual,
            inputs=[reference_input, generated_input, metric_selector, bert_model_selector],
            outputs=[bleu_out, bleurt_out, rouge_out, bert_out],
        )
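        # toggle_manual_visibility (from ui.common) is assumed to return one
        # gr.update(...) per component listed in `outputs` below, showing or
        # hiding each result box and the BERT model selector according to the
        # selected metrics; this describes the expected contract, not its code.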
        metric_selector.change(
            fn=toggle_manual_visibility,
            inputs=[metric_selector],
            outputs=[bleu_out, bleurt_out, rouge_out, bert_out, bert_model_selector],
        )
        clear_btn.click(
            # Reset both inputs, all four outputs, and the two selectors.
            fn=lambda: ("", "", "", "", "", "", ["BLEU"], [list(BERT_FRIENDLY_TO_MODEL.keys())[0]]),
            inputs=[],
            outputs=[reference_input, generated_input, bleu_out, bleurt_out, rouge_out, bert_out, metric_selector, bert_model_selector],
        )

    return tab
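

# Minimal standalone launch for testing this tab in isolation. This is a sketch;
# the real application presumably combines the returned Blocks with other tabs
# (for example via gr.TabbedInterface), which is an assumption, not shown here.
if __name__ == "__main__":
    build_manual_tab().launch()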