# data-ai-llm-eval-app / utils /tokenizer_refgen.py
# Guilherme
# Add utils modules to package for Space
# 6352550
import unicodedata
import re
import difflib
# Public names of this module, i.e. what `from <module> import *` exports.
__all__ = ["strip_accents", "tokenize_for_diff", "generate_diff_html"]
# -------------------------------------------------------------------
# Diff helpers (can be moved to another file if desired)
# -------------------------------------------------------------------
def strip_accents(text: str) -> str:
    """
    Return a copy of ``text`` with diacritical marks removed.

    The text is decomposed with Unicode NFKD normalization, so an accented
    letter becomes its base letter followed by combining marks, and every
    character with a non-zero combining class is then dropped. NFKD is a
    compatibility decomposition, so compatibility characters such as
    ligatures are expanded as well. The argument itself is left untouched,
    which lets callers keep the original text for display.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # unicodedata.combining() returns 0 for base (non-combining) characters.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def tokenize_for_diff(text: str):
    """
    Split ``text`` into word tokens for diffing.

    Section headers of the form '## S:', '## O:', '## A:' or '## P:'
    (any letter case, optional whitespace after the hashes) are removed first.
    The remaining text is split into runs of word characters, so punctuation
    and whitespace never become tokens.

    Returns a pair of parallel lists ``(orig_tokens, norm_tokens)``: the
    tokens exactly as they appear in the text, and the same tokens with
    accents stripped and lowercased, intended for matching.
    """
    header_pattern = re.compile(r"##\s*[SOAP]:", flags=re.IGNORECASE)
    word_pattern = re.compile(r"\b\w+\b", flags=re.UNICODE)

    body = header_pattern.sub("", text)
    originals = word_pattern.findall(body)
    normalized = [strip_accents(token).lower() for token in originals]
    return originals, normalized
def generate_diff_html(ref_txt: str, gen_txt: str) -> str:
    """
    Build a side-by-side, token-level HTML diff of two texts.

    Both texts are tokenized with ``tokenize_for_diff`` and aligned on the
    normalized tokens, so the comparison ignores case and accents, while the
    original tokens are the ones displayed. Tokens found only in the
    reference are highlighted in dark red in the "Reference" column; tokens
    found only in the generated text are highlighted in gold in the
    "Generated" column. Tokens are joined with single spaces.

    Args:
        ref_txt: Reference text.
        gen_txt: Generated text to compare against the reference.

    Returns:
        An HTML fragment (a flex container holding the two columns).
    """
    ref_orig, ref_norm = tokenize_for_diff(ref_txt)
    gen_orig, gen_norm = tokenize_for_diff(gen_txt)

    # autojunk=False: by default difflib stops using a token as a match
    # anchor once the second sequence has 200+ items and the token makes up
    # more than ~1% of it. For long texts full of common words this turns
    # genuinely matching passages into spurious delete/insert highlights.
    matcher = difflib.SequenceMatcher(a=ref_norm, b=gen_norm, autojunk=False)

    def highlight(tok: str, background: str) -> str:
        # Tokens come from a \w+ regex match, so they cannot contain HTML
        # metacharacters and are interpolated without escaping.
        return (
            f'<span style="background:{background};color:#fff;padding:2px;'
            'border-radius:2px;margin:0 1px;display:inline-block;">'
            f'{tok}</span>'
        )

    ref_html, gen_html = [], []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            # Unchanged run: show each side's own spelling of the tokens.
            ref_html.extend(ref_orig[i1:i2])
            gen_html.extend(gen_orig[j1:j2])
            continue
        # A "replace" opcode contributes to both columns.
        if tag in ("delete", "replace"):
            ref_html.extend(highlight(tok, "darkred") for tok in ref_orig[i1:i2])
        if tag in ("insert", "replace"):
            gen_html.extend(highlight(tok, "#b8860b") for tok in gen_orig[j1:j2])

    ref_col = " ".join(ref_html)
    gen_col = " ".join(gen_html)
    return (
        '<div style="display:flex; gap:1rem; align-items:flex-start; '
        'font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,sans-serif;">'
        '<div style="flex:1; border:1px solid #444; padding:10px; border-radius:6px; background:#0f1218;">'
        '<div style="font-weight:600; margin-bottom:4px; color:#f0f0f0;">Reference</div>'
        f'<div style="line-height:1.3; color:#fff; word-break:break-word;">{ref_col}</div>'
        '</div>'
        '<div style="flex:1; border:1px solid #444; padding:10px; border-radius:6px; background:#0f1218;">'
        '<div style="font-weight:600; margin-bottom:4px; color:#f0f0f0;">Generated</div>'
        f'<div style="line-height:1.3; color:#fff; word-break:break-word;">{gen_col}</div>'
        '</div>'
        '</div>'
    )