# Spaces: gjoliveira / data-ai-llm-eval-app (Sleeping)
# File size: 3,139 Bytes
import unicodedata
import re
import difflib

# Public API: the names exported by a star-import of this module.
__all__ = ["strip_accents", "tokenize_for_diff", "generate_diff_html"]

# -------------------------------------------------------------------
# Diff helpers (can be moved to another file if desired)
# -------------------------------------------------------------------

def strip_accents(text: str) -> str:
    """Return *text* with diacritical marks removed.

    The string is decomposed with Unicode NFKD normalization and every
    combining character is then dropped. The result is meant to be used as
    a comparison key; callers keep the original string for display.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # unicodedata.combining() returns 0 for characters that are not
    # combining marks, so only those characters are kept.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)


def tokenize_for_diff(text: str):
    """Split *text* into word tokens for diffing.

    Section headers such as ``## S:`` (``##``, optional whitespace, one of
    the letters S/O/A/P in either case, then a colon) are removed first.
    Every remaining run of word characters becomes one token; punctuation
    and whitespace are discarded.

    Args:
        text: The text to tokenize.

    Returns:
        A pair ``(orig_tokens, norm_tokens)`` of equal-length lists.
        ``orig_tokens`` holds the tokens as they should be displayed;
        ``norm_tokens`` holds the same tokens with accents stripped and
        lowercased, for case- and accent-insensitive matching.
    """
    cleaned = re.sub(r"##\s*[SOAP]:", "", text, flags=re.IGNORECASE)
    # Compose "base letter + combining mark" sequences into single code
    # points before tokenizing. ``\w`` does not match combining marks, so
    # decomposed (NFD) input such as "ac\u0327a\u0303o" would otherwise be
    # split into "ac", "a", "o" and lose its accents in the displayed token.
    cleaned = unicodedata.normalize("NFC", cleaned)
    words = re.findall(r"\b\w+\b", cleaned, flags=re.UNICODE)
    orig_tokens = words
    norm_tokens = [strip_accents(w).lower() for w in words]
    return orig_tokens, norm_tokens


def generate_diff_html(ref_txt: str, gen_txt: str) -> str:
    """Render a side-by-side, token-level HTML diff of two texts.

    Both texts are tokenized with ``tokenize_for_diff``. Tokens are aligned
    on their normalized forms, so the comparison is case- and
    accent-insensitive, while the original tokens are what gets displayed.

    Args:
        ref_txt: Reference text, rendered in the left ("Reference") column.
        gen_txt: Generated text, rendered in the right ("Generated") column.

    Returns:
        An HTML fragment: a flex container with two columns. Reference
        tokens with no counterpart in the generated text are highlighted in
        dark red; generated tokens with no counterpart in the reference are
        highlighted in dark gold (``#b8860b``). Matching tokens are emitted
        unstyled.
    """

    def highlight(tok: str, background: str) -> str:
        # Wrap one token in a coloured inline badge.
        return (
            f'<span style="background:{background};color:#fff;padding:2px;'
            f'border-radius:2px;margin:0 1px;display:inline-block;">{tok}</span>'
        )

    ref_orig, ref_norm = tokenize_for_diff(ref_txt)
    gen_orig, gen_norm = tokenize_for_diff(gen_txt)

    # autojunk=False: the default heuristic activates once the second
    # sequence has 200+ items and stops using any item that accounts for
    # more than ~1% of it as a match anchor. In natural-language text those
    # are the common words, so long texts could be reported as one large
    # replacement instead of a fine-grained diff.
    matcher = difflib.SequenceMatcher(a=ref_norm, b=gen_norm, autojunk=False)
    ref_html, gen_html = [], []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            ref_html.extend(ref_orig[i1:i2])
            gen_html.extend(gen_orig[j1:j2])
        elif tag == "delete":
            ref_html.extend(highlight(tok, "darkred") for tok in ref_orig[i1:i2])
        elif tag == "insert":
            gen_html.extend(highlight(tok, "#b8860b") for tok in gen_orig[j1:j2])
        elif tag == "replace":
            # A replacement marks both sides: removed tokens on the
            # reference side, added tokens on the generated side.
            ref_html.extend(highlight(tok, "darkred") for tok in ref_orig[i1:i2])
            gen_html.extend(highlight(tok, "#b8860b") for tok in gen_orig[j1:j2])

    ref_col = " ".join(ref_html)
    gen_col = " ".join(gen_html)
    return (
        '<div style="display:flex; gap:1rem; align-items:flex-start; '
        'font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,sans-serif;">'
        '<div style="flex:1; border:1px solid #444; padding:10px; border-radius:6px; background:#0f1218;">'
        '<div style="font-weight:600; margin-bottom:4px; color:#f0f0f0;">Reference</div>'
        f'<div style="line-height:1.3; color:#fff; word-break:break-word;">{ref_col}</div>'
        '</div>'
        '<div style="flex:1; border:1px solid #444; padding:10px; border-radius:6px; background:#0f1218;">'
        '<div style="font-weight:600; margin-bottom:4px; color:#f0f0f0;">Generated</div>'
        f'<div style="line-height:1.3; color:#fff; word-break:break-word;">{gen_col}</div>'
        '</div>'
        '</div>'
    )