Spaces:
Sleeping
Sleeping
| import unicodedata | |
| import re | |
| import difflib | |
| __all__ = ["strip_accents", "tokenize_for_diff", "generate_diff_html"] | |
| # ------------------------------------------------------------------- | |
| # Diff helpers (can be moved to another file if desired) | |
| # ------------------------------------------------------------------- | |
def strip_accents(text: str) -> str:
    """Return *text* with diacritical marks removed.

    The input is decomposed with Unicode NFKD normalization, which separates
    base characters from their combining marks (and also expands
    compatibility characters), and every combining character is then
    discarded. The caller's string is not modified, so the original can
    still be used for display while this result is used for comparison.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # unicodedata.combining() returns 0 for characters that are not
    # combining marks; only those are kept.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def tokenize_for_diff(text: str):
    """Split *text* into word tokens suitable for diffing.

    Section headers of the form ``## S:``, ``## O:``, ``## A:`` or ``## P:``
    (any letter case, optional whitespace after ``##``) are removed before
    tokenizing. Only runs of word characters are kept; punctuation and
    whitespace are discarded.

    Returns:
        A pair ``(orig_tokens, norm_tokens)`` of equal-length lists.
        ``orig_tokens`` holds the words as they should be displayed;
        ``norm_tokens`` holds the same words with accents stripped and
        lowercased, for case- and accent-insensitive matching.
    """
    # Compose to NFC first: ``\w`` does not match combining marks, so
    # decomposed input (e.g. "e" followed by U+0301) would otherwise be
    # split in the middle of a word ("re", "sume" instead of "resume").
    # Already-composed text is unaffected.
    composed = unicodedata.normalize("NFC", text)
    cleaned = re.sub(r"##\s*[SOAP]:", "", composed, flags=re.IGNORECASE)
    orig_tokens = re.findall(r"\b\w+\b", cleaned, flags=re.UNICODE)
    norm_tokens = [strip_accents(token).lower() for token in orig_tokens]
    return orig_tokens, norm_tokens
def generate_diff_html(ref_txt: str, gen_txt: str) -> str:
    """Render a two-column, word-level HTML comparison of two texts.

    Both texts are tokenized with ``tokenize_for_diff``. Alignment is
    computed on the normalized tokens (so case and accents are ignored),
    while the original spelling is what gets displayed. Words present only
    on the reference side (deleted or replaced) are highlighted in dark red;
    words present only on the generated side (inserted or replacing) are
    highlighted in gold (#b8860b). Matching words are emitted unstyled.

    Returns:
        An HTML fragment: a flex container holding a "Reference" panel and
        a "Generated" panel.
    """

    def _removed(word: str) -> str:
        # Highlight for a reference-side word with no match.
        return f'<span style="background:darkred;color:#fff;padding:2px;border-radius:2px;margin:0 1px;display:inline-block;">{word}</span>'

    def _added(word: str) -> str:
        # Highlight for a generated-side word with no match.
        return f'<span style="background:#b8860b;color:#fff;padding:2px;border-radius:2px;margin:0 1px;display:inline-block;">{word}</span>'

    ref_words, ref_keys = tokenize_for_diff(ref_txt)
    gen_words, gen_keys = tokenize_for_diff(gen_txt)

    left_parts, right_parts = [], []
    opcodes = difflib.SequenceMatcher(a=ref_keys, b=gen_keys).get_opcodes()
    for op, a_lo, a_hi, b_lo, b_hi in opcodes:
        if op == "equal":
            left_parts += ref_words[a_lo:a_hi]
            right_parts += gen_words[b_lo:b_hi]
            continue
        # A "replace" touches both sides; "delete" only the reference,
        # "insert" only the generated text.
        if op in ("delete", "replace"):
            left_parts += [_removed(word) for word in ref_words[a_lo:a_hi]]
        if op in ("insert", "replace"):
            right_parts += [_added(word) for word in gen_words[b_lo:b_hi]]

    ref_col = " ".join(left_parts)
    gen_col = " ".join(right_parts)

    fragments = [
        '<div style="display:flex; gap:1rem; align-items:flex-start; ',
        'font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,sans-serif;">',
        '<div style="flex:1; border:1px solid #444; padding:10px; border-radius:6px; background:#0f1218;">',
        '<div style="font-weight:600; margin-bottom:4px; color:#f0f0f0;">Reference</div>',
        f'<div style="line-height:1.3; color:#fff; word-break:break-word;">{ref_col}</div>',
        '</div>',
        '<div style="flex:1; border:1px solid #444; padding:10px; border-radius:6px; background:#0f1218;">',
        '<div style="font-weight:600; margin-bottom:4px; color:#f0f0f0;">Generated</div>',
        f'<div style="line-height:1.3; color:#fff; word-break:break-word;">{gen_col}</div>',
        '</div>',
        '</div>',
    ]
    return "".join(fragments)