itslikethisnow's picture
initial commit
dccc925
"""Utility functions for backend processing."""
import html
import re
def clean_model_output(content: str) -> str:
    """
    Clean LLM output by removing reasoning artifacts.

    Removes:
        1. <think>...</think> blocks (matched case-insensitively, across
           multiple lines)
        2. HTML-escaped variants of those tags (&lt;think&gt;), by unescaping
           HTML entities in the whole string before matching
        3. Everything up to and including a stray closing </think> tag
           (output whose opening tag was cut off)
        4. A leading "Reasoning:" paragraph, when it is followed by a blank
           line and further text

    Args:
        content: Raw LLM response string

    Returns:
        Cleaned response string with surrounding whitespace stripped.
        Returns "" when content is empty.
    """
    if not content:
        return ""

    # 1. Unescape HTML entities so escaped tags look like real ones below.
    content = html.unescape(content)

    # 2. Remove complete <think> blocks (non-greedy so separate blocks are
    #    removed individually; DOTALL lets '.' span newlines).
    content = re.sub(
        r'<think>.*?</think>',
        '',
        content,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # 3. Any closing tag still present has no matching opener; keep only the
    #    text after the last one. Case-insensitive to agree with step 2.
    #    When no tag is present, re.split returns the string unchanged.
    content = re.split(r'</think>', content, flags=re.IGNORECASE)[-1]

    # 4. Drop a leading "Reasoning:" paragraph. Leading whitespace is removed
    #    first: otherwise a blank line left behind by step 2 would be taken
    #    as the paragraph separator and the header would survive.
    content = content.lstrip()
    if content.startswith("Reasoning:"):
        _header, separator, remainder = content.partition("\n\n")
        # Without a blank line there is nothing after the header to keep,
        # so the text is left as it is.
        if separator:
            content = remainder

    return content.strip()