import os
import requests
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup
from smolagents.tools import tool
import wikipediaapi


def fetch_wikipedia_page(url: str) -> str:
    """Fetch raw HTML of a Wikipedia page."""
    headers = {
        "User-Agent": "GAIA_benchmark_agent/1.0 (contact: [email protected])",
        "Accept-Language": "en-US,en;q=0.9",
    }
    resp = requests.get(url, headers=headers, timeout=50)
    resp.raise_for_status()
    return resp.text
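
# Usage sketch (kept as a comment so nothing runs at import time); the URL is only
# an illustrative example:
# html = fetch_wikipedia_page("https://en.wikipedia.org/wiki/Web_scraping")
# soup = BeautifulSoup(html, "html.parser")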


def _normalize_title(value: str) -> str:
    """Lowercase, collapse whitespace for robust title comparisons."""
    return " ".join(value.lower().split()) if isinstance(value, str) else ""


def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
    """Remove sections (header + content until the next header of the same or higher
    level) whose header text matches any of `titles` (case-insensitive). Mutates
    `soup` in place.
    """
    if not titles:
        return
    excluded = {_normalize_title(t) for t in titles}
    header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

    # Find all headers that match excluded titles.
    headers_to_remove = []
    for header in soup.find_all(header_tags):
        title_text = _normalize_title(header.get_text(" ", strip=True))
        if title_text in excluded:
            headers_to_remove.append(header)

    # Remove each matching section (header + content).
    for header in headers_to_remove:
        # Skip if the header was already removed as part of another section.
        if not header.parent:
            continue
        level = int(header.name[1])
        # Determine the container to remove: the header itself or its wrapper.
        header_container = header
        # Modern Wikipedia markup wraps headers in a heading container such as
        # div.mw-heading; if so, start removal from that wrapper.
        if (header.parent and
                header.parent.name == 'div' and
                header.parent.get('class') and
                any('heading' in cls.lower() for cls in header.parent.get('class', []))):
            header_container = header.parent
        nodes_to_remove = [header_container]
        # Collect all content after the header container until the next header of
        # the same or higher level.
        current = header_container
        while current.next_sibling:
            current = current.next_sibling
            sib_name = getattr(current, "name", None)
            # If we hit another header (directly or inside a heading container),
            # check its level.
            next_header = None
            if sib_name in header_tags:
                next_header = current
            elif (sib_name == 'div' and
                  current.get('class') and
                  any('heading' in cls.lower() for cls in current.get('class', []))):
                # This is a heading container; find the header inside it.
                for child in current.find_all(header_tags):
                    next_header = child
                    break
            if next_header:
                next_level = int(next_header.name[1])
                if next_level <= level:
                    # A header of the same or higher level: stop here.
                    break
            # Otherwise this node belongs to the section being removed.
            nodes_to_remove.append(current)
        # Remove all collected nodes.
        for node in nodes_to_remove:
            try:
                node.decompose()
            except Exception:
                try:
                    node.extract()
                except Exception:
                    pass
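
# Pruning sketch (comment only; the section titles are just examples):
# _remove_sections_by_titles(soup, ["References", "External links"])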


def _cleanup_non_content(root: BeautifulSoup) -> None:
    """Remove Wikipedia UI/maintenance blocks from the main content area."""
    selectors = [
        "div#toc",
        "div.toc",
        "div.hatnote",
        "div.shortdescription",
        "div.reflist",
        "ol.references",
        "div.navbox",
        "table.navbox",
        "table.vertical-navbox",
        "table.sidebar",
        "table.ambox",
        "table.metadata",
        "div#catlinks",
        "div.mw-authority-control",
        "div.printfooter",
        "div.portal",
        "table.infobox",  # avoid dumping the infobox into the running text
    ]
    for sel in selectors:
        for el in root.select(sel):
            try:
                el.decompose()
            except Exception:
                try:
                    el.extract()
                except Exception:
                    pass
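
# Typically called on the parsed article body, e.g. (comment-only sketch):
# _cleanup_non_content(soup.select_one("div.mw-parser-output") or soup)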


def extract_text(soup: BeautifulSoup) -> str:
    """Extract the main text (paragraphs, headers, lists) from the article body only,
    preserving document order.

    Excludes content inside tables, and excludes headers that are also used as table
    names (either as a <caption> or as the nearest previous header), to avoid
    duplicating what extract_tables returns."""
    content_root = soup.select_one("div.mw-parser-output") or soup
    for elem in content_root(["script", "style", "sup", "aside", "nav"]):
        elem.decompose()
    _cleanup_non_content(content_root)

    # Identify table names (from captions or nearest previous headers) so they are
    # not duplicated in the extracted text.
    table_names_normalized = set()
    for table in content_root.find_all("table"):
        # Skip non-content tables (same logic as extract_tables).
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue
        name_text = None
        caption_el = table.find("caption")
        if caption_el:
            caption_text = caption_el.get_text(" ", strip=True)
            if caption_text:
                name_text = caption_text
            else:
                # Empty caption: treat as no caption and fall back to the previous header.
                prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
                if prev_header:
                    name_text = prev_header.get_text(" ", strip=True)
        else:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name_text = prev_header.get_text(" ", strip=True)
        if not name_text and isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
            name_text = "Infobox"
        if name_text:
            table_names_normalized.add(_normalize_title(name_text))

    # Collect text elements in document order, skipping duplicates of table content.
    text_elements = []
    for element in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li"]):
        # Skip elements inside a table (to avoid duplication with extract_tables).
        if element.find_parent("table"):
            continue
        # Skip headers that match a table name (to avoid duplication with extract_tables).
        if element.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            header_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if header_text_norm in table_names_normalized:
                continue
        # Skip list items that are exactly a table name (common for inline mini-TOCs
        # within sections).
        if element.name == "li":
            li_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if li_text_norm in table_names_normalized:
                continue
        text = element.get_text(" ", strip=True)
        if text:  # only include non-empty text
            text_elements.append(text)
    return "\n\n".join(text_elements)


def extract_tables(soup: BeautifulSoup) -> list[dict]:
    """Extract all HTML tables as dicts of the form {"name": ..., "df": ...}."""
    content_root = soup.select_one("div.mw-parser-output") or soup
    tables = []
    for table_idx, table in enumerate(content_root.find_all("table")):
        # Skip non-content tables (navboxes, sidebars, etc.).
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue
        # Prefer an explicit <caption>.
        caption_el = table.find("caption")
        name = caption_el.get_text(" ", strip=True) if caption_el else None
        # Fallback: nearest previous section header.
        if not name:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name = prev_header.get_text(" ", strip=True)
        # Fallback: class-based hints (e.g. infobox).
        if not name:
            if isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
                name = "Infobox"
        # Final fallback.
        if not name:
            name = f"Table {table_idx + 1}"
        try:
            dfs = pd.read_html(StringIO(str(table)))
            if len(dfs) == 1:
                tables.append({"name": name, "df": dfs[0]})
            else:
                for part_idx, df in enumerate(dfs, start=1):
                    tables.append({"name": f"{name} (part {part_idx})", "df": df})
        except ValueError:
            # pandas raises ValueError when a table has no parsable rows; skip it.
            continue
    return tables
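
# Comment-only sketch: each entry pairs a readable name with a DataFrame, e.g.
# for t in extract_tables(soup):
#     print(t["name"], t["df"].shape)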


def format_for_llm(text: str, tables: list[dict], sections_to_exclude: list[str]) -> str:
    """Combine text + tables into a single string for LLM input."""
    output = []
    output.append("=== ARTICLE TEXT ===\n")
    output.append(text)
    excluded = {_normalize_title(s) for s in sections_to_exclude}
    filtered_tables = [
        t for t in tables if _normalize_title(t.get("name", "")) not in excluded
    ]
    for i, t in enumerate(filtered_tables, start=1):
        tname = t.get("name") or f"Table {i}"
        df = t["df"]
        output.append(f"\n\n=== TABLE {i}: {tname} ===\n")
        output.append(df.to_markdown(index=False))
    return "\n".join(output)


@tool
def wikipedia_summary(entity: str) -> dict:
    """
    Search Wikipedia for an entity and return a dictionary with the summary of the
    page and the URL of the page.

    Args:
        entity: The entity being searched for. ALWAYS pass exactly the entity name
            (person/place/event/concept) with no qualifiers.

    Returns:
        A dictionary with the summary of the page and the URL of the page.
    """
    import wikipedia  # used only to resolve the canonical URL from the page id

    summary_tool = wikipediaapi.Wikipedia(
        user_agent=f"My research agent ({os.getenv('USER_EMAIL')})",
    )
    page = summary_tool.page(entity)
    if not page.exists():
        raise ValueError(f"No Wikipedia page found for '{entity}'. Try a different query.")
    sections = [section.title for section in page.sections]
    return {
        "summary": (
            f"The sections inside the page are {', '.join(sections)} "
            f"and the summary of the page is {page.summary}"
        ),
        "url": wikipedia.page(pageid=page.pageid).url,
    }
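
# Comment-only sketch (requires network access; the entity name is just an example):
# info = wikipedia_summary("Ada Lovelace")
# info["summary"], info["url"]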


@tool
def read_wikipedia_page(
    url: str,
    sections_to_exclude: list[str] = [
        "External links",
        "References",
        "Further reading",
        "See also",
        "Notes",
    ],
) -> str:
    """
    Read a Wikipedia page and return a string with the text of the page.

    Args:
        url: The URL of the Wikipedia page to read.
        sections_to_exclude: A list of section titles to exclude from the page.

    Returns:
        A string with the text of the page, followed by its tables rendered as markdown.
    """
    if not url.startswith("https://en.wikipedia.org/wiki/"):
        raise ValueError("Expected an English Wikipedia article URL (https://en.wikipedia.org/wiki/...).")
    # Fetch the page
    html = fetch_wikipedia_page(url)
    # Parse the page
    soup = BeautifulSoup(html, "html.parser")
    # Remove unwanted sections
    _remove_sections_by_titles(soup, sections_to_exclude)
    # Extract text and tables after pruning unwanted sections
    text = extract_text(soup)
    tables = extract_tables(soup)
    # Combine into a single LLM-ready string
    llm_ready = format_for_llm(text, tables, sections_to_exclude)
    return llm_ready
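

# Minimal manual check, a sketch assuming network access; the URL below is only an
# example, and the smolagents @tool wrapper is assumed to remain directly callable.
if __name__ == "__main__":
    sample_url = "https://en.wikipedia.org/wiki/Web_scraping"  # hypothetical example URL
    print(read_wikipedia_page(sample_url)[:1000])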