# GAIA_benchmark_agent/src/tools/wikipedia_tools.py
import os
import requests
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup
from smolagents.tools import tool
import wikipediaapi
def fetch_wikipedia_page(url: str) -> str:
"""Fetch raw HTML of a Wikipedia page."""
headers = {
"User-Agent": "GAIA_benchmark_agent/1.0 (contact: [email protected])",
"Accept-Language": "en-US,en;q=0.9",
}
resp = requests.get(url, headers=headers, timeout=50)
resp.raise_for_status()
return resp.text
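# Illustrative usage (a minimal sketch; it performs a live HTTP request, so it is
# shown as a comment, and the article chosen is only an example):
#
#   html = fetch_wikipedia_page("https://en.wikipedia.org/wiki/Alan_Turing")
#   soup = BeautifulSoup(html, "html.parser")
#   print(soup.title.get_text())  # e.g. "Alan Turing - Wikipedia"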
def _normalize_title(value: str) -> str:
"""Lowercase, collapse whitespace for robust title comparisons."""
return " ".join(value.lower().split()) if isinstance(value, str) else ""
def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
"""Remove sections (header + content until next header of same/higher level) whose
header text matches any of `titles` (case-insensitive). Mutates `soup` in-place.
"""
if not titles:
return
excluded = {_normalize_title(t) for t in titles}
header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
# Find all headers that match excluded titles
headers_to_remove = []
for header in soup.find_all(header_tags):
title_text = _normalize_title(header.get_text(" ", strip=True))
if title_text in excluded:
headers_to_remove.append(header)
# Remove each matching section (header + content)
for header in headers_to_remove:
# Skip if header was already removed as part of another section
if not header.parent:
continue
level = int(header.name[1])
# Determine the container to remove - could be the header itself or its parent wrapper
header_container = header
# If header is wrapped in a heading container (like div.mw-heading), use that as the starting point
if (header.parent and
header.parent.name == 'div' and
header.parent.get('class') and
any('heading' in cls.lower() for cls in header.parent.get('class', []))):
header_container = header.parent
nodes_to_remove = [header_container]
# Collect all content after the header container until next header of same/higher level
current = header_container
while current.next_sibling:
current = current.next_sibling
sib_name = getattr(current, "name", None)
# If we hit another header (directly or within a heading container), check its level
next_header = None
if sib_name in header_tags:
next_header = current
elif (sib_name == 'div' and
current.get('class') and
any('heading' in cls.lower() for cls in current.get('class', []))):
# This is a heading container, find the header inside it
for child in current.find_all(header_tags):
next_header = child
break
if next_header:
next_level = int(next_header.name[1])
if next_level <= level:
# This is a header of same or higher level - stop here
break
# Add this node to removal list
nodes_to_remove.append(current)
# Remove all collected nodes
for node in nodes_to_remove:
try:
node.decompose()
except Exception:
try:
node.extract()
except Exception:
pass
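# Minimal sketch of the pruning behaviour on a hand-written fragment (not real
# Wikipedia markup; shown as a comment because it is purely illustrative):
#
#   soup = BeautifulSoup(
#       "<h2>History</h2><p>kept</p><h2>References</h2><ul><li>dropped</li></ul>",
#       "html.parser",
#   )
#   _remove_sections_by_titles(soup, ["References"])
#   str(soup)  # -> "<h2>History</h2><p>kept</p>"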
def _cleanup_non_content(root: BeautifulSoup) -> None:
"""Remove Wikipedia UI/maintenance blocks from the main content area."""
selectors = [
"div#toc",
"div.toc",
"div.hatnote",
"div.shortdescription",
"div.reflist",
"ol.references",
"div.navbox",
"table.navbox",
"table.vertical-navbox",
"table.sidebar",
"table.ambox",
"table.metadata",
"div#catlinks",
"div.mw-authority-control",
"div.printfooter",
"div.portal",
"table.infobox", # avoid dumping infobox into text
]
for sel in selectors:
for el in root.select(sel):
try:
el.decompose()
except Exception:
try:
el.extract()
except Exception:
pass
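# Illustrative sketch (hand-written fragment, not real Wikipedia markup):
#
#   soup = BeautifulSoup('<div class="hatnote">For other uses ...</div><p>Body.</p>', "html.parser")
#   _cleanup_non_content(soup)
#   str(soup)  # -> "<p>Body.</p>"  (the hatnote block is removed)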
def extract_text(soup: BeautifulSoup) -> str:
"""Extract main text (paragraphs + headers + lists) from article body only, preserving document order.
Excludes content that's inside tables and excludes headers that are also used as
table names (either as <caption> or the nearest previous header) to avoid duplication
with extract_tables."""
content_root = soup.select_one("div.mw-parser-output") or soup
for elem in content_root(["script", "style", "sup", "aside", "nav"]):
elem.decompose()
_cleanup_non_content(content_root)
# Identify table names (from captions or nearest previous headers) to avoid duplicating them in text
table_names_normalized = set()
for table in content_root.find_all("table"):
# Skip non-content tables (same logic as extract_tables)
classes = table.get("class", [])
if isinstance(classes, list) and any(
c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
for c in classes
):
continue
name_text = None
caption_el = table.find("caption")
if caption_el:
caption_text = caption_el.get_text(" ", strip=True)
if caption_text:
name_text = caption_text
else:
# Empty caption: treat as no caption and fallback to previous header
prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
if prev_header:
name_text = prev_header.get_text(" ", strip=True)
else:
prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
if prev_header:
name_text = prev_header.get_text(" ", strip=True)
if not name_text and isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
name_text = "Infobox"
if name_text:
table_names_normalized.add(_normalize_title(name_text))
# Find all text elements in document order, but exclude duplicates
text_elements = []
for element in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li"]):
# Skip elements that are inside a table (to avoid duplication with extract_tables)
if element.find_parent("table"):
continue
# Skip headers that match any table name (to avoid duplication with extract_tables)
if element.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
header_text_norm = _normalize_title(element.get_text(" ", strip=True))
if header_text_norm in table_names_normalized:
continue
# Skip list items that are exactly a table name (common for inline mini-TOCs within sections)
if element.name == "li":
li_text_norm = _normalize_title(element.get_text(" ", strip=True))
if li_text_norm in table_names_normalized:
continue
text = element.get_text(" ", strip=True)
if text: # Only include non-empty text
text_elements.append(text)
return "\n\n".join(text_elements)
def extract_tables(soup: BeautifulSoup) -> list[dict]:
"""Extract all HTML tables as dicts: {name, df}."""
content_root = soup.select_one("div.mw-parser-output") or soup
tables = []
for table_idx, table in enumerate(content_root.find_all("table")):
# Skip non-content tables (navboxes, sidebars, etc.)
classes = table.get("class", [])
if isinstance(classes, list) and any(
c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
for c in classes
):
continue
# Prefer explicit <caption>
caption_el = table.find("caption")
name = caption_el.get_text(" ", strip=True) if caption_el else None
# Fallback: nearest previous section header
if not name:
prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
if prev_header:
name = prev_header.get_text(" ", strip=True)
# Fallback: class-based hints (e.g., infobox)
if not name:
if isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
name = "Infobox"
# Final fallback
if not name:
name = f"Table {table_idx + 1}"
try:
dfs = pd.read_html(StringIO(str(table)))
if len(dfs) == 1:
tables.append({"name": name, "df": dfs[0]})
else:
for part_idx, df in enumerate(dfs, start=1):
tables.append({"name": f"{name} (part {part_idx})", "df": df})
except ValueError:
continue
return tables
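# Minimal sketch with a synthetic wikitable (pd.read_html needs lxml or html5lib
# installed; the table content is invented for illustration):
#
#   soup = BeautifulSoup(
#       '<table class="wikitable"><caption>Scores</caption>'
#       '<tr><th>Name</th><th>Score</th></tr><tr><td>A</td><td>1</td></tr></table>',
#       "html.parser",
#   )
#   extract_tables(soup)  # -> [{"name": "Scores", "df": <DataFrame with columns Name, Score>}]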
def format_for_llm(text: str, tables: list[dict], sections_to_exclude: list[str]) -> str:
"""Combine text + tables into a single string for LLM input."""
output = []
output.append("=== ARTICLE TEXT ===\n")
output.append(text)
excluded = {_normalize_title(s) for s in sections_to_exclude}
filtered_tables = [
t for t in tables if _normalize_title(t.get("name", "")) not in excluded
]
for i, t in enumerate(filtered_tables, start=1):
tname = t.get("name") or f"Table {i}"
df = t["df"]
output.append(f"\n\n=== TABLE {i}: {tname} ===\n")
output.append(df.to_markdown(index=False))
return "\n".join(output)
@tool
def wikipedia_summary(entity: str) -> dict:
"""
    Look up a Wikipedia page for the given entity and return its summary and URL.
    Args:
        entity: The exact name of the entity (person/place/event/concept) to look up. ALWAYS pass the entity name alone, with no extra qualifiers.
    Returns:
        A dictionary with the summary of the page (including its section titles) and the url of the page.
"""
import wikipedia
summary_tool = wikipediaapi.Wikipedia(
user_agent=f"My research agent ({os.getenv('USER_EMAIL')})",
)
page = summary_tool.page(entity)
if not page.exists():
raise ValueError(f"No Wikipedia page found for '{entity}'. Try a different query.")
    sections = [section.title for section in page.sections]
    return {
        "summary": (
            f"The sections inside the page are {', '.join(sections)} "
            f"and the summary of the page is {page.summary}"
        ),
        "url": wikipedia.page(pageid=page.pageid).url,
    }
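# Illustrative call (performs live Wikipedia API requests, so shown as a comment;
# the entity is only an example):
#
#   wikipedia_summary("Ada Lovelace")
#   # -> {"summary": "The sections inside the page are ... and the summary of the page is ...",
#   #     "url": "https://en.wikipedia.org/wiki/Ada_Lovelace"}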
@tool
def read_wikipedia_page(
url: str,
sections_to_exclude: list[str] = [
"External links",
"References",
"Further reading",
"See also",
"Notes",
]) -> str:
"""
Read a Wikipedia page and return a string with the text of the page.
Args:
url: The URL of the Wikipedia page to read.
sections_to_exclude: A list of sections to exclude from the page.
Returns:
A string with the text of the page.
"""
if "https://en.wikipedia.org/wiki/" not in url:
raise ValueError("URL is required")
# Fetch the page
html = fetch_wikipedia_page(url)
# Parse the page
soup = BeautifulSoup(html, "html.parser")
# Remove unwanted sections
_remove_sections_by_titles(soup, sections_to_exclude)
# Extract after pruning unwanted sections
text = extract_text(soup)
tables = extract_tables(soup)
# Combine
llm_ready = format_for_llm(text, tables, sections_to_exclude)
return llm_ready
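if __name__ == "__main__":
    # Minimal smoke test of the full pipeline (a sketch only: it assumes network
    # access, and the article below is just an example target). smolagents tool
    # objects are callable, so the tool can be invoked directly here.
    demo_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    print(read_wikipedia_page(demo_url)[:1000])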