# GAIA_benchmark_agent/src/tools/wikipedia_tools.py
import os
import requests
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup
from smolagents.tools import tool
import wikipediaapi
def fetch_wikipedia_page(url: str) -> str:
"""Fetch raw HTML of a Wikipedia page."""
headers = {
"User-Agent": "GAIA_benchmark_agent/1.0 (contact: [email protected])",
"Accept-Language": "en-US,en;q=0.9",
}
resp = requests.get(url, headers=headers, timeout=50)
resp.raise_for_status()
return resp.text
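# Illustrative usage (a minimal sketch; it performs a live HTTP request, so it is
# shown as a comment, and the article chosen is only an example):
#
#   html = fetch_wikipedia_page("https://en.wikipedia.org/wiki/Alan_Turing")
#   soup = BeautifulSoup(html, "html.parser")
#   print(soup.title.get_text())  # e.g. "Alan Turing - Wikipedia"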
def _normalize_title(value: str) -> str:
"""Lowercase, collapse whitespace for robust title comparisons."""
return " ".join(value.lower().split()) if isinstance(value, str) else ""
def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
"""Remove sections (header + content until next header of same/higher level) whose
header text matches any of `titles` (case-insensitive). Mutates `soup` in-place.
"""
if not titles:
return
excluded = {_normalize_title(t) for t in titles}
header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
# Find all headers that match excluded titles
headers_to_remove = []
for header in soup.find_all(header_tags):
title_text = _normalize_title(header.get_text(" ", strip=True))
if title_text in excluded:
headers_to_remove.append(header)
# Remove each matching section (header + content)
for header in headers_to_remove:
# Skip if header was already removed as part of another section
if not header.parent:
continue
level = int(header.name[1])
# Determine the container to remove - could be the header itself or its parent wrapper
header_container = header
# If header is wrapped in a heading container (like div.mw-heading), use that as the starting point
if (header.parent and
header.parent.name == 'div' and
header.parent.get('class') and
any('heading' in cls.lower() for cls in header.parent.get('class', []))):
header_container = header.parent
nodes_to_remove = [header_container]
# Collect all content after the header container until next header of same/higher level
current = header_container
while current.next_sibling:
current = current.next_sibling
sib_name = getattr(current, "name", None)
# If we hit another header (directly or within a heading container), check its level
next_header = None
if sib_name in header_tags:
next_header = current
elif (sib_name == 'div' and
current.get('class') and
any('heading' in cls.lower() for cls in current.get('class', []))):
# This is a heading container, find the header inside it
for child in current.find_all(header_tags):
next_header = child
break
if next_header:
next_level = int(next_header.name[1])
if next_level <= level:
# This is a header of same or higher level - stop here
break
# Add this node to removal list
nodes_to_remove.append(current)
# Remove all collected nodes
for node in nodes_to_remove:
try:
node.decompose()
except Exception:
try:
node.extract()
except Exception:
pass
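# Minimal sketch of the pruning behaviour on a hand-written fragment (not real
# Wikipedia markup; shown as a comment because it is purely illustrative):
#
#   soup = BeautifulSoup(
#       "<h2>History</h2><p>kept</p><h2>References</h2><ul><li>dropped</li></ul>",
#       "html.parser",
#   )
#   _remove_sections_by_titles(soup, ["References"])
#   str(soup)  # -> "<h2>History</h2><p>kept</p>"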
def _cleanup_non_content(root: BeautifulSoup) -> None:
"""Remove Wikipedia UI/maintenance blocks from the main content area."""
selectors = [
"div#toc",
"div.toc",
"div.hatnote",
"div.shortdescription",
"div.reflist",
"ol.references",
"div.navbox",
"table.navbox",
"table.vertical-navbox",
"table.sidebar",
"table.ambox",
"table.metadata",
"div#catlinks",
"div.mw-authority-control",
"div.printfooter",
"div.portal",
"table.infobox", # avoid dumping infobox into text
]
for sel in selectors:
for el in root.select(sel):
try:
el.decompose()
except Exception:
try:
el.extract()
except Exception:
pass
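# Illustrative sketch (hand-written fragment, not real Wikipedia markup):
#
#   soup = BeautifulSoup('<div class="hatnote">For other uses ...</div><p>Body.</p>', "html.parser")
#   _cleanup_non_content(soup)
#   str(soup)  # -> "<p>Body.</p>"  (the hatnote block is removed)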
def extract_text(soup: BeautifulSoup) -> str:
"""Extract main text (paragraphs + headers + lists) from article body only, preserving document order.
Excludes content that's inside tables and excludes headers that are also used as
table names (either as <caption> or the nearest previous header) to avoid duplication
with extract_tables."""
content_root = soup.select_one("div.mw-parser-output") or soup
for elem in content_root(["script", "style", "sup", "aside", "nav"]):
elem.decompose()
_cleanup_non_content(content_root)
# Identify table names (from captions or nearest previous headers) to avoid duplicating them in text
table_names_normalized = set()
for table in content_root.find_all("table"):
# Skip non-content tables (same logic as extract_tables)
classes = table.get("class", [])
if isinstance(classes, list) and any(
c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
for c in classes
):
continue
name_text = None
caption_el = table.find("caption")
if caption_el:
caption_text = caption_el.get_text(" ", strip=True)
if caption_text:
name_text = caption_text
else:
# Empty caption: treat as no caption and fallback to previous header
prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
if prev_header:
name_text = prev_header.get_text(" ", strip=True)
else:
prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
if prev_header:
name_text = prev_header.get_text(" ", strip=True)
if not name_text and isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
name_text = "Infobox"
if name_text:
table_names_normalized.add(_normalize_title(name_text))
# Find all text elements in document order, but exclude duplicates
text_elements = []
for element in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li"]):
# Skip elements that are inside a table (to avoid duplication with extract_tables)
if element.find_parent("table"):
continue
# Skip headers that match any table name (to avoid duplication with extract_tables)
if element.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
header_text_norm = _normalize_title(element.get_text(" ", strip=True))
if header_text_norm in table_names_normalized:
continue
# Skip list items that are exactly a table name (common for inline mini-TOCs within sections)
if element.name == "li":
li_text_norm = _normalize_title(element.get_text(" ", strip=True))
if li_text_norm in table_names_normalized:
continue
text = element.get_text(" ", strip=True)
if text: # Only include non-empty text
text_elements.append(text)
return "\n\n".join(text_elements)
def extract_tables(soup: BeautifulSoup) -> list[dict]:
"""Extract all HTML tables as dicts: {name, df}."""
content_root = soup.select_one("div.mw-parser-output") or soup
tables = []
for table_idx, table in enumerate(content_root.find_all("table")):
# Skip non-content tables (navboxes, sidebars, etc.)
classes = table.get("class", [])
if isinstance(classes, list) and any(
c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
for c in classes
):
continue
# Prefer explicit <caption>
caption_el = table.find("caption")
name = caption_el.get_text(" ", strip=True) if caption_el else None
# Fallback: nearest previous section header
if not name:
prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
if prev_header:
name = prev_header.get_text(" ", strip=True)
# Fallback: class-based hints (e.g., infobox)
if not name:
if isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
name = "Infobox"
# Final fallback
if not name:
name = f"Table {table_idx + 1}"
try:
dfs = pd.read_html(StringIO(str(table)))
if len(dfs) == 1:
tables.append({"name": name, "df": dfs[0]})
else:
for part_idx, df in enumerate(dfs, start=1):
tables.append({"name": f"{name} (part {part_idx})", "df": df})
except ValueError:
continue
return tables
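# Minimal sketch with a synthetic wikitable (pd.read_html needs lxml or html5lib
# installed; the table content is invented for illustration):
#
#   soup = BeautifulSoup(
#       '<table class="wikitable"><caption>Scores</caption>'
#       '<tr><th>Name</th><th>Score</th></tr><tr><td>A</td><td>1</td></tr></table>',
#       "html.parser",
#   )
#   extract_tables(soup)  # -> [{"name": "Scores", "df": <DataFrame with columns Name, Score>}]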
def format_for_llm(text: str, tables: list[dict], sections_to_exclude: list[str]) -> str:
"""Combine text + tables into a single string for LLM input."""
output = []
output.append("=== ARTICLE TEXT ===\n")
output.append(text)
excluded = {_normalize_title(s) for s in sections_to_exclude}
filtered_tables = [
t for t in tables if _normalize_title(t.get("name", "")) not in excluded
]
for i, t in enumerate(filtered_tables, start=1):
tname = t.get("name") or f"Table {i}"
df = t["df"]
output.append(f"\n\n=== TABLE {i}: {tname} ===\n")
output.append(df.to_markdown(index=False))
return "\n".join(output)
@tool
def wikipedia_summary(entity: str) -> dict:
"""
    Look up a Wikipedia page for the given entity and return its summary and URL.
    Args:
        entity: The exact name of the entity (person/place/event/concept) to look up. ALWAYS pass the entity name alone, with no extra qualifiers.
    Returns:
        A dictionary with the summary of the page (including its section titles) and the url of the page.
"""
import wikipedia
summary_tool = wikipediaapi.Wikipedia(
user_agent=f"My research agent ({os.getenv('USER_EMAIL')})",
)
page = summary_tool.page(entity)
if not page.exists():
raise ValueError(f"No Wikipedia page found for '{entity}'. Try a different query.")
    sections = [section.title for section in page.sections]
    return {
        "summary": (
            f"The sections inside the page are {', '.join(sections)} "
            f"and the summary of the page is {page.summary}"
        ),
        "url": wikipedia.page(pageid=page.pageid).url,
    }
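# Illustrative call (performs live Wikipedia API requests, so shown as a comment;
# the entity is only an example):
#
#   wikipedia_summary("Ada Lovelace")
#   # -> {"summary": "The sections inside the page are ... and the summary of the page is ...",
#   #     "url": "https://en.wikipedia.org/wiki/Ada_Lovelace"}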
@tool
def read_wikipedia_page(
url: str,
sections_to_exclude: list[str] = [
"External links",
"References",
"Further reading",
"See also",
"Notes",
]) -> str:
"""
Read a Wikipedia page and return a string with the text of the page.
Args:
url: The URL of the Wikipedia page to read.
sections_to_exclude: A list of sections to exclude from the page.
Returns:
A string with the text of the page.
"""
if "https://en.wikipedia.org/wiki/" not in url:
raise ValueError("URL is required")
# Fetch the page
html = fetch_wikipedia_page(url)
# Parse the page
soup = BeautifulSoup(html, "html.parser")
# Remove unwanted sections
_remove_sections_by_titles(soup, sections_to_exclude)
# Extract after pruning unwanted sections
text = extract_text(soup)
tables = extract_tables(soup)
# Combine
llm_ready = format_for_llm(text, tables, sections_to_exclude)
return llm_ready
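if __name__ == "__main__":
    # Minimal smoke test of the full pipeline (a sketch only: it assumes network
    # access, and the article below is just an example target). smolagents tool
    # objects are callable, so the tool can be invoked directly here.
    demo_url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
    print(read_wikipedia_page(demo_url)[:1000])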