import os
from io import StringIO

import pandas as pd
import requests
import wikipediaapi
from bs4 import BeautifulSoup
from smolagents.tools import tool


def fetch_wikipedia_page(url: str) -> str:
    """Fetch the raw HTML of a Wikipedia page."""
    headers = {
        "User-Agent": "GAIA_benchmark_agent/1.0 (contact: [email protected])",
        "Accept-Language": "en-US,en;q=0.9",
    }
    resp = requests.get(url, headers=headers, timeout=50)
    resp.raise_for_status()
    return resp.text


def _normalize_title(value: str) -> str:
    """Lowercase and collapse whitespace for robust title comparisons."""
    return " ".join(value.lower().split()) if isinstance(value, str) else ""


def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
    """Remove sections (header + content until the next header of the same or
    higher level) whose header text matches any of `titles` (case-insensitive).
    Mutates `soup` in place.
    """
    if not titles:
        return
    excluded = {_normalize_title(t) for t in titles}
    header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

    # Find all headers that match excluded titles
    headers_to_remove = []
    for header in soup.find_all(header_tags):
        title_text = _normalize_title(header.get_text(" ", strip=True))
        if title_text in excluded:
            headers_to_remove.append(header)

    # Remove each matching section (header + content)
    for header in headers_to_remove:
        # Skip if the header was already removed as part of another section
        if not header.parent:
            continue
        level = int(header.name[1])

        # Determine the container to remove: the header itself, or its wrapper
        # when the header sits inside a heading container (e.g. div.mw-heading)
        header_container = header
        if (
            header.parent
            and header.parent.name == "div"
            and header.parent.get("class")
            and any("heading" in cls.lower() for cls in header.parent.get("class", []))
        ):
            header_container = header.parent

        nodes_to_remove = [header_container]

        # Collect all content after the header container until the next header
        # of the same or higher level
        current = header_container
        while current.next_sibling:
            current = current.next_sibling
            sib_name = getattr(current, "name", None)

            # If we hit another header (directly or inside a heading container),
            # check its level
            next_header = None
            if sib_name in header_tags:
                next_header = current
            elif (
                sib_name == "div"
                and current.get("class")
                and any("heading" in cls.lower() for cls in current.get("class", []))
            ):
                # This is a heading container; find the header inside it
                for child in current.find_all(header_tags):
                    next_header = child
                    break
            if next_header:
                next_level = int(next_header.name[1])
                if next_level <= level:
                    # A header of the same or higher level: stop here
                    break

            # Add this node to the removal list
            nodes_to_remove.append(current)

        # Remove all collected nodes
        for node in nodes_to_remove:
            try:
                node.decompose()
            except Exception:
                try:
                    node.extract()
                except Exception:
                    pass
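
# A minimal, self-contained sketch (not part of the original module) of the
# section-removal behaviour on hand-written HTML:
#
#   html = (
#       '<div class="mw-parser-output">'
#       "<h2>Biography</h2><p>Kept.</p>"
#       "<h2>References</h2><ul><li>Dropped.</li></ul>"
#       "<h2>Legacy</h2><p>Also kept.</p>"
#       "</div>"
#   )
#   soup = BeautifulSoup(html, "html.parser")
#   _remove_sections_by_titles(soup, ["References"])
#   # The "References" header and its list are removed; "Biography" and
#   # "Legacy" (same-level headers) are left untouched.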


def _cleanup_non_content(root: BeautifulSoup) -> None:
    """Remove Wikipedia UI/maintenance blocks from the main content area."""
    selectors = [
        "div#toc",
        "div.toc",
        "div.hatnote",
        "div.shortdescription",
        "div.reflist",
        "ol.references",
        "div.navbox",
        "table.navbox",
        "table.vertical-navbox",
        "table.sidebar",
        "table.ambox",
        "table.metadata",
        "div#catlinks",
        "div.mw-authority-control",
        "div.printfooter",
        "div.portal",
        "table.infobox",  # avoid dumping the infobox into the text
    ]
    for sel in selectors:
        for el in root.select(sel):
            try:
                el.decompose()
            except Exception:
                try:
                    el.extract()
                except Exception:
                    pass


def extract_text(soup: BeautifulSoup) -> str:
    """Extract the main text (paragraphs, headers, lists) from the article body,
    preserving document order.

    Content inside tables is skipped, and so are headers that double as table
    names (either as a <caption> or as the nearest previous header), to avoid
    duplicating what extract_tables already returns.
    """
    content_root = soup.select_one("div.mw-parser-output") or soup
    for elem in content_root(["script", "style", "sup", "aside", "nav"]):
        elem.decompose()
    _cleanup_non_content(content_root)

    # Identify table names (from captions or nearest previous headers) so they
    # are not duplicated in the text
    table_names_normalized = set()
    for table in content_root.find_all("table"):
        # Skip non-content tables (same logic as extract_tables)
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue
        name_text = None
        caption_el = table.find("caption")
        if caption_el:
            caption_text = caption_el.get_text(" ", strip=True)
            if caption_text:
                name_text = caption_text
            else:
                # Empty caption: treat as no caption and fall back to the previous header
                prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
                if prev_header:
                    name_text = prev_header.get_text(" ", strip=True)
        else:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name_text = prev_header.get_text(" ", strip=True)
        if not name_text and isinstance(classes, list) and any(
            c.lower() == "infobox" for c in classes
        ):
            name_text = "Infobox"
        if name_text:
            table_names_normalized.add(_normalize_title(name_text))

    # Walk the text elements in document order, skipping duplicates
    text_elements = []
    for element in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li"]):
        # Skip elements inside a table (to avoid duplication with extract_tables)
        if element.find_parent("table"):
            continue
        # Skip headers that match any table name (to avoid duplication with extract_tables)
        if element.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            header_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if header_text_norm in table_names_normalized:
                continue
        # Skip list items that are exactly a table name (common for inline mini-TOCs within sections)
        if element.name == "li":
            li_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if li_text_norm in table_names_normalized:
                continue
        text = element.get_text(" ", strip=True)
        if text:  # only include non-empty text
            text_elements.append(text)

    return "\n\n".join(text_elements)


def extract_tables(soup: BeautifulSoup) -> list[dict]:
    """Extract content tables as dicts of the form {"name": ..., "df": ...}."""
    content_root = soup.select_one("div.mw-parser-output") or soup
    tables = []
    for table_idx, table in enumerate(content_root.find_all("table")):
        # Skip non-content tables (navboxes, sidebars, etc.)
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue

        # Prefer an explicit <caption>
        caption_el = table.find("caption")
        name = caption_el.get_text(" ", strip=True) if caption_el else None
        # Fallback: nearest previous section header
        if not name:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name = prev_header.get_text(" ", strip=True)
        # Fallback: class-based hints (e.g. infobox)
        if not name:
            if isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
                name = "Infobox"
        # Final fallback
        if not name:
            name = f"Table {table_idx + 1}"

        try:
            dfs = pd.read_html(StringIO(str(table)))
            if len(dfs) == 1:
                tables.append({"name": name, "df": dfs[0]})
            else:
                for part_idx, df in enumerate(dfs, start=1):
                    tables.append({"name": f"{name} (part {part_idx})", "df": df})
        except ValueError:
            continue
    return tables


def format_for_llm(text: str, tables: list[dict], sections_to_exclude: list[str]) -> str:
    """Combine the article text and its tables into a single string for LLM input."""
    output = []
    output.append("=== ARTICLE TEXT ===\n")
    output.append(text)

    # Drop tables whose names match excluded section titles
    excluded = {_normalize_title(s) for s in sections_to_exclude}
    filtered_tables = [
        t for t in tables if _normalize_title(t.get("name", "")) not in excluded
    ]
    for i, t in enumerate(filtered_tables, start=1):
        tname = t.get("name") or f"Table {i}"
        df = t["df"]
        output.append(f"\n\n=== TABLE {i}: {tname} ===\n")
        # DataFrame.to_markdown requires the optional `tabulate` dependency
        output.append(df.to_markdown(index=False))
    return "\n".join(output)


def wikipedia_summary(entity: str) -> dict:
    """
    Look up a Wikipedia page for an entity and return a dictionary with the summary of the page and the URL of the page.

    Args:
        entity: The entity being searched for. ALWAYS pass exactly the entity name
            (person/place/event/concept) with no qualifiers.

    Returns:
        A dictionary with the summary of the page (including its section titles) and the URL of the page.
    """
    import wikipedia

    summary_tool = wikipediaapi.Wikipedia(
        user_agent=f"My research agent ({os.getenv('USER_EMAIL')})",
    )
    page = summary_tool.page(entity)
    if not page.exists():
        raise ValueError(f"No Wikipedia page found for '{entity}'. Try a different query.")
    sections = [section.title for section in page.sections]
    return {
        "summary": (
            f"The sections inside the page are {', '.join(sections)} "
            f"and the summary of the page is {page.summary}"
        ),
        "url": wikipedia.page(pageid=page.pageid).url,
    }


def read_wikipedia_page(
    url: str,
    sections_to_exclude: list[str] = [
        "External links",
        "References",
        "Further reading",
        "See also",
        "Notes",
    ],
) -> str:
    """
    Read a Wikipedia page and return a string with the text and tables of the page.

    Args:
        url: The URL of the Wikipedia page to read.
        sections_to_exclude: A list of section titles to exclude from the page.

    Returns:
        A string with the text of the page followed by its tables rendered as Markdown.
    """
    if "https://en.wikipedia.org/wiki/" not in url:
        raise ValueError("A full English Wikipedia article URL (https://en.wikipedia.org/wiki/...) is required")

    # Fetch and parse the page
    html = fetch_wikipedia_page(url)
    soup = BeautifulSoup(html, "html.parser")

    # Remove unwanted sections before extraction
    _remove_sections_by_titles(soup, sections_to_exclude)

    # Extract tables before text: extract_text mutates the soup (it strips
    # infoboxes and other non-content blocks via _cleanup_non_content), so
    # running it first would hide the infobox from extract_tables.
    tables = extract_tables(soup)
    text = extract_text(soup)

    # Combine text and tables into a single LLM-ready string
    llm_ready = format_for_llm(text, tables, sections_to_exclude)
    return llm_ready
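

# --- Usage sketch ------------------------------------------------------------
# A minimal, hedged example of how these helpers might be exercised end to end;
# it is not part of the original tool set. The article ("Ada Lovelace") is an
# illustrative choice, USER_EMAIL is only read from the environment if set, and
# both calls need network access. The functions are presumably registered as
# smolagents tools elsewhere (hence the otherwise unused `tool` import above).
if __name__ == "__main__":
    info = wikipedia_summary("Ada Lovelace")
    print(info["summary"][:500])
    print(info["url"])

    page_dump = read_wikipedia_page(info["url"])  # default section filters apply
    print(page_dump[:1000])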