import os
import requests
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup
from smolagents.tools import tool
import wikipediaapi


def fetch_wikipedia_page(url: str) -> str:
    """Fetch raw HTML of a Wikipedia page."""
    headers = {
        "User-Agent": "GAIA_benchmark_agent/1.0 (contact: [email protected])",
        "Accept-Language": "en-US,en;q=0.9",
    }
    resp = requests.get(url, headers=headers, timeout=50)
    resp.raise_for_status()
    return resp.text
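
# Usage sketch (kept as a comment so nothing runs at import time); the URL is only
# an illustrative example:
# html = fetch_wikipedia_page("https://en.wikipedia.org/wiki/Web_scraping")
# soup = BeautifulSoup(html, "html.parser")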


def _normalize_title(value: str) -> str:
    """Lowercase, collapse whitespace for robust title comparisons."""
    return " ".join(value.lower().split()) if isinstance(value, str) else ""


def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
    """Remove sections (header + content until the next header of the same or higher
    level) whose header text matches any of `titles` (case-insensitive). Mutates
    `soup` in place.
    """
    if not titles:
        return
    excluded = {_normalize_title(t) for t in titles}
    header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

    # Find all headers that match excluded titles.
    headers_to_remove = []
    for header in soup.find_all(header_tags):
        title_text = _normalize_title(header.get_text(" ", strip=True))
        if title_text in excluded:
            headers_to_remove.append(header)

    # Remove each matching section (header + content).
    for header in headers_to_remove:
        # Skip if the header was already removed as part of another section.
        if not header.parent:
            continue
        level = int(header.name[1])
        # Determine the container to remove: the header itself or its wrapper.
        header_container = header
        # Modern Wikipedia markup wraps headers in a heading container such as
        # div.mw-heading; if so, start removal from that wrapper.
        if (header.parent and
                header.parent.name == 'div' and
                header.parent.get('class') and
                any('heading' in cls.lower() for cls in header.parent.get('class', []))):
            header_container = header.parent
        nodes_to_remove = [header_container]
        # Collect all content after the header container until the next header of
        # the same or higher level.
        current = header_container
        while current.next_sibling:
            current = current.next_sibling
            sib_name = getattr(current, "name", None)
            # If we hit another header (directly or inside a heading container),
            # check its level.
            next_header = None
            if sib_name in header_tags:
                next_header = current
            elif (sib_name == 'div' and
                  current.get('class') and
                  any('heading' in cls.lower() for cls in current.get('class', []))):
                # This is a heading container; find the header inside it.
                for child in current.find_all(header_tags):
                    next_header = child
                    break
            if next_header:
                next_level = int(next_header.name[1])
                if next_level <= level:
                    # A header of the same or higher level: stop here.
                    break
            # Otherwise this node belongs to the section being removed.
            nodes_to_remove.append(current)
        # Remove all collected nodes.
        for node in nodes_to_remove:
            try:
                node.decompose()
            except Exception:
                try:
                    node.extract()
                except Exception:
                    pass
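
# Pruning sketch (comment only; the section titles are just examples):
# _remove_sections_by_titles(soup, ["References", "External links"])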


def _cleanup_non_content(root: BeautifulSoup) -> None:
    """Remove Wikipedia UI/maintenance blocks from the main content area."""
    selectors = [
        "div#toc",
        "div.toc",
        "div.hatnote",
        "div.shortdescription",
        "div.reflist",
        "ol.references",
        "div.navbox",
        "table.navbox",
        "table.vertical-navbox",
        "table.sidebar",
        "table.ambox",
        "table.metadata",
        "div#catlinks",
        "div.mw-authority-control",
        "div.printfooter",
        "div.portal",
        "table.infobox",  # avoid dumping the infobox into the running text
    ]
    for sel in selectors:
        for el in root.select(sel):
            try:
                el.decompose()
            except Exception:
                try:
                    el.extract()
                except Exception:
                    pass
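
# Typically called on the parsed article body, e.g. (comment-only sketch):
# _cleanup_non_content(soup.select_one("div.mw-parser-output") or soup)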


def extract_text(soup: BeautifulSoup) -> str:
    """Extract the main text (paragraphs, headers, lists) from the article body only,
    preserving document order.

    Excludes content inside tables, and excludes headers that are also used as table
    names (either as a <caption> or as the nearest previous header), to avoid
    duplicating what extract_tables returns."""
    content_root = soup.select_one("div.mw-parser-output") or soup
    for elem in content_root(["script", "style", "sup", "aside", "nav"]):
        elem.decompose()
    _cleanup_non_content(content_root)

    # Identify table names (from captions or nearest previous headers) so they are
    # not duplicated in the extracted text.
    table_names_normalized = set()
    for table in content_root.find_all("table"):
        # Skip non-content tables (same logic as extract_tables).
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue
        name_text = None
        caption_el = table.find("caption")
        if caption_el:
            caption_text = caption_el.get_text(" ", strip=True)
            if caption_text:
                name_text = caption_text
            else:
                # Empty caption: treat as no caption and fall back to the previous header.
                prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
                if prev_header:
                    name_text = prev_header.get_text(" ", strip=True)
        else:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name_text = prev_header.get_text(" ", strip=True)
        if not name_text and isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
            name_text = "Infobox"
        if name_text:
            table_names_normalized.add(_normalize_title(name_text))

    # Collect text elements in document order, skipping duplicates of table content.
    text_elements = []
    for element in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li"]):
        # Skip elements inside a table (to avoid duplication with extract_tables).
        if element.find_parent("table"):
            continue
        # Skip headers that match a table name (to avoid duplication with extract_tables).
        if element.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            header_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if header_text_norm in table_names_normalized:
                continue
        # Skip list items that are exactly a table name (common for inline mini-TOCs
        # within sections).
        if element.name == "li":
            li_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if li_text_norm in table_names_normalized:
                continue
        text = element.get_text(" ", strip=True)
        if text:  # only include non-empty text
            text_elements.append(text)
    return "\n\n".join(text_elements)


def extract_tables(soup: BeautifulSoup) -> list[dict]:
    """Extract all HTML tables as dicts of the form {"name": ..., "df": ...}."""
    content_root = soup.select_one("div.mw-parser-output") or soup
    tables = []
    for table_idx, table in enumerate(content_root.find_all("table")):
        # Skip non-content tables (navboxes, sidebars, etc.).
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue
        # Prefer an explicit <caption>.
        caption_el = table.find("caption")
        name = caption_el.get_text(" ", strip=True) if caption_el else None
        # Fallback: nearest previous section header.
        if not name:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name = prev_header.get_text(" ", strip=True)
        # Fallback: class-based hints (e.g. infobox).
        if not name:
            if isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
                name = "Infobox"
        # Final fallback.
        if not name:
            name = f"Table {table_idx + 1}"
        try:
            dfs = pd.read_html(StringIO(str(table)))
            if len(dfs) == 1:
                tables.append({"name": name, "df": dfs[0]})
            else:
                for part_idx, df in enumerate(dfs, start=1):
                    tables.append({"name": f"{name} (part {part_idx})", "df": df})
        except ValueError:
            # pandas raises ValueError when a table has no parsable rows; skip it.
            continue
    return tables
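
# Comment-only sketch: each entry pairs a readable name with a DataFrame, e.g.
# for t in extract_tables(soup):
#     print(t["name"], t["df"].shape)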


def format_for_llm(text: str, tables: list[dict], sections_to_exclude: list[str]) -> str:
    """Combine text + tables into a single string for LLM input."""
    output = []
    output.append("=== ARTICLE TEXT ===\n")
    output.append(text)
    excluded = {_normalize_title(s) for s in sections_to_exclude}
    filtered_tables = [
        t for t in tables if _normalize_title(t.get("name", "")) not in excluded
    ]
    for i, t in enumerate(filtered_tables, start=1):
        tname = t.get("name") or f"Table {i}"
        df = t["df"]
        output.append(f"\n\n=== TABLE {i}: {tname} ===\n")
        output.append(df.to_markdown(index=False))
    return "\n".join(output)


@tool
def wikipedia_summary(entity: str) -> dict:
    """
    Search Wikipedia for an entity and return a dictionary with the summary of the
    page and the URL of the page.

    Args:
        entity: The entity being searched for. ALWAYS pass exactly the entity name
            (person/place/event/concept) with no qualifiers.

    Returns:
        A dictionary with the summary of the page and the URL of the page.
    """
    import wikipedia  # used only to resolve the canonical URL from the page id

    summary_tool = wikipediaapi.Wikipedia(
        user_agent=f"My research agent ({os.getenv('USER_EMAIL')})",
    )
    page = summary_tool.page(entity)
    if not page.exists():
        raise ValueError(f"No Wikipedia page found for '{entity}'. Try a different query.")
    sections = [section.title for section in page.sections]
    return {
        "summary": (
            f"The sections inside the page are {', '.join(sections)} "
            f"and the summary of the page is {page.summary}"
        ),
        "url": wikipedia.page(pageid=page.pageid).url,
    }
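
# Comment-only sketch (requires network access; the entity name is just an example):
# info = wikipedia_summary("Ada Lovelace")
# info["summary"], info["url"]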


@tool
def read_wikipedia_page(
    url: str,
    sections_to_exclude: list[str] = [
        "External links",
        "References",
        "Further reading",
        "See also",
        "Notes",
    ],
) -> str:
    """
    Read a Wikipedia page and return a string with the text of the page.

    Args:
        url: The URL of the Wikipedia page to read.
        sections_to_exclude: A list of section titles to exclude from the page.

    Returns:
        A string with the text of the page, followed by its tables rendered as markdown.
    """
    if not url.startswith("https://en.wikipedia.org/wiki/"):
        raise ValueError("Expected an English Wikipedia article URL (https://en.wikipedia.org/wiki/...).")
    # Fetch the page
    html = fetch_wikipedia_page(url)
    # Parse the page
    soup = BeautifulSoup(html, "html.parser")
    # Remove unwanted sections
    _remove_sections_by_titles(soup, sections_to_exclude)
    # Extract text and tables after pruning unwanted sections
    text = extract_text(soup)
    tables = extract_tables(soup)
    # Combine into a single LLM-ready string
    llm_ready = format_for_llm(text, tables, sections_to_exclude)
    return llm_ready
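

# Minimal manual check, a sketch assuming network access; the URL below is only an
# example, and the smolagents @tool wrapper is assumed to remain directly callable.
if __name__ == "__main__":
    sample_url = "https://en.wikipedia.org/wiki/Web_scraping"  # hypothetical example URL
    print(read_wikipedia_page(sample_url)[:1000])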