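"""Wikipedia tools for a smolagents-based agent.

`wikipedia_summary` resolves an entity to its Wikipedia page and returns the
page summary plus URL; `read_wikipedia_page` fetches an article and returns
its text and tables formatted for LLM consumption.
"""
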
import os
import requests
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup
from smolagents.tools import tool
import wikipediaapi


def fetch_wikipedia_page(url: str) -> str:
    """Fetch raw HTML of a Wikipedia page."""
    headers = {
        "User-Agent": "GAIA_benchmark_agent/1.0 (contact: [email protected])",
        "Accept-Language": "en-US,en;q=0.9",
    }
    resp = requests.get(url, headers=headers, timeout=50)
    resp.raise_for_status()
    return resp.text


def _normalize_title(value: str) -> str:
    """Lowercase, collapse whitespace for robust title comparisons."""
    return " ".join(value.lower().split()) if isinstance(value, str) else ""


def _remove_sections_by_titles(soup: BeautifulSoup, titles: list[str]) -> None:
    """Remove sections (header + content until next header of same/higher level) whose
    header text matches any of `titles` (case-insensitive). Mutates `soup` in-place.
    """
    if not titles:
        return
    excluded = {_normalize_title(t) for t in titles}
    header_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]

    # Find all headers that match excluded titles
    headers_to_remove = []
    for header in soup.find_all(header_tags):
        title_text = _normalize_title(header.get_text(" ", strip=True))
        if title_text in excluded:
            headers_to_remove.append(header)
    
    # Remove each matching section (header + content)
    for header in headers_to_remove:
        # Skip if header was already removed (decomposed) as part of another section.
        # Use getattr because decompose() clears the element's attributes.
        if not getattr(header, "parent", None):
            continue
            
        level = int(header.name[1])
        
        # Determine the container to remove - could be the header itself or its parent wrapper
        header_container = header
        # If header is wrapped in a heading container (like div.mw-heading), use that as the starting point
        if (header.parent and 
            header.parent.name == 'div' and 
            header.parent.get('class') and 
            any('heading' in cls.lower() for cls in header.parent.get('class', []))):
            header_container = header.parent
        
        nodes_to_remove = [header_container]
        
        # Collect all content after the header container until next header of same/higher level
        current = header_container
        while current.next_sibling:
            current = current.next_sibling
            sib_name = getattr(current, "name", None)
            
            # If we hit another header (directly or within a heading container), check its level
            next_header = None
            if sib_name in header_tags:
                next_header = current
            elif (sib_name == 'div' and 
                  current.get('class') and 
                  any('heading' in cls.lower() for cls in current.get('class', []))):
                # This is a heading container, find the header inside it
                for child in current.find_all(header_tags):
                    next_header = child
                    break
            
            if next_header:
                next_level = int(next_header.name[1])
                if next_level <= level:
                    # This is a header of same or higher level - stop here
                    break
            
            # Add this node to removal list
            nodes_to_remove.append(current)
        
        # Remove all collected nodes
        for node in nodes_to_remove:
            try:
                node.decompose()
            except Exception:
                try:
                    node.extract()
                except Exception:
                    pass


def _cleanup_non_content(root: BeautifulSoup) -> None:
    """Remove Wikipedia UI/maintenance blocks from the main content area."""
    selectors = [
        "div#toc",
        "div.toc",
        "div.hatnote",
        "div.shortdescription",
        "div.reflist",
        "ol.references",
        "div.navbox",
        "table.navbox",
        "table.vertical-navbox",
        "table.sidebar",
        "table.ambox",
        "table.metadata",
        "div#catlinks",
        "div.mw-authority-control",
        "div.printfooter",
        "div.portal",
        "table.infobox",  # avoid dumping infobox into text
    ]
    for sel in selectors:
        for el in root.select(sel):
            try:
                el.decompose()
            except Exception:
                try:
                    el.extract()
                except Exception:
                    pass


def extract_text(soup: BeautifulSoup) -> str:
    """Extract main text (paragraphs + headers + lists) from article body only, preserving document order.
    Excludes content that's inside tables and excludes headers that are also used as
    table names (either as <caption> or the nearest previous header) to avoid duplication
    with extract_tables."""
    content_root = soup.select_one("div.mw-parser-output") or soup

    for elem in content_root(["script", "style", "sup", "aside", "nav"]):
        elem.decompose()
    _cleanup_non_content(content_root)

    # Identify table names (from captions or nearest previous headers) to avoid duplicating them in text
    table_names_normalized = set()
    for table in content_root.find_all("table"):
        # Skip non-content tables (same logic as extract_tables)
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue

        name_text = None
        caption_el = table.find("caption")
        if caption_el:
            caption_text = caption_el.get_text(" ", strip=True)
            if caption_text:
                name_text = caption_text
            else:
                # Empty caption: treat as no caption and fallback to previous header
                prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
                if prev_header:
                    name_text = prev_header.get_text(" ", strip=True)
        else:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name_text = prev_header.get_text(" ", strip=True)

        if not name_text and isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
            name_text = "Infobox"

        if name_text:
            table_names_normalized.add(_normalize_title(name_text))

    # Find all text elements in document order, but exclude duplicates
    text_elements = []
    for element in content_root.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p", "li"]):
        # Skip elements that are inside a table (to avoid duplication with extract_tables)
        if element.find_parent("table"):
            continue

        # Skip headers that match any table name (to avoid duplication with extract_tables)
        if element.name in {"h1", "h2", "h3", "h4", "h5", "h6"}:
            header_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if header_text_norm in table_names_normalized:
                continue

        # Skip list items that are exactly a table name (common for inline mini-TOCs within sections)
        if element.name == "li":
            li_text_norm = _normalize_title(element.get_text(" ", strip=True))
            if li_text_norm in table_names_normalized:
                continue
            
        text = element.get_text(" ", strip=True)
        if text:  # Only include non-empty text
            text_elements.append(text)

    return "\n\n".join(text_elements)


def extract_tables(soup: BeautifulSoup) -> list[dict]:
    """Extract all HTML tables as dicts: {name, df}."""
    content_root = soup.select_one("div.mw-parser-output") or soup

    tables = []
    for table_idx, table in enumerate(content_root.find_all("table")):
        # Skip non-content tables (navboxes, sidebars, etc.)
        classes = table.get("class", [])
        if isinstance(classes, list) and any(
            c.lower() in {"navbox", "vertical-navbox", "sidebar", "mbox", "metadata"}
            for c in classes
        ):
            continue

        # Prefer explicit <caption>
        caption_el = table.find("caption")
        name = caption_el.get_text(" ", strip=True) if caption_el else None

        # Fallback: nearest previous section header
        if not name:
            prev_header = table.find_previous(["h1", "h2", "h3", "h4", "h5", "h6"])
            if prev_header:
                name = prev_header.get_text(" ", strip=True)

        # Fallback: class-based hints (e.g., infobox)
        if not name:
            if isinstance(classes, list) and any(c.lower() == "infobox" for c in classes):
                name = "Infobox"

        # Final fallback
        if not name:
            name = f"Table {table_idx + 1}"

        try:
            dfs = pd.read_html(StringIO(str(table)))
            if len(dfs) == 1:
                tables.append({"name": name, "df": dfs[0]})
            else:
                for part_idx, df in enumerate(dfs, start=1):
                    tables.append({"name": f"{name} (part {part_idx})", "df": df})
        except ValueError:
            continue
    return tables


def format_for_llm(text: str, tables: list[dict], sections_to_exclude: list[str]) -> str:
    """Combine text + tables into a single string for LLM input."""
    output = []
    output.append("=== ARTICLE TEXT ===\n")
    output.append(text)

    excluded = {_normalize_title(s) for s in sections_to_exclude}
    filtered_tables = [
        t for t in tables if _normalize_title(t.get("name", "")) not in excluded
    ]

    for i, t in enumerate(filtered_tables, start=1):
        tname = t.get("name") or f"Table {i}"
        df = t["df"]
        output.append(f"\n\n=== TABLE {i}: {tname} ===\n")
        output.append(df.to_markdown(index=False))

    return "\n".join(output)


@tool
def wikipedia_summary(entity: str) -> dict:
    """
    Look up an entity on Wikipedia and return a dictionary with the page summary and the page URL.
    Args:
        entity: The entity to look up. ALWAYS pass exactly the entity name (person/place/event/concept), with no qualifiers.
    Returns:
        A dictionary with the page summary (prefixed by the page's section titles) and the page URL.
    """
    wiki_client = wikipediaapi.Wikipedia(
        user_agent=f"My research agent ({os.getenv('USER_EMAIL')})",
        language="en",
    )
    page = wiki_client.page(entity)
    if not page.exists():
        raise ValueError(f"No Wikipedia page found for '{entity}'. Try a different query.")
    sections = [section.title for section in page.sections]
    return {
        "summary": (
            f"The sections inside the page are {', '.join(sections)} "
            f"and the summary of the page is {page.summary}"
        ),
        "url": page.fullurl,
    }


@tool
def read_wikipedia_page(
    url: str,
    sections_to_exclude: list[str] = [
        "External links",
        "References",
        "Further reading",
        "See also",
        "Notes",
    ]) -> str:
    """
    Read a Wikipedia page and return its article text and tables as a single string.
    Args:
        url: The URL of the Wikipedia page to read.
        sections_to_exclude: A list of section titles to exclude from the page.
    Returns:
        A string with the article text followed by its tables rendered as markdown.
    """
    if "https://en.wikipedia.org/wiki/" not in url:
        raise ValueError("URL is required")
    # Fetch the page
    html = fetch_wikipedia_page(url)
    # Parse the page
    soup = BeautifulSoup(html, "html.parser")
    # Remove unwanted sections
    _remove_sections_by_titles(soup, sections_to_exclude)

    # Extract after pruning unwanted sections
    text = extract_text(soup)
    tables = extract_tables(soup)

    # Combine
    llm_ready = format_for_llm(text, tables, sections_to_exclude)
    return llm_ready
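

# ---------------------------------------------------------------------------
# Usage sketch: a minimal example of chaining the two tools by hand, assuming
# live network access to Wikipedia. The entity name below is purely
# illustrative and not part of the tool definitions above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Resolve the entity to a page, getting its section overview and URL.
    lookup = wikipedia_summary("Python (programming language)")
    print(lookup["summary"])

    # Read the full article, with the default boilerplate sections excluded.
    article = read_wikipedia_page(lookup["url"])
    print(article[:2000])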