import os
from urllib.parse import urlparse

import requests

from utils.config import TEMP_DIR


def fetch_paper_from_url(url: str) -> str:
    """
    Downloads a PDF from a URL (supports arXiv and medRxiv).

    Args:
        url (str): The URL of the paper.

    Returns:
        str: Path to the downloaded PDF file, or an empty string on failure.
    """
    # Handle arXiv abstract URLs by rewriting them to the direct PDF link
    if "arxiv.org/abs/" in url:
        url = url.replace("/abs/", "/pdf/")
        if not url.endswith(".pdf"):
            url += ".pdf"
    # Handle medRxiv URLs
    # Example: https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1
    # or:      https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1.full.pdf
    elif "medrxiv.org/content/" in url:
        if not url.endswith(".pdf"):
            url = url + ".full.pdf"

    try:
        # Add headers to avoid 403 Forbidden errors from bioRxiv/medRxiv
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }
        response = requests.get(url, stream=True, headers=headers, timeout=30)
        response.raise_for_status()

        # Extract filename from the URL path, or fall back to a default name
        parsed_url = urlparse(url)
        filename = os.path.basename(parsed_url.path)
        if not filename.endswith(".pdf"):
            filename = "downloaded_paper.pdf"

        # Ensure the download directory exists before writing
        os.makedirs(TEMP_DIR, exist_ok=True)
        file_path = os.path.join(TEMP_DIR, filename)
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        return file_path
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return ""
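

# A minimal usage sketch, assuming utils.config.TEMP_DIR points at a writable
# directory. The arXiv ID below is a hypothetical placeholder; the medRxiv URL
# is the example from the comments above.
if __name__ == "__main__":
    for example_url in (
        "https://arxiv.org/abs/2401.00001",  # arXiv abstract page (placeholder ID)
        "https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1",  # medRxiv landing page
    ):
        path = fetch_paper_from_url(example_url)
        if path:
            print(f"Saved {example_url} -> {path}")
        else:
            print(f"Failed to download {example_url}")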