# papercast/processing/url_fetcher.py
import os
from urllib.parse import urlparse

import requests

from utils.config import TEMP_DIR


def fetch_paper_from_url(url: str) -> str:
    """
    Downloads a PDF from a URL (supports arXiv and medRxiv).

    Args:
        url (str): The URL of the paper.

    Returns:
        str: Path to the downloaded PDF file, or an empty string if the
            download fails.
    """
    # Handle arXiv abstract URLs
    if "arxiv.org/abs/" in url:
        url = url.replace("/abs/", "/pdf/")
        if not url.endswith(".pdf"):
            url += ".pdf"
    # Handle medRxiv URLs
    # Example: https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1
    # or:      https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1.full.pdf
    elif "medrxiv.org/content/" in url:
        if not url.endswith(".pdf"):
            url = url + ".full.pdf"

    try:
        # Add headers to avoid 403 Forbidden errors from bioRxiv/medRxiv
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }
        response = requests.get(url, stream=True, headers=headers, timeout=30)
        response.raise_for_status()

        # Extract filename from URL or use default
        parsed_url = urlparse(url)
        filename = os.path.basename(parsed_url.path)
        if not filename.endswith(".pdf"):
            filename = "downloaded_paper.pdf"

        file_path = os.path.join(TEMP_DIR, filename)
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        return file_path
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return ""