# papercast/processing/url_fetcher.py
import os
from urllib.parse import urlparse

import requests

from utils.config import TEMP_DIR


def fetch_paper_from_url(url: str) -> str:
    """
    Downloads a PDF from a URL (supports arXiv and medRxiv).

    Args:
        url (str): The URL of the paper.

    Returns:
        str: Path to the downloaded PDF file, or an empty string if the
            download fails.
    """
    # Handle arXiv abstract URLs
    if "arxiv.org/abs/" in url:
        url = url.replace("/abs/", "/pdf/")
        if not url.endswith(".pdf"):
            url += ".pdf"
    # Handle medRxiv URLs
    # Example: https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1
    # or:      https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1.full.pdf
    elif "medrxiv.org/content/" in url:
        if not url.endswith(".pdf"):
            url = url + ".full.pdf"

    try:
        # Add headers to avoid 403 Forbidden errors from bioRxiv/medRxiv
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }
        response = requests.get(url, stream=True, headers=headers, timeout=30)
        response.raise_for_status()

        # Extract filename from URL or use default
        parsed_url = urlparse(url)
        filename = os.path.basename(parsed_url.path)
        if not filename.endswith(".pdf"):
            filename = "downloaded_paper.pdf"

        file_path = os.path.join(TEMP_DIR, filename)
        with open(file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        return file_path
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return ""