Spaces:
Running
Running
batuhanozkose
feat: Implement initial PaperCast application with core modules, documentation, a periodic curl script, and a Gradio certificate.
472739a
| import os | |
| import requests | |
| from urllib.parse import urlparse | |
| from utils.config import TEMP_DIR | |
def fetch_paper_from_url(url: str) -> str:
    """
    Download a PDF from a URL (supports arXiv and medRxiv).

    Abstract-page URLs are rewritten to their direct-PDF equivalents
    before downloading.

    Args:
        url (str): The URL of the paper (abstract page or direct PDF link).

    Returns:
        str: Path to the downloaded PDF file, or "" if the download failed.
    """
    # Rewrite arXiv abstract URLs to the direct PDF endpoint.
    if "arxiv.org/abs/" in url:
        url = url.replace("/abs/", "/pdf/")
        if not url.endswith(".pdf"):
            url += ".pdf"
    # Rewrite medRxiv content URLs to the full-text PDF.
    # Example: https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1
    # or:      https://www.medrxiv.org/content/10.1101/2025.11.13.25340182v1.full.pdf
    elif "medrxiv.org/content/" in url:
        if not url.endswith(".pdf"):
            url = url + ".full.pdf"

    try:
        # Browser-like headers avoid 403 Forbidden responses from bioRxiv/medRxiv.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }
        # Context manager guarantees the streamed connection is released
        # even if chunk iteration raises part-way through.
        with requests.get(url, stream=True, headers=headers, timeout=30) as response:
            response.raise_for_status()

            # Derive the local filename from the URL path, falling back
            # to a fixed name when the path does not look like a PDF.
            filename = os.path.basename(urlparse(url).path)
            if not filename.endswith(".pdf"):
                filename = "downloaded_paper.pdf"

            os.makedirs(TEMP_DIR, exist_ok=True)  # target dir may not exist yet
            file_path = os.path.join(TEMP_DIR, filename)
            with open(file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return file_path
    except (requests.RequestException, OSError) as e:
        # Best-effort contract: callers treat "" as "download failed".
        print(f"Error downloading {url}: {e}")
        return ""