Spaces:

MCP-1st-Birthday
/

papercast

Running

papercast / app.py

batuhanozkose

feat: Add Paper Auto-Discovery (PAD) engine and update documentation

39bbc0e 23 days ago

57 kB

	import os
	from datetime import datetime
	import gradio as gr
	from agents.podcast_agent import PodcastAgent
	from synthesis.tts_engine import ELEVENLABS_VOICES
	from synthesis.supertonic_tts import SUPERTONIC_VOICES
	from utils.config import (
	OUTPUT_DIR,
	SCRIPT_GENERATION_MODEL,
	)
	from utils.history import get_history_items, load_history
	from processing.paper_discovery import search_papers, PaperDiscoveryEngine

	# Create the output directory as soon as this module is loaded;
	# exist_ok=True makes the call a no-op when the directory already exists.
	os.makedirs(OUTPUT_DIR, exist_ok=True)

	# --- Configuration & Constants ---

	# Maps each podcast-length label to a (target_exchanges, max_tokens) pair.
	# run_agent() unpacks the pair in that order via get_podcast_length_params().
	PODCAST_LENGTH_PRESETS = {
	"⚡ Very Short (6-8 exchanges, ~2-3 min)": (7, 2000),
	"📝 Short (10-12 exchanges, ~3-4 min)": (11, 3000),
	"📄 Medium (14-16 exchanges, ~5-6 min)": (15, 4000),
	"📚 Medium-Long (18-20 exchanges, ~7-8 min)": (19, 5000),
	"📖 Long (22-25 exchanges, ~9-11 min)": (23, 6000),
	"📕 Very Long (28-32 exchanges, ~12-15 min)": (30, 8000),
	}

	# Stylesheet text for the PaperCast UI: font imports, hero section, glass
	# panels, buttons, inputs, progress steps and the terminal-style log window.
	# The step-* classes match the markup built by generate_progress_html().
	# NOTE(review): .terminal-window requests 'JetBrains Mono', but the @import
	# below only loads Outfit and Inter, so the generic monospace fallback is
	# used unless that font is installed locally - confirm this is intended.
	CUSTOM_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;800&family=Inter:wght@300;400;500;600&display=swap');

	:root {
	--primary-gradient: linear-gradient(135deg, #6366f1 0%, #a855f7 50%, #ec4899 100%);
	--glass-bg: rgba(17, 24, 39, 0.7);
	--glass-border: rgba(255, 255, 255, 0.1);
	}

	body, .gradio-container {
	font-family: 'Inter', sans-serif !important;
	background-color: #0f172a !important; /* Dark slate background */
	}

	h1, h2, h3, h4, h5, h6 {
	font-family: 'Outfit', sans-serif !important;
	}

	/* Hero Section */
	.hero-container {
	text-align: center;
	padding: 40px 20px;
	margin-bottom: 20px;
	position: relative;
	overflow: hidden;
	}

	.hero-title {
	font-size: 4rem !important;
	font-weight: 800 !important;
	margin-bottom: 10px;
	letter-spacing: -0.02em;
	color: white;
	}

	.hero-title span {
	background: linear-gradient(135deg, #6366f1 0%, #a855f7 50%, #ec4899 100%);
	-webkit-background-clip: text;
	background-clip: text;
	-webkit-text-fill-color: transparent;
	color: #a855f7; /* Fallback */
	}

	.hero-subtitle {
	font-size: 1.2rem;
	color: #94a3b8;
	max-width: 600px;
	margin: 0 auto;
	line-height: 1.6;
	}

	/* Cards & Containers */
	.glass-panel {
	background: var(--glass-bg) !important;
	backdrop-filter: blur(12px);
	border: 1px solid var(--glass-border) !important;
	border-radius: 16px !important;
	box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
	padding: 20px;
	}

	/* Buttons */
	.primary-btn {
	background: var(--primary-gradient) !important;
	border: none !important;
	color: white !important;
	font-weight: 600 !important;
	transition: all 0.3s ease !important;
	box-shadow: 0 10px 15px -3px rgba(168, 85, 247, 0.4) !important;
	}

	.primary-btn:hover {
	transform: translateY(-2px);
	box-shadow: 0 20px 25px -5px rgba(168, 85, 247, 0.5) !important;
	}

	/* Inputs */
	input, textarea, select {
	background-color: rgba(30, 41, 59, 0.8) !important;
	border: 1px solid rgba(71, 85, 105, 0.5) !important;
	color: #e2e8f0 !important;
	}

	/* Progress Steps */
	.step-container {
	display: flex;
	justify-content: space-between;
	margin-bottom: 20px;
	position: relative;
	}

	.step-line {
	position: absolute;
	top: 15px;
	left: 0;
	right: 0;
	height: 2px;
	background: #334155;
	z-index: 0;
	}

	.step-item {
	position: relative;
	z-index: 1;
	display: flex;
	flex-direction: column;
	align-items: center;
	width: 25%;
	}

	.step-circle {
	width: 32px;
	height: 32px;
	border-radius: 50%;
	background: #1e293b;
	border: 2px solid #475569;
	display: flex;
	align-items: center;
	justify-content: center;
	font-weight: bold;
	color: #94a3b8;
	transition: all 0.3s ease;
	margin-bottom: 8px;
	}

	.step-item.active .step-circle {
	background: #a855f7;
	border-color: #a855f7;
	color: white;
	box-shadow: 0 0 15px rgba(168, 85, 247, 0.5);
	}

	.step-item.completed .step-circle {
	background: #10b981;
	border-color: #10b981;
	color: white;
	}

	.step-label {
	font-size: 0.8rem;
	color: #64748b;
	font-weight: 500;
	}

	.step-item.active .step-label {
	color: #e2e8f0;
	}

	/* Terminal Output */
	.terminal-window {
	background: #0f172a !important;
	border: 1px solid #334155 !important;
	border-radius: 8px !important;
	font-family: 'JetBrains Mono', monospace !important;
	color: #22c55e !important;
	padding: 15px !important;
	}
	"""

	# --- Helper Functions ---

	def get_podcast_length_params(length_choice):
	    """Resolve a podcast-length label to its preset pair.

	    Returns the tuple stored under ``length_choice`` in
	    ``PODCAST_LENGTH_PRESETS``; labels that are not in the table resolve to
	    ``(15, 4000)``.
	    """
	    if length_choice in PODCAST_LENGTH_PRESETS:
	        return PODCAST_LENGTH_PRESETS[length_choice]
	    return (15, 4000)

	def validate_settings_for_generation(llm_choice, own_base_url, own_api_key, openai_key, tts_provider, elevenlabs_key):
	    """Check the provider settings needed before a podcast can be generated.

	    Returns ``(True, "")`` when nothing is wrong, otherwise ``(False, text)``
	    where ``text`` lists one problem per line. Only the selected LLM provider
	    is checked, and the ElevenLabs key is checked only when ``tts_provider``
	    is ``"elevenlabs"``. ``own_api_key`` is accepted but never inspected.
	    """

	    def problem_with(value, prefixes, missing_text, malformed_text):
	        # A value is either absent, carries the wrong prefix, or is fine (None).
	        if not value:
	            return missing_text
	        if value.startswith(prefixes):
	            return None
	        return malformed_text

	    # Each entry: (value, accepted prefixes, message if empty, message if malformed).
	    pending = []
	    if llm_choice == "Own Inference":
	        pending.append((
	            own_base_url,
	            ("http://", "https://"),
	            "❌ Own Inference: Base URL is required",
	            "❌ Own Inference: Base URL must start with http:// or https://",
	        ))
	    elif llm_choice == "OpenAI":
	        pending.append((
	            openai_key,
	            ("sk-",),
	            "❌ OpenAI: API key is required",
	            "❌ OpenAI: API key must start with 'sk-'",
	        ))

	    # Any other TTS provider value is not checked for an API key.
	    if tts_provider == "elevenlabs":
	        pending.append((
	            elevenlabs_key,
	            ("sk_",),
	            "❌ ElevenLabs TTS: API key is required",
	            "❌ ElevenLabs TTS: API key must start with 'sk_'",
	        ))

	    outcomes = (problem_with(*entry) for entry in pending)
	    problems = [text for text in outcomes if text is not None]
	    if problems:
	        return False, "\n".join(problems)
	    return True, ""

	def get_stats():
	    """Return a one-line summary with the number of entries in the history."""
	    podcast_total = len(load_history())
	    return f"🚀 Total Podcasts: {podcast_total}"

	def generate_progress_html(current_step):
	    """Render the four pipeline stages as an HTML step indicator.

	    Stages numbered below ``current_step`` are drawn as completed (check
	    mark), the stage equal to it as active, and the remaining ones with no
	    state class and their own number as the icon.
	    """
	    fragments = ['<div class="step-container"><div class="step-line"></div>']

	    for step_num, name in enumerate(("Fetch", "Extract", "Script", "Audio"), start=1):
	        if step_num < current_step:
	            status_class, icon = "completed", "✓"
	        elif step_num == current_step:
	            status_class, icon = "active", str(step_num)
	        else:
	            status_class, icon = "", str(step_num)

	        fragments.append(f"""
	<div class="step-item {status_class}">
	<div class="step-circle">{icon}</div>
	<div class="step-label">{name}</div>
	</div>
	""")

	    fragments.append('</div>')
	    return "".join(fragments)

	def validated_generate_agent(
	    url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
	    user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
	    user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
	    user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
	    user_persona_mode,
	    progress=gr.Progress()
	):
	    """Validate the settings, then stream UI updates while the agent runs.

	    Yields ``(progress_html_update, log_text, audio_update)`` triples. String
	    items produced by ``run_agent`` are appended to the running log and
	    matched against known markers to move the step indicator; a tuple item
	    is taken as the final ``(audio_path, final_logs)`` result, for which a
	    transcript is written and the audio player is revealed.

	    Raises:
	        gr.Error: when the settings fail validation, or wrapping any
	            exception raised while the agent is running.
	    """
	    settings_ok, problems = validate_settings_for_generation(
	        user_llm_choice, user_own_base_url, user_own_api_key,
	        user_openai_key, user_tts_provider, user_elevenlabs_key
	    )
	    if not settings_ok:
	        raise gr.Error(problems)

	    # Reveal the step indicator with no step highlighted yet.
	    yield gr.update(visible=True, value=generate_progress_html(0)), "🚀 Initializing...", gr.update(visible=False)

	    # Ordered rows of (step number, progress caption, log markers); the first
	    # row with a marker present in the log line wins.
	    stage_table = (
	        (1, "Fetching Paper...", ("fetch_paper", "downloaded")),
	        (2, "Extracting Text...", ("Extracted", "read_pdf")),
	        (3, "Generating Script...", ("generate_script", "Generated script")),
	        (4, "Synthesizing Audio...", ("synthesize_podcast", "Synthesizing")),
	    )

	    try:
	        running_log = ""
	        active_step = 0

	        for event in run_agent(
	            url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
	            user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
	            user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
	            user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
	            user_persona_mode, progress
	        ):
	            if isinstance(event, tuple):
	                # Final result: step 5 marks all four stages as completed.
	                audio_path, final_logs = event
	                generate_transcript(audio_path, final_logs)
	                progress(1.0, desc="Done!")
	                yield gr.update(value=generate_progress_html(5)), final_logs + "\n\n✨ DONE!", gr.update(value=audio_path, visible=True)
	                continue

	            # Plain log line: extend the log, then see whether it names a stage.
	            running_log += event + "\n"

	            for step_no, caption, markers in stage_table:
	                if any(marker in event for marker in markers):
	                    break
	            else:
	                step_no, caption = active_step, "Processing..."

	            if step_no == active_step:
	                # Same stage as before: refresh the log only.
	                yield gr.update(), running_log, gr.update(visible=False)
	            else:
	                active_step = step_no
	                # Steps 1-4 map to progress values 0.2-0.8.
	                progress(0.2 * active_step, desc=caption)
	                yield gr.update(value=generate_progress_html(active_step)), running_log, gr.update(visible=False)

	    except Exception as e:
	        raise gr.Error(f"System Error: {str(e)}")

	def run_agent(
	    url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
	    user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
	    user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
	    user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
	    user_persona_mode,
	    progress=gr.Progress()
	):
	    """Build a ``PodcastAgent`` from the user's settings and stream its output.

	    Yields a start-up line first. In batch mode (``advanced_mode`` truthy) it
	    then yields a summary line followed by everything produced by
	    ``agent.process_multiple``; otherwise it yields everything produced by
	    ``agent.process``. ``progress`` is accepted but not used here.

	    Raises:
	        Exception: when the selected mode has neither a URL nor a PDF input.
	    """
	    # Anything other than "Own Inference" is treated as OpenAI.
	    provider_mode = "own_inference" if user_llm_choice == "Own Inference" else "openai"

	    target_exchanges, max_tokens = get_podcast_length_params(user_podcast_length)

	    # Falsy settings (empty strings, None) are passed on as None, or as the
	    # fallback literal where one exists.
	    agent_settings = dict(
	        provider_mode=provider_mode,
	        own_base_url=user_own_base_url or None,
	        own_api_key=user_own_api_key or None,
	        own_model=user_own_model or None,
	        openai_key=user_openai_key or None,
	        openai_model=user_openai_model or None,
	        tts_provider=user_tts_provider or "elevenlabs",
	        elevenlabs_key=user_elevenlabs_key or None,
	        host_voice=user_host_voice or None,
	        guest_voice=user_guest_voice or None,
	        max_tokens=max_tokens,
	        target_dialogue_count=target_exchanges,
	        context_limit=user_context_limit,
	        persona_mode=user_persona_mode or "friendly_explainer",
	    )
	    agent = PodcastAgent(**agent_settings)

	    yield f"Starting Agent... [Mode: {provider_mode}]"

	    # Single-paper mode: one URL or one PDF.
	    if not advanced_mode:
	        if not (url or pdf_file):
	            raise Exception("Please provide a URL or PDF")
	        yield from agent.process(url=url or None, pdf_file=pdf_file)
	        return

	    # Batch mode: one URL per non-blank line.
	    urls = None
	    if multi_urls and multi_urls.strip():
	        trimmed_lines = (line.strip() for line in multi_urls.strip().split("\n"))
	        urls = [line for line in trimmed_lines if line]

	    # A single upload is wrapped so the agent always receives a list.
	    pdfs = None
	    if multi_pdfs:
	        pdfs = multi_pdfs if isinstance(multi_pdfs, list) else [multi_pdfs]

	    if not (urls or pdfs):
	        raise Exception("No input provided for advanced mode")

	    # URLs and PDFs are handed to the agent together.
	    url_count = len(urls) if urls else 0
	    pdf_count = len(pdfs) if pdfs else 0
	    total = url_count + pdf_count

	    yield f"Processing {total} items ({url_count} URLs + {pdf_count} PDFs)..."
	    yield from agent.process_multiple(urls=urls, pdf_files=pdfs)

	def generate_transcript(audio_path, logs):
	    """Write the run's logs to a transcript text file in ``OUTPUT_DIR``.

	    The file is named ``<audio basename>_transcript.txt`` and starts with a
	    timestamped header line followed by a separator and the logs.

	    Returns:
	        The transcript path, or ``None`` when ``audio_path`` is falsy (in
	        which case nothing is written).
	    """
	    if not audio_path:
	        return None

	    base_name = os.path.splitext(os.path.basename(audio_path))[0]
	    transcript_path = os.path.join(OUTPUT_DIR, f"{base_name}_transcript.txt")

	    # Explicit UTF-8: without it open() uses the platform default encoding,
	    # which can fail on non-ASCII log text (the UI strings in this file use
	    # emoji, and the logs may too).
	    with open(transcript_path, "w", encoding="utf-8") as f:
	        f.write(f"PAPERCAST TRANSCRIPT - {datetime.now()}\n{'='*30}\n\n{logs}")
	    return transcript_path

	def get_history_data():
	    """Return the history as table rows of ``[timestamp, source, audio_path]``.

	    A missing timestamp becomes ``"N/A"``, a missing or empty URL becomes
	    ``"PDF Upload"``, and a missing audio path becomes ``""``. An empty
	    history yields an empty list.
	    """
	    entries = get_history_items()
	    if not entries:
	        return []

	    rows = []
	    for entry in entries:
	        when = entry.get("timestamp", "N/A")
	        source = entry.get("url", "PDF Upload") or "PDF Upload"
	        audio = entry.get("audio_path", "")
	        rows.append([when, source, audio])
	    return rows

	def on_history_select(evt: gr.SelectData, data):
	    """Return the audio path of the history row that was selected.

	    ``evt.index[0]`` is used as the row position in ``data`` and the value in
	    column 2 of that row is returned. If the lookup fails for any ordinary
	    reason, ``None`` is returned instead of raising.
	    """
	    try:
	        return data.iloc[evt.index[0]].iloc[2]  # Audio path is column 2
	    except Exception:
	        # Best-effort lookup. Unlike a bare ``except:``, this still lets
	        # KeyboardInterrupt and SystemExit propagate.
	        return None

	def perform_paper_search(query: str, progress=gr.Progress()):
	    """
	    PAD: Search for papers using Paper Auto-Discovery.

	    Returns a ``(choices_update, status_message)`` pair. On success the update
	    carries one numbered label per paper with the first label preselected.
	    A blank query, an empty result set, or a search error all return a
	    hidden, empty choices update together with an explanatory message.
	    """
	    if not query or not query.strip():
	        return gr.update(choices=[], value=None, visible=False), "⚠️ Please enter a search query"

	    progress(0.2, desc="Searching Semantic Scholar & arXiv...")

	    try:
	        # Search using PAD
	        results = search_papers(query.strip(), max_results=5)

	        if not results:
	            return gr.update(choices=[], value=None, visible=False), "❌ No papers found. Try a different query."

	        progress(0.8, desc=f"Found {len(results)} papers")

	        # Build one numbered label per paper; the "N." prefix matches the
	        # format that on_paper_select() parses.
	        choices = []
	        for i, paper in enumerate(results, 1):
	            # At most two authors are shown, with "et al." for the rest.
	            authors_str = ", ".join(paper.authors[:2])
	            if len(paper.authors) > 2:
	                authors_str += " et al."

	            year_str = f" ({paper.year})" if paper.year else ""
	            source_emoji = "📚" if paper.source == "semantic_scholar" else "🔬"

	            # Plain "|" separator: the former "\|" is not a valid Python
	            # escape sequence and left a literal backslash in the label.
	            label = f"{i}. {source_emoji} {paper.title}{year_str} | {authors_str}"
	            choices.append(label)

	        progress(1.0, desc="Search complete!")

	        print(f"[DEBUG] Search found {len(results)} papers")
	        print(f"[DEBUG] Choices created: {len(choices)}")
	        print(f"[DEBUG] First choice: {choices[0] if choices else 'NONE'}")

	        success_msg = f"✅ Found {len(results)} papers from Semantic Scholar & arXiv"

	        # Select the first option by default to ensure visibility/interaction
	        first_choice = choices[0] if choices else None

	        return gr.update(choices=choices, value=first_choice, visible=True, interactive=True), success_msg

	    except Exception as e:
	        # UI boundary: report the failure in the status text instead of raising.
	        return gr.update(choices=[], value=None, visible=False), f"❌ Search failed: {str(e)}"

	def on_paper_select(selected_label, query):
	    """
	    Resolve a chosen search-result label to a PDF URL.

	    The number before the first "." in ``selected_label`` is the 1-based
	    position of the paper. The search is run again with ``query`` to obtain
	    the paper objects, and the PDF URL is looked up for the chosen one.

	    Returns ``(pdf_url, message)``; ``pdf_url`` is ``None`` whenever nothing
	    was selected, the position is out of range, no PDF is available, or any
	    step raises.
	    """
	    if not selected_label:
	        return None, "⚠️ Please select a paper from the search results"

	    try:
	        # Labels look like "1. emoji title..."; convert to a 0-based position.
	        position = int(selected_label.split(".")[0]) - 1

	        # Search again to get the paper objects behind the labels.
	        papers = search_papers(query.strip(), max_results=5)
	        if not papers or not (0 <= position < len(papers)):
	            return None, "❌ Invalid selection"

	        paper = papers[position]
	        pdf_url = PaperDiscoveryEngine().get_pdf_url(paper)
	        if not pdf_url:
	            return None, f"❌ No PDF available for: {paper.title}"

	        # Show at most three authors, with "et al." for the rest.
	        shown_authors = ", ".join(paper.authors[:3])
	        if len(paper.authors) > 3:
	            shown_authors += " et al."

	        summary = f"✅ Selected: {paper.title}\n\n👥 {shown_authors}\n📅 {paper.year or 'N/A'}\n🔗 {pdf_url}"
	        return pdf_url, summary

	    except Exception as e:
	        return None, f"❌ Selection failed: {str(e)}"

	# --- Main UI ---

	def main():
	# Use a dark theme base but override heavily with CSS
	theme = gr.themes.Soft(
	primary_hue="violet",
	secondary_hue="slate",
	neutral_hue="slate",
	font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"],
	).set(
	body_background_fill="#0f172a",
	block_background_fill="#1e293b",
	block_border_width="1px",
	block_border_color="rgba(255,255,255,0.1)",
	)

	with gr.Blocks(title="PaperCast") as demo:

	# Session State
	user_llm_choice = gr.State(value="Own Inference")
	user_own_base_url = gr.State(value="")
	user_own_api_key = gr.State(value="")
	user_own_model = gr.State(value="")
	user_openai_key = gr.State(value="")
	user_openai_model = gr.State(value="")
	user_tts_provider = gr.State(value="elevenlabs")
	user_elevenlabs_key = gr.State(value="")
	user_host_voice = gr.State(value="ErXwobaYiN019PkySvjV") # ElevenLabs default
	user_guest_voice = gr.State(value="EXAVITQu4vr4xnSDxMaL") # ElevenLabs default
	user_podcast_length = gr.State(value=4096)
	user_persona_mode = gr.State(value="friendly_explainer") # PPF default

	# Hero Section
	with gr.Row(elem_classes="hero-container"):
	gr.HTML("""
	<h1 class="hero-title"><span>PaperCast</span> 🎙️</h1>
	<p class="hero-subtitle">
	Experience the future of knowledge consumption. <br>
	An autonomous agentic system that transforms complex research papers into engaging, studio-quality audio experiences.
	</p>
	""")

	with gr.Tabs():

	# --- Tab 1: Create ---
	with gr.Tab("✨ Create Podcast"):

	with gr.Row():
	# Left Col: Inputs
	with gr.Column(scale=4, elem_classes="glass-panel"):
	gr.Markdown("### 📥 Source Material")

	with gr.Tabs(selected=0) as input_tabs:
	with gr.Tab("🔗 URL", id=0):
	url_input = gr.Textbox(
	label="Paper URL",
	placeholder="https://arxiv.org/abs/...",
	show_label=False,
	container=False
	)

	with gr.Tab("📄 PDF Upload"):
	pdf_upload = gr.File(
	label="Upload PDF",
	file_types=[".pdf"],
	container=False
	)

	with gr.Tab("🔍 Search (PAD)"):
	gr.Markdown("Paper Auto-Discovery — Search across Semantic Scholar & arXiv")

	with gr.Row():
	search_query = gr.Textbox(
	label="Search Query",
	placeholder="e.g., 'diffusion models', 'Grok reasoning', 'transformer attention'...",
	show_label=False,
	container=False,
	scale=4,
	lines=1,
	max_lines=1
	)
	search_btn = gr.Button("🔎 Search", variant="primary", scale=1)

	search_status = gr.Markdown("", visible=True)

	# Container for search results (always visible)
	with gr.Column(visible=True) as search_results_container:
	search_results = gr.Radio(
	label="📋 Select a Paper",
	choices=[],
	interactive=True,
	show_label=True,
	)

	use_selected_btn = gr.Button(
	"✅ Use Selected Paper",
	variant="primary",
	size="lg"
	)

	# Hidden state to store selected PDF URL from search
	selected_pdf_url = gr.State(value=None)
	selected_search_query = gr.State(value=None)

	# Wire search functionality
	def handle_search(query):
	"""Handle search button click"""
	if not query or not query.strip():
	return (
	gr.update(choices=[], value=None),
	"⚠️ Please enter a search query",
	query
	)

	try:
	# Search using PAD
	results = search_papers(query.strip(), max_results=5)

	if not results:
	return (
	gr.update(choices=[], value=None),
	"❌ No papers found. Try a different query.",
	query
	)

	# Format results for Radio display
	choices = []
	for i, paper in enumerate(results, 1):
	authors_str = ", ".join(paper.authors[:2])
	if len(paper.authors) > 2:
	authors_str += " et al."

	year_str = f" ({paper.year})" if paper.year else ""
	source_emoji = "📚" if paper.source == "semantic_scholar" else "🔬"

	# Create display label
	label = f"{i}. {source_emoji} {paper.title}{year_str} \| {authors_str}"
	choices.append(label)

	first_choice = choices[0] if choices else None
	status_msg = f"✅ Found {len(results)} papers from Semantic Scholar & arXiv"
	status_msg += "\n\n➡️ Next: Select a paper from the list below, then click 'Use Selected Paper'"

	print(f"[DEBUG] handle_search - found {len(choices)} papers")
	print(f"[DEBUG] choices: {choices[:2]}...")

	return (
	gr.update(choices=choices, value=first_choice),
	status_msg,
	query
	)

	except Exception as e:
	print(f"[ERROR] Search failed: {e}")
	return (
	gr.update(choices=[], value=None),
	f"❌ Search failed: {str(e)}",
	query
	)

	search_btn.click(
	fn=handle_search,
	inputs=[search_query],
	outputs=[search_results, search_status, selected_search_query]
	)

	def handle_use_selected(selected_idx, query):
	"""Handle 'Use Selected Paper' button click"""
	pdf_url, status_msg = on_paper_select(selected_idx, query)
	# Add instruction to the status message
	if pdf_url:
	status_msg += "\n\n➡️ Next: Switch to the '🔗 URL' tab to see the paper URL, then click '🎙️ Generate Podcast'"
	return pdf_url, status_msg, pdf_url # Update url_input with PDF URL

	use_selected_btn.click(
	fn=handle_use_selected,
	inputs=[search_results, selected_search_query],
	outputs=[selected_pdf_url, search_status, url_input]
	)

	with gr.Accordion("⚙️ Advanced Options", open=False, visible=True) as advanced_accordion:
	advanced_mode = gr.Checkbox(label="Batch Mode (Multiple Papers)")

	# Warning message (only visible in batch mode)
	batch_warning = gr.Markdown(
	"""
	> ⚠️ Experimental Feature
	>
	> Batch mode is currently experimental and may not work reliably in all cases.
	> Some attempts may fail due to model limitations or processing errors.
	> If you experience issues, try processing papers individually.
	""",
	visible=False
	)

	with gr.Group(visible=False) as batch_inputs:
	multi_url_input = gr.Textbox(label="Multiple URLs (one per line)", lines=3)
	multi_pdf_upload = gr.File(label="Multiple PDFs", file_count="multiple")

	gr.Markdown("---")
	gr.Markdown("### 📊 Context Settings")

	# Context limit slider (only visible in batch mode)
	context_limit_slider = gr.Slider(
	minimum=50000,
	maximum=500000,
	value=80000,
	step=10000,
	label="Max Context Limit (characters)",
	info="⚠️ Warning: Increasing this limit will increase token costs and processing time."
	)

	def toggle_advanced(adv):
	return {
	batch_warning: gr.update(visible=adv),
	batch_inputs: gr.update(visible=adv),
	url_input: gr.update(visible=not adv),
	pdf_upload: gr.update(visible=not adv)
	}
	advanced_mode.change(toggle_advanced, advanced_mode, [batch_warning, batch_inputs, url_input, pdf_upload])

	# Hide Advanced Options when Search (PAD) tab is selected
	def on_tab_select(evt: gr.SelectData):
	"""Handle tab selection - hide batch mode for Search tab"""
	# Tab indices: 0=URL, 1=PDF Upload, 2=Search (PAD)
	is_search_tab = (evt.index == 2)
	return gr.update(visible=not is_search_tab)

	input_tabs.select(
	fn=on_tab_select,
	outputs=[advanced_accordion]
	)

	generate_btn = gr.Button(
	"🎙️ Generate Podcast",
	variant="primary",
	elem_classes="primary-btn",
	size="lg"
	)

	# Right Col: Output
	with gr.Column(scale=5, elem_classes="glass-panel"):
	gr.Markdown("### 📡 Live Feed")

	# Progress Steps
	progress_html = gr.HTML(visible=False)

	# Terminal Log
	status_output = gr.Code(
	label="System Logs",
	language="shell",
	interactive=False,
	lines=12,
	elem_classes="terminal-window"
	)

	# Audio Player
	audio_output = gr.Audio(
	label="🎧 Final Podcast",
	type="filepath",
	interactive=False,
	visible=False
	)

	# Wiring
	generate_btn.click(
	fn=validated_generate_agent,
	inputs=[
	url_input, pdf_upload, advanced_mode, multi_url_input, multi_pdf_upload,
	user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
	user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
	user_host_voice, user_guest_voice, user_podcast_length, context_limit_slider,
	user_persona_mode
	],
	outputs=[progress_html, status_output, audio_output]
	)

	# --- Tab 2: Library ---
	with gr.Tab("📚 Library"):
	with gr.Row(elem_classes="glass-panel"):
	with gr.Column():
	refresh_btn = gr.Button("🔄 Refresh Library", size="sm", variant="secondary")
	history_table = gr.Dataframe(
	headers=["Date", "Source", "Audio Path"],
	datatype=["str", "str", "str"],
	value=get_history_data(),
	interactive=False,
	label="Recent Podcasts"
	)
	with gr.Column():
	history_player = gr.Audio(label="Playback")

	refresh_btn.click(lambda: get_history_data(), None, history_table)
	history_table.select(on_history_select, history_table, history_player)

	# --- Tab 3: Settings ---
	with gr.Tab("⚙️ Settings"):
	with gr.Row(elem_classes="glass-panel"):
	with gr.Column():
	gr.Markdown("### 🤖 Model Configuration")
	llm_choice = gr.Radio(
	["Own Inference", "OpenAI"],
	value="Own Inference",
	label="Provider"
	)

	# Own Inference
	with gr.Group(visible=True) as own_group:
	own_base = gr.Textbox(label="Base URL", placeholder="http://localhost:1234/v1")
	own_key = gr.Textbox(label="API Key", type="password")
	own_model = gr.Textbox(label="Model Name", placeholder="llama-3.1-8b")

	# OpenAI
	with gr.Group(visible=False) as openai_group:
	openai_key = gr.Textbox(label="OpenAI Key", type="password")
	openai_model = gr.Textbox(label="Model", value="gpt-4o-mini")

	def toggle_llm(choice):
	return [
	gr.update(visible=choice=="Own Inference"), # own_group
	gr.update(visible=choice=="OpenAI") # openai_group
	]
	llm_choice.change(toggle_llm, llm_choice, [own_group, openai_group])

	with gr.Column():
	gr.Markdown("### 🗣️ Voice Settings")

	tts_choice = gr.Radio(
	["ElevenLabs", "Supertonic (CPU)"],
	value="ElevenLabs",
	label="TTS Provider",
	info="Supertonic runs on CPU (no API key required, but may be slower than cloud-based TTS)"
	)

	# ElevenLabs Settings
	with gr.Group(visible=True) as elevenlabs_group:
	eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")
	host_voice_eleven = gr.Dropdown(
	choices=list(ELEVENLABS_VOICES.keys()),
	value="Antoni (Male - Well-rounded)",
	label="Host Voice"
	)
	guest_voice_eleven = gr.Dropdown(
	choices=list(ELEVENLABS_VOICES.keys()),
	value="Bella (Female - Soft)",
	label="Guest Voice"
	)

	# Supertonic Settings
	with gr.Group(visible=False) as supertonic_group:
	gr.Markdown("CPU-based TTS (no API key required)\n\n⚠️ Note: CPU processing may be slower than cloud-based services")
	host_voice_supertonic = gr.Dropdown(
	choices=list(SUPERTONIC_VOICES.keys()),
	value="M1 (Male 1)",
	label="Host Voice"
	)
	guest_voice_supertonic = gr.Dropdown(
	choices=list(SUPERTONIC_VOICES.keys()),
	value="F1 (Female 1)",
	label="Guest Voice"
	)

	length_slider = gr.Dropdown(
	choices=list(PODCAST_LENGTH_PRESETS.keys()),
	value="📄 Medium (14-16 exchanges, ~5-6 min)",
	label="Podcast Length"
	)

	gr.Markdown("### 🎭 Podcast Persona Framework (PPF)")
	persona_dropdown = gr.Dropdown(
	choices=[
	"🤝 Friendly Explainer (Default)",
	"⚔️ Academic Debate",
	"🔥 Savage Roast",
	"🎓 Pedagogical",
	"🌐 Interdisciplinary Clash"
	],
	value="🤝 Friendly Explainer (Default)",
	label="Conversation Style",
	info="Choose the podcast conversation style and character personalities"
	)

	gr.Markdown("""
	Persona Descriptions:

	- 🤝 Friendly Explainer — Alex & Jamie
	Two friends casually discussing the paper. Accessible, warm, ideal for general audiences. (Default mode)

	- ⚔️ Academic Debate — Dr. Morgan & Prof. Rivera
	Dr. Morgan defends the paper, Prof. Rivera politely challenges claims and methodology.
	"This claim is strong, but Table 2's baseline seems weak..."

	- 🔥 Savage Roast — The Critic & The Defender
	The Critic brutally roasts the paper, The Defender stubbornly fights back.
	"This ablation is an absolute clown show!", "Figure 4 is just statistical noise!"
	Fun and bold approach!

	- 🎓 Pedagogical — Professor Chen & Student Sam
	Professor teaches step-by-step, Student constantly asks questions.
	Perfect for learning complex concepts from scratch.

	- 🌐 Interdisciplinary Clash — Domain Expert & The Outsider
	Domain Expert explains technical details, Outsider critiques from a completely different field perspective.
	"This neuron analogy makes zero biological sense!"
	""")

	def toggle_tts_provider(choice):
	is_elevenlabs = choice == "ElevenLabs"
	return [
	gr.update(visible=is_elevenlabs), # elevenlabs_group
	gr.update(visible=not is_elevenlabs) # supertonic_group
	]

	def update_voices_on_tts_change(choice):
	"""Update voice IDs when TTS provider changes"""
	if choice == "ElevenLabs":
	# Return ElevenLabs default voices
	return "ErXwobaYiN019PkySvjV", "EXAVITQu4vr4xnSDxMaL"
	else: # Supertonic
	# Return Supertonic default voices (M1, F1)
	return "M1", "F1"

	tts_choice.change(toggle_tts_provider, tts_choice, [elevenlabs_group, supertonic_group])
	tts_choice.change(update_voices_on_tts_change, tts_choice, [user_host_voice, user_guest_voice])

	# Bind settings to state
	llm_choice.change(lambda x: x, llm_choice, user_llm_choice)
	own_base.change(lambda x: x, own_base, user_own_base_url)
	own_key.change(lambda x: x, own_key, user_own_api_key)
	own_model.change(lambda x: x, own_model, user_own_model)
	openai_key.change(lambda x: x, openai_key, user_openai_key)
	openai_model.change(lambda x: x, openai_model, user_openai_model)

	# TTS Provider binding
	def update_tts_provider(choice):
	return "elevenlabs" if choice == "ElevenLabs" else "supertonic"
	tts_choice.change(update_tts_provider, tts_choice, user_tts_provider)

	# Voice bindings - need to handle both providers
	def update_host_voice(tts_provider, eleven_voice, super_voice):
	if tts_provider == "ElevenLabs":
	return ELEVENLABS_VOICES.get(eleven_voice, "ErXwobaYiN019PkySvjV")
	else:
	return SUPERTONIC_VOICES.get(super_voice, "M1")

	def update_guest_voice(tts_provider, eleven_voice, super_voice):
	if tts_provider == "ElevenLabs":
	return ELEVENLABS_VOICES.get(eleven_voice, "EXAVITQu4vr4xnSDxMaL")
	else:
	return SUPERTONIC_VOICES.get(super_voice, "F1")

	eleven_key.change(lambda x: x, eleven_key, user_elevenlabs_key)

	# Update voice states when either provider's voice changes
	host_voice_eleven.change(
	lambda v: ELEVENLABS_VOICES.get(v, "ErXwobaYiN019PkySvjV"),
	host_voice_eleven,
	user_host_voice
	)
	guest_voice_eleven.change(
	lambda v: ELEVENLABS_VOICES.get(v, "EXAVITQu4vr4xnSDxMaL"),
	guest_voice_eleven,
	user_guest_voice
	)
	host_voice_supertonic.change(
	lambda v: SUPERTONIC_VOICES.get(v, "M1"),
	host_voice_supertonic,
	user_host_voice
	)
	guest_voice_supertonic.change(
	lambda v: SUPERTONIC_VOICES.get(v, "F1"),
	guest_voice_supertonic,
	user_guest_voice
	)

	length_slider.change(lambda x: x, length_slider, user_podcast_length)

	# Persona binding
	def map_persona_to_key(display_name):
	"""Map UI display names to internal persona keys"""
	mapping = {
	"🤝 Friendly Explainer (Default)": "friendly_explainer",
	"⚔️ Academic Debate": "academic_debate",
	"🔥 Savage Roast": "savage_roast",
	"🎓 Pedagogical": "pedagogical",
	"🌐 Interdisciplinary Clash": "interdisciplinary_clash"
	}
	return mapping.get(display_name, "friendly_explainer")

	persona_dropdown.change(map_persona_to_key, persona_dropdown, user_persona_mode)

	# --- Tab 4: About ---
	with gr.Tab("ℹ️ About"):
	with gr.Row(elem_classes="glass-panel"):
	with gr.Column(scale=1):
	pass
	with gr.Column(scale=3):
	gr.Markdown(f"""
	<div style="text-align: center;">

	# About PaperCast

	The world's first adaptive persona-driven academic podcast platform with intelligent paper discovery.

	Transform any research paper into engaging audio conversations with your choice of style — from casual explanations to brutal critiques. Powered by our revolutionary Podcast Persona Framework (PPF), Paper Auto-Discovery (PAD) engine, MCP tools, and studio-quality TTS.

	---

	## 🚀 Revolutionary Frameworks

	### PAD — Paper Auto-Discovery Engine
	The world's first intelligent multi-source paper discovery system built specifically for podcast generation.

	Finding the right research paper shouldn't be a chore. We built PAD (Paper Auto-Discovery) from the ground up — a custom-engineered search system that goes beyond simple keyword matching.

	What makes PAD revolutionary:

	🔍 Multi-Source Intelligence — Searches across multiple academic databases simultaneously:
	- Semantic Scholar Graph API - Access to 200M+ papers with semantic understanding
	- arXiv - Latest preprints and cutting-edge research
	- Parallel execution for lightning-fast results (under 2 seconds)

	🧠 Smart Result Aggregation — Built from scratch with advanced deduplication:
	- Intelligent title matching across sources
	- Eliminates duplicates while preserving metadata quality
	- Prioritizes papers with open-access PDFs

	⚡ Seamless Integration — No copy-paste, no manual URL hunting:
	- Search directly within PaperCast interface
	- One-click paper selection
	- Automatic PDF URL extraction and validation
	- Instant transition to podcast generation

	🎯 Research-Grade Quality — Enterprise-level reliability:
	- Graceful handling of API rate limits
	- Fallback strategies when one source fails
	- Comprehensive error handling and user feedback
	- Extracts full metadata (authors, year, abstract, citations)

	Why we built PAD from scratch:

	Existing search tools are designed for reading papers, not generating podcasts. We needed:
	- Speed: Parallel API calls return results in under 2 seconds
	- Reliability: Custom retry logic and fallback strategies
	- Integration: Direct pipeline from search → PDF → podcast
	- User Experience: No context switching, no tab juggling

	Technical Innovation:
	- Custom Python engine using `ThreadPoolExecutor` for concurrent API calls
	- Smart result ranking combining relevance scores from multiple sources
	- Automatic PDF URL construction for arXiv papers
	- State-of-the-art deduplication using fuzzy title matching

	---

	### PPF — Podcast Persona Framework
	The world's first adaptive persona system for AI-generated academic podcasts.

	Every other podcast generator treats all papers the same way: bland, generic conversations that put you to sleep. We solved the one-size-fits-all problem by inventing the Podcast Persona Framework (PPF) — a groundbreaking system that adapts conversation style, character dynamics, and educational approach to your preference.

	What makes PPF revolutionary:

	🎭 5 Distinct Persona Modes — Not just voice changes, but fundamentally different conversation dynamics:
	- 🤝 Friendly Explainer — Two colleagues casually discussing research over coffee
	- ⚔️ Academic Debate — Rigorous defense vs. constructive criticism (perfect for critical analysis)
	- 🔥 Savage Roast — Brutally entertaining critique meets passionate defense (most engaging!)
	- 🎓 Pedagogical — Patient professor teaching eager student (best for learning complex topics)
	- 🌐 Interdisciplinary Clash — Domain expert vs. outsider perspective (reveals hidden assumptions)

	🧠 Dynamic Character Intelligence — Each persona features unique characters with distinct personalities:
	- Not generic "Host" and "Guest" — real names like Dr. Morgan, The Critic, Professor Chen
	- Characters maintain consistent perspectives throughout entire podcast
	- Authentic reactions, natural interruptions, genuine debates

	⚡ Zero Overhead — Works seamlessly with any TTS provider (ElevenLabs, Supertonic, etc.)
	- First speaker → Host voice
	- Second speaker → Guest voice
	- Automatic voice mapping regardless of character names

	🎯 Universal Compatibility — PPF is provider-agnostic:
	- Works with any LLM (OpenAI, local models, reasoning models)
	- Compatible with all TTS engines
	- No special configuration required

	Why this matters:

	Traditional podcast generators produce the same monotonous style for every paper. A groundbreaking ML paper gets the same treatment as a medical study. A complex theoretical physics paper sounds identical to an introductory survey.

	PPF changes everything. Now you choose how you want to consume research:
	- Need to learn? → Pedagogical mode
	- Want entertainment? → Savage Roast
	- Seeking critical analysis? → Academic Debate
	- Quick overview? → Friendly Explainer
	- Fresh perspective? → Interdisciplinary Clash

	Built from scratch, perfected for you. We didn't just add a "tone" parameter — we architected an entire persona system with character-aware prompts, dynamic speaker mapping, and adaptive conversation strategies.

	---

	## 🎯 How It Works

	Our intelligent agent orchestrates a dual-innovation pipeline combining PAD and PPF:

	1. 🔍 Discovery (PAD) - Search across Semantic Scholar & arXiv simultaneously, get results in <2 seconds
	2. 📥 Input - Select paper from PAD results, or use URL/PDF upload
	3. 📄 Extraction - PyMuPDF intelligently extracts paper structure
	4. 🎭 Persona Selection - Choose from 5 unique conversation modes (PPF)
	5. 🎬 Script Generation - LLM generates character-specific dialogue with distinct personalities
	6. 🗣️ Dynamic Mapping - Automatic voice assignment based on persona characters
	7. 🎤 Voice Synthesis - Studio-quality audio with ElevenLabs Turbo v2.5 or Supertonic
	8. ✅ Delivery - Listen, download, share your personalized podcast

	What makes this special: Unlike generic converters, we built two groundbreaking systems from scratch — PAD for intelligent discovery and PPF for adaptive personas.

	---

	## 🌟 Key Features

	🔍 PAD - Paper Auto-Discovery — Custom-built multi-source search engine (Semantic Scholar + arXiv) with parallel execution

	🎭 5 Revolutionary Persona Modes — First-of-its-kind adaptive conversation system (PPF)

	🧠 Dynamic Character Intelligence — Real personalities, not generic voices

	⚡ Lightning-Fast Search — Get 5 relevant papers in under 2 seconds with intelligent deduplication

	🎙️ Studio-Quality Audio — ElevenLabs Turbo v2.5 (250ms latency, cinematic quality)

	🔧 Universal Compatibility — Works with any LLM (OpenAI, local models, reasoning models)

	📚 Complete History — All podcasts saved locally with metadata

	🔄 Multi-Paper Support — Batch process multiple papers into comprehensive discussions

	🎯 Provider Agnostic — Bring your own API keys, use local models, total flexibility

	🚀 Zero Friction Workflow — From search query to podcast in 60 seconds

	---

	## 🔧 Technology Stack

	Core Innovations:
	- PAD (Paper Auto-Discovery) — Custom multi-source search engine built from scratch
	- PPF (Podcast Persona Framework) — Proprietary adaptive conversation system

	LLM: Universal support (OpenAI GPT-4o/o1, local LLMs, reasoning models)

	TTS: ElevenLabs Turbo v2.5 (premium) or Supertonic (free CPU-based)

	PDF Processing: PyMuPDF for fast, accurate text extraction

	UI Framework: Gradio 6 with custom glass-morphism design

	Agent Architecture: Custom Python orchestrator with MCP tools

	---

	## 🎓 Built For

	MCP 1st Birthday Hackathon - Track 2: MCP in Action (Consumer)

	Tag: `mcp-in-action-track-consumer`

	What we're showcasing:
	- 🔍 PAD Innovation - First-ever custom multi-source paper discovery engine built for podcast generation
	- 🎭 PPF Innovation - First-ever adaptive persona system for academic podcasts
	- 🤖 Autonomous Agent - Intelligent planning, reasoning, and persona-aware execution
	- 🔧 MCP Integration - Tools as cognitive extensions for the agent
	- 🎨 Gradio 6 UX - Glass-morphism design with intuitive search & persona controls
	- 🚀 Real Impact - Making research accessible and engaging for everyone

	Why PAD + PPF matter for this hackathon: We didn't just build a tool — we invented two new paradigms. PAD solves the discovery problem (finding papers), PPF solves the consumption problem (understanding papers). Together, they create a zero-friction pipeline from curiosity to knowledge.

	---

	## 📝 About the Agent

	PaperCast's discovery-aware, persona-driven autonomous agent makes intelligent decisions at every step:

	- 🔍 Discovery Intelligence - Orchestrates parallel API calls to multiple paper sources, ranks and deduplicates results
	- 🧠 Persona Analysis - Evaluates paper complexity and matches optimal persona mode
	- 📋 Strategic Planning - Determines conversation flow based on selected persona (debate-style vs. teaching-style)
	- 🎭 Character Orchestration - Generates distinct personalities for each persona (Dr. Morgan ≠ The Critic ≠ Professor Chen)
	- 💬 Adaptive Dialogue - Adjusts technical depth, humor level, and interaction style per persona
	- 🗣️ Dynamic Synthesis - Maps persona characters to voice IDs automatically
	- 🔄 Multi-Paper Intelligence - Synthesizes insights across papers while maintaining persona consistency

	The key insight: The agent doesn't just process papers — it discovers and performs them. PAD finds the perfect paper, PPF delivers it in your perfect style.

	---

	## 💡 Use Cases

	### 🎧 Learning & Education
	- PAD Search → Find "transformer attention mechanisms" → Get 5 papers instantly
	- Pedagogical mode for complex topics you want to master
	- Friendly Explainer for quick overviews during commutes
	- Interdisciplinary Clash to understand papers outside your field

	### 🔬 Research & Analysis
	- PAD Search → Discover latest papers on your research topic
	- Academic Debate for critical evaluation of methodologies
	- Savage Roast to identify weak points and overstated claims
	- Quick paper screening before deep reading (60 seconds from search to audio)

	### 🌍 Accessibility
	- Zero barrier to entry — No URLs, no downloads, just search and listen
	- Make cutting-edge research understandable for non-experts
	- Bridge knowledge gaps between disciplines
	- Learn through conversation, not dry text

	### 🎭 Entertainment
	- PAD + Savage Roast combo — Find trending papers and roast them
	- Host paper "debate clubs" with Academic Debate mode
	- Share entertaining takes on research with Savage Roast clips

	---

	## 🏆 What Makes Us Different

	🔍 We built PAD from scratch — First custom multi-source academic search engine designed for podcast generation. Parallel API orchestration, smart deduplication, zero-friction UX.

	🎭 We invented PPF — The Podcast Persona Framework is a world-first innovation. No other platform offers adaptive conversation personas.

	⚡ End-to-end innovation — Most tools stop at URL → podcast. We solved discovery + consumption with two custom-built systems.

	🧠 Real characters, not voices — Other tools change tone. We create distinct personalities with names, perspectives, and consistent behavior.

	🚀 60-second pipeline — From search query ("diffusion models") to finished podcast in under a minute. No other platform comes close.

	🔧 Built for flexibility — Provider-agnostic design works with any LLM, any TTS, any infrastructure.

	🎯 User empowerment — You choose what to listen to (PAD) and how to listen (PPF). Complete control over discovery and consumption.

	The bottom line: Every other podcast generator is a one-trick pony. PaperCast is a research discovery platform + repertory theater company — we find papers you love and perform them your way.

	---

	## 🙏 Special Thanks

	This project was made possible by the incredible support from:

	<div style="display: flex; justify-content: center; align-items: center; gap: 80px; margin: 50px 0; flex-wrap: wrap;">
	<div style="text-align: center;">
	<a href="https://modal.com" target="_blank">
	<img src="https://images.prismic.io/contrary-research/aDnorSdWJ-7kSv6V_ModalLabs_Cover.png?auto=format,compress" alt="Modal" style="height: 140px; width: auto; display: block; margin: 0 auto;">
	</a>
	</div>
	<div style="text-align: center;">
	<a href="https://elevenlabs.io" target="_blank">
	<img src="https://eleven-public-cdn.elevenlabs.io/payloadcms/9trrmnj2sj8-logo-logo.svg" alt="ElevenLabs" style="height: 100px; width: auto; display: block; margin: 0 auto;">
	</a>
	</div>
	</div>

	Why we chose these partners:

	🚀 Modal - Serverless AI infrastructure that gives us instant access to powerful GPUs (A100, H100) with sub-second cold starts. Their platform handles automatic scaling, letting us process papers efficiently without managing infrastructure. Perfect for variable workloads and rapid iteration.

	🎙️ ElevenLabs - We use their Turbo v2.5 model for studio-quality voice synthesis. This model delivers incredibly natural, emotionally expressive voices with low latency (~250-300ms) and 50% lower cost. The voice quality makes our podcasts truly engaging and professional.

	---

	Made with ❤️ using Anthropic, OpenAI, Modal, ElevenLabs, Gradio, and MCP

	</div>
	""")
	with gr.Column(scale=1):
	pass

	demo.launch(
	theme=theme,
	css=CUSTOM_CSS,
	mcp_server=True # Enable MCP support
	)

	# Script entry point: call main() only when this file is executed directly,
	# so importing the module does not trigger it.
	if __name__ == "__main__":
	main()