# papercast / app.py
# batuhanozkose
# feat: Add Paper Auto-Discovery (PAD) engine and update documentation
# 39bbc0e
import os
from datetime import datetime
import gradio as gr
from agents.podcast_agent import PodcastAgent
from synthesis.tts_engine import ELEVENLABS_VOICES
from synthesis.supertonic_tts import SUPERTONIC_VOICES
from utils.config import (
OUTPUT_DIR,
SCRIPT_GENERATION_MODEL,
)
from utils.history import get_history_items, load_history
from processing.paper_discovery import search_papers, PaperDiscoveryEngine
# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
# --- Configuration & Constants ---
# Maps each "Podcast Length" label to a (target_exchanges, max_tokens) pair;
# run_agent unpacks the pair in that order via get_podcast_length_params.
# NOTE(review): the emoji prefixes in these labels look mis-encoded. They are
# left untouched here because the labels are lookup keys and the dropdown
# default in main() must match one of them exactly — fix both together.
PODCAST_LENGTH_PRESETS = {
    "⚑ Very Short (6-8 exchanges, ~2-3 min)": (7, 2000),
    "πŸ“ Short (10-12 exchanges, ~3-4 min)": (11, 3000),
    "πŸ“„ Medium (14-16 exchanges, ~5-6 min)": (15, 4000),
    "πŸ“š Medium-Long (18-20 exchanges, ~7-8 min)": (19, 5000),
    "πŸ“– Long (22-25 exchanges, ~9-11 min)": (23, 6000),
    "πŸ“• Very Long (28-32 exchanges, ~12-15 min)": (30, 8000),
}
# Stylesheet for the PaperCast UI: dark background, glass-style panels,
# gradient hero title, progress-step indicator and a terminal-like log window.
# The class names match the elem_classes / HTML used elsewhere in this file
# (hero-container, glass-panel, primary-btn, step-*, terminal-window).
# NOTE(review): .terminal-window asks for 'JetBrains Mono', but the @import
# below only loads Outfit and Inter, so browsers fall back to `monospace`
# unless the font is installed locally.
# NOTE(review): nothing in the visible part of this file passes CUSTOM_CSS to
# Gradio — confirm it is wired in (e.g. where the app is launched).
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;800&family=Inter:wght@300;400;500;600&display=swap');
:root {
--primary-gradient: linear-gradient(135deg, #6366f1 0%, #a855f7 50%, #ec4899 100%);
--glass-bg: rgba(17, 24, 39, 0.7);
--glass-border: rgba(255, 255, 255, 0.1);
}
body, .gradio-container {
font-family: 'Inter', sans-serif !important;
background-color: #0f172a !important; /* Dark slate background */
}
h1, h2, h3, h4, h5, h6 {
font-family: 'Outfit', sans-serif !important;
}
/* Hero Section */
.hero-container {
text-align: center;
padding: 40px 20px;
margin-bottom: 20px;
position: relative;
overflow: hidden;
}
.hero-title {
font-size: 4rem !important;
font-weight: 800 !important;
margin-bottom: 10px;
letter-spacing: -0.02em;
color: white;
}
.hero-title span {
background: linear-gradient(135deg, #6366f1 0%, #a855f7 50%, #ec4899 100%);
-webkit-background-clip: text;
background-clip: text;
-webkit-text-fill-color: transparent;
color: #a855f7; /* Fallback */
}
.hero-subtitle {
font-size: 1.2rem;
color: #94a3b8;
max-width: 600px;
margin: 0 auto;
line-height: 1.6;
}
/* Cards & Containers */
.glass-panel {
background: var(--glass-bg) !important;
backdrop-filter: blur(12px);
border: 1px solid var(--glass-border) !important;
border-radius: 16px !important;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
padding: 20px;
}
/* Buttons */
.primary-btn {
background: var(--primary-gradient) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
transition: all 0.3s ease !important;
box-shadow: 0 10px 15px -3px rgba(168, 85, 247, 0.4) !important;
}
.primary-btn:hover {
transform: translateY(-2px);
box-shadow: 0 20px 25px -5px rgba(168, 85, 247, 0.5) !important;
}
/* Inputs */
input, textarea, select {
background-color: rgba(30, 41, 59, 0.8) !important;
border: 1px solid rgba(71, 85, 105, 0.5) !important;
color: #e2e8f0 !important;
}
/* Progress Steps */
.step-container {
display: flex;
justify-content: space-between;
margin-bottom: 20px;
position: relative;
}
.step-line {
position: absolute;
top: 15px;
left: 0;
right: 0;
height: 2px;
background: #334155;
z-index: 0;
}
.step-item {
position: relative;
z-index: 1;
display: flex;
flex-direction: column;
align-items: center;
width: 25%;
}
.step-circle {
width: 32px;
height: 32px;
border-radius: 50%;
background: #1e293b;
border: 2px solid #475569;
display: flex;
align-items: center;
justify-content: center;
font-weight: bold;
color: #94a3b8;
transition: all 0.3s ease;
margin-bottom: 8px;
}
.step-item.active .step-circle {
background: #a855f7;
border-color: #a855f7;
color: white;
box-shadow: 0 0 15px rgba(168, 85, 247, 0.5);
}
.step-item.completed .step-circle {
background: #10b981;
border-color: #10b981;
color: white;
}
.step-label {
font-size: 0.8rem;
color: #64748b;
font-weight: 500;
}
.step-item.active .step-label {
color: #e2e8f0;
}
/* Terminal Output */
.terminal-window {
background: #0f172a !important;
border: 1px solid #334155 !important;
border-radius: 8px !important;
font-family: 'JetBrains Mono', monospace !important;
color: #22c55e !important;
padding: 15px !important;
}
"""
# --- Helper Functions ---
def get_podcast_length_params(length_choice):
    """Look up the (target_exchanges, max_tokens) pair for a length label.

    Any value that is not a key of PODCAST_LENGTH_PRESETS resolves to the
    fallback pair (15, 4000).
    """
    fallback = (15, 4000)
    return PODCAST_LENGTH_PRESETS.get(length_choice, fallback)
def validate_settings_for_generation(llm_choice, own_base_url, own_api_key, openai_key, tts_provider, elevenlabs_key):
    """Check that the selected LLM and TTS providers are configured.

    Args:
        llm_choice: Provider label. "Own Inference" and "OpenAI" are checked;
            any other value skips the LLM checks.
        own_base_url: Base URL for "Own Inference". Must be non-empty and
            start with http:// or https://.
        own_api_key: Accepted for signature compatibility; not validated.
        openai_key: Key for "OpenAI". Must be non-empty and start with "sk-".
        tts_provider: Only "elevenlabs" triggers a key check; other providers
            are accepted without a key.
        elevenlabs_key: Must be non-empty and start with "sk_" when checked.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is "" when valid,
        otherwise the Markdown error lines joined with newlines.
    """
    errors = []

    if llm_choice == "Own Inference":
        if not own_base_url:
            errors.append("❌ **Own Inference**: Base URL is required")
        elif not own_base_url.startswith(("http://", "https://")):
            errors.append("❌ **Own Inference**: Base URL must start with http:// or https://")
    elif llm_choice == "OpenAI":
        if not openai_key:
            errors.append("❌ **OpenAI**: API key is required")
        elif not openai_key.startswith("sk-"):
            errors.append("❌ **OpenAI**: API key must start with 'sk-'")

    # Only ElevenLabs needs an API key; the CPU-based Supertonic engine does not.
    if tts_provider == "elevenlabs":
        if not elevenlabs_key:
            errors.append("❌ **ElevenLabs TTS**: API key is required")
        elif not elevenlabs_key.startswith("sk_"):
            errors.append("❌ **ElevenLabs TTS**: API key must start with 'sk_'")

    if errors:
        return False, "\n".join(errors)
    return True, ""
def get_stats():
    """Return a one-line summary with the number of entries in the history."""
    history = load_history()
    return f"🚀 Total Podcasts: {len(history)}"
def generate_progress_html(current_step):
    """Render the four pipeline steps as an HTML progress indicator.

    Args:
        current_step: 1-based number of the step in progress. Steps below it
            are marked "completed" (shown with a check mark), the matching
            step is marked "active", and later steps carry no status class.
            0 therefore marks nothing and 5 marks every step as completed.

    Returns:
        An HTML string using the step-* classes styled in CUSTOM_CSS.
    """
    steps = ["Fetch", "Extract", "Script", "Audio"]
    parts = ['<div class="step-container"><div class="step-line"></div>']
    for step_num, name in enumerate(steps, start=1):
        if step_num < current_step:
            status_class, icon = "completed", "✓"
        elif step_num == current_step:
            status_class, icon = "active", str(step_num)
        else:
            status_class, icon = "", str(step_num)
        parts.append(
            f'\n<div class="step-item {status_class}">\n'
            f'<div class="step-circle">{icon}</div>\n'
            f'<div class="step-label">{name}</div>\n'
            '</div>\n'
        )
    parts.append('</div>')
    return "".join(parts)
# (step number, progress description, substrings that identify the step in a log line).
# Checked in order; the first entry with a matching substring wins.
_PIPELINE_STEP_MARKERS = (
    (1, "Fetching Paper...", ("fetch_paper", "downloaded")),
    (2, "Extracting Text...", ("Extracted", "read_pdf")),
    (3, "Generating Script...", ("generate_script", "Generated script")),
    (4, "Synthesizing Audio...", ("synthesize_podcast", "Synthesizing")),
)


def _detect_pipeline_step(log_entry):
    """Return (step, description) for a log line, or None if no marker matches."""
    for step, description, markers in _PIPELINE_STEP_MARKERS:
        if any(marker in log_entry for marker in markers):
            return step, description
    return None


def validated_generate_agent(
    url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
    user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
    user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
    user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
    user_persona_mode,
    progress=gr.Progress()
):
    """Validate the settings, then stream podcast generation updates to the UI.

    Yields:
        3-tuples of (progress HTML update, accumulated log text, audio update).
        The audio component stays hidden until the agent emits its final
        ``(audio_path, final_logs)`` tuple.

    Raises:
        gr.Error: If the settings are invalid, or wrapping any exception
            raised while the agent runs (the original is kept as the cause).
    """
    is_valid, error_message = validate_settings_for_generation(
        user_llm_choice, user_own_base_url, user_own_api_key,
        user_openai_key, user_tts_provider, user_elevenlabs_key
    )
    if not is_valid:
        raise gr.Error(error_message)

    # Reveal the progress container before any work starts.
    yield gr.update(visible=True, value=generate_progress_html(0)), "🚀 Initializing...", gr.update(visible=False)

    try:
        iterator = run_agent(
            url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
            user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
            user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
            user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
            user_persona_mode, progress
        )
        logs_history = ""
        current_step = 0
        for item in iterator:
            if isinstance(item, tuple):
                # Final result: persist the transcript and show the player.
                audio_path, final_logs = item
                generate_transcript(audio_path, final_logs)
                progress(1.0, desc="Done!")
                # Step 5 is past the last step, so all four render as completed.
                yield (
                    gr.update(value=generate_progress_html(5)),
                    final_logs + "\n\n✨ DONE!",
                    gr.update(value=audio_path, visible=True),
                )
                continue

            # Plain log line: append it and see whether it marks a new step.
            logs_history += item + "\n"
            detected = _detect_pipeline_step(item)
            if detected is not None and detected[0] != current_step:
                current_step, step_desc = detected
                # Steps 1-4 map to progress values 0.2-0.8.
                progress(0.2 * current_step, desc=step_desc)
                yield gr.update(value=generate_progress_html(current_step)), logs_history, gr.update(visible=False)
            else:
                # Same step as before: refresh the logs only.
                yield gr.update(), logs_history, gr.update(visible=False)
    except Exception as e:
        raise gr.Error(f"System Error: {e}") from e
def run_agent(
    url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
    user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
    user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
    user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
    user_persona_mode,
    progress=gr.Progress()
):
    """Build a PodcastAgent from the user's settings and stream its output.

    In single mode exactly one paper (``url`` or ``pdf_file``) is processed.
    In advanced (batch) mode ``multi_urls`` (one URL per line) and
    ``multi_pdfs`` are processed together. ``progress`` is accepted for
    signature compatibility and is not used here.

    Yields:
        Status strings from this function, followed by whatever
        ``agent.process`` / ``agent.process_multiple`` yield.

    Raises:
        ValueError: If no input was provided for the selected mode.
    """
    # Anything other than "Own Inference" is treated as OpenAI.
    provider_mode = "own_inference" if user_llm_choice == "Own Inference" else "openai"
    target_exchanges, max_tokens = get_podcast_length_params(user_podcast_length)

    # Empty UI values become None (or the default) before reaching the agent.
    agent = PodcastAgent(
        provider_mode=provider_mode,
        own_base_url=user_own_base_url or None,
        own_api_key=user_own_api_key or None,
        own_model=user_own_model or None,
        openai_key=user_openai_key or None,
        openai_model=user_openai_model or None,
        tts_provider=user_tts_provider or "elevenlabs",
        elevenlabs_key=user_elevenlabs_key or None,
        host_voice=user_host_voice or None,
        guest_voice=user_guest_voice or None,
        max_tokens=max_tokens,
        target_dialogue_count=target_exchanges,
        context_limit=user_context_limit,
        persona_mode=user_persona_mode or "friendly_explainer",
    )
    yield f"Starting Agent... [Mode: {provider_mode}]"

    if not advanced_mode:
        if not url and not pdf_file:
            raise ValueError("Please provide a URL or PDF")
        yield from agent.process(url=url or None, pdf_file=pdf_file)
        return

    # Batch mode: one URL per non-blank line.
    urls = None
    if multi_urls and multi_urls.strip():
        urls = [u.strip() for u in multi_urls.strip().split("\n") if u.strip()]

    # A single upload may arrive as a bare value rather than a list.
    pdfs = None
    if multi_pdfs:
        pdfs = multi_pdfs if isinstance(multi_pdfs, list) else [multi_pdfs]

    if not urls and not pdfs:
        raise ValueError("No input provided for advanced mode")

    url_count = len(urls) if urls else 0
    pdf_count = len(pdfs) if pdfs else 0
    total = url_count + pdf_count
    yield f"Processing {total} items ({url_count} URLs + {pdf_count} PDFs)..."
    yield from agent.process_multiple(urls=urls, pdf_files=pdfs)
def generate_transcript(audio_path, logs):
    """Write the run logs to ``<audio basename>_transcript.txt`` in OUTPUT_DIR.

    Args:
        audio_path: Path of the generated audio file; only its base name is
            used. A falsy value skips writing.
        logs: Text written below the transcript header.

    Returns:
        The transcript path, or None when ``audio_path`` is falsy.
    """
    if not audio_path:
        return None
    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    transcript_path = os.path.join(OUTPUT_DIR, f"{base_name}_transcript.txt")
    # Explicit UTF-8: logs can contain emoji, which the platform default
    # encoding (e.g. cp1252 on Windows) cannot always encode.
    with open(transcript_path, "w", encoding="utf-8") as f:
        f.write(f"PAPERCAST TRANSCRIPT - {datetime.now()}\n{'='*30}\n\n{logs}")
    return transcript_path
def get_history_data():
    """Build table rows of [timestamp, source, audio path] from the history.

    Missing timestamps become "N/A", a missing or empty URL becomes
    "PDF Upload", and a missing audio path becomes "". Returns an empty list
    when there are no history items.
    """
    rows = []
    for entry in get_history_items() or []:
        source = entry.get("url", "PDF Upload") or "PDF Upload"
        rows.append([
            entry.get("timestamp", "N/A"),
            source,
            entry.get("audio_path", ""),
        ])
    return rows
def on_history_select(evt: gr.SelectData, data):
    """Return the audio path of the row selected in the history table.

    Args:
        evt: Selection event; ``evt.index[0]`` is used as the row position.
        data: Table contents, accessed positionally through ``.iloc``.

    Returns:
        The value in the third column (audio path) of the selected row, or
        None if it cannot be read.
    """
    try:
        return data.iloc[evt.index[0]].iloc[2]  # Audio path is column 2
    except Exception:
        # Best effort: a stale or malformed selection just clears the player.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return None
def perform_paper_search(query: str, progress=gr.Progress()):
    """
    PAD: Search for papers using Paper Auto-Discovery.

    Args:
        query: Free-text search query; blank input is rejected.
        progress: Progress callback, called with a fraction and a ``desc`` label.

    Returns:
        A ``(choices_update, status_message)`` tuple. On success the update
        carries one numbered label per paper, with the first one preselected;
        on empty input, no results or failure it clears and hides the choices.
    """
    if not query or not query.strip():
        return gr.update(choices=[], value=None, visible=False), "⚠️ Please enter a search query"

    progress(0.2, desc="Searching Semantic Scholar & arXiv...")
    try:
        results = search_papers(query.strip(), max_results=5)
        if not results:
            return gr.update(choices=[], value=None, visible=False), "❌ No papers found. Try a different query."

        progress(0.8, desc=f"Found {len(results)} papers")

        # Labels start with "<n>." so the selection handler can recover the index.
        choices = []
        for i, paper in enumerate(results, 1):
            authors_str = ", ".join(paper.authors[:2])
            if len(paper.authors) > 2:
                authors_str += " et al."
            year_str = f" ({paper.year})" if paper.year else ""
            source_emoji = "📚" if paper.source == "semantic_scholar" else "🔬"
            choices.append(f"{i}. {source_emoji} {paper.title}{year_str} | {authors_str}")

        progress(1.0, desc="Search complete!")
        print(f"[DEBUG] Search found {len(results)} papers")
        print(f"[DEBUG] Choices created: {len(choices)}")
        print(f"[DEBUG] First choice: {choices[0] if choices else 'NONE'}")

        success_msg = f"✅ Found {len(results)} papers from Semantic Scholar & arXiv"
        # Preselect the first option so the component is immediately usable.
        first_choice = choices[0] if choices else None
        return gr.update(choices=choices, value=first_choice, visible=True, interactive=True), success_msg
    except Exception as e:
        return gr.update(choices=[], value=None, visible=False), f"❌ Search failed: {e}"
def on_paper_select(selected_label, query):
    """
    Handle paper selection from search results.

    Args:
        selected_label: Chosen label, expected to start with "<n>." where n
            is the 1-based position in the search results.
        query: The query that produced the results. The search is re-run
            with it to recover the paper object for the chosen position.

    Returns:
        A ``(pdf_url, status_message)`` tuple. ``pdf_url`` is None when
        nothing is selected, the query is missing, the position is out of
        range, no PDF is available, or any step fails.
    """
    if not selected_label:
        return None, "⚠️ Please select a paper from the search results"

    try:
        # Without a query the search cannot be re-run; say so explicitly
        # instead of surfacing an AttributeError message.
        if not query or not query.strip():
            return None, "⚠️ Please run a search before selecting a paper"

        # Extract index from label (format: "1. emoji title...")
        selected_index = int(selected_label.split(".")[0]) - 1

        # Re-run the search: only the label string reaches this handler.
        results = search_papers(query.strip(), max_results=5)
        if not results or not 0 <= selected_index < len(results):
            return None, "❌ Invalid selection"
        selected_paper = results[selected_index]

        engine = PaperDiscoveryEngine()
        pdf_url = engine.get_pdf_url(selected_paper)
        if not pdf_url:
            return None, f"❌ No PDF available for: {selected_paper.title}"

        authors_str = ", ".join(selected_paper.authors[:3])
        if len(selected_paper.authors) > 3:
            authors_str += " et al."
        success_msg = f"✅ Selected: **{selected_paper.title}**\n\n👥 {authors_str}\n📅 {selected_paper.year or 'N/A'}\n🔗 {pdf_url}"
        return pdf_url, success_msg
    except Exception as e:
        return None, f"❌ Selection failed: {e}"
# --- Main UI ---
def main():
# Use a dark theme base but override heavily with CSS
theme = gr.themes.Soft(
primary_hue="violet",
secondary_hue="slate",
neutral_hue="slate",
font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"],
).set(
body_background_fill="#0f172a",
block_background_fill="#1e293b",
block_border_width="1px",
block_border_color="rgba(255,255,255,0.1)",
)
with gr.Blocks(title="PaperCast") as demo:
# Session State
user_llm_choice = gr.State(value="Own Inference")
user_own_base_url = gr.State(value="")
user_own_api_key = gr.State(value="")
user_own_model = gr.State(value="")
user_openai_key = gr.State(value="")
user_openai_model = gr.State(value="")
user_tts_provider = gr.State(value="elevenlabs")
user_elevenlabs_key = gr.State(value="")
user_host_voice = gr.State(value="ErXwobaYiN019PkySvjV") # ElevenLabs default
user_guest_voice = gr.State(value="EXAVITQu4vr4xnSDxMaL") # ElevenLabs default
user_podcast_length = gr.State(value=4096)
user_persona_mode = gr.State(value="friendly_explainer") # PPF default
# Hero Section
with gr.Row(elem_classes="hero-container"):
gr.HTML("""
<h1 class="hero-title"><span>PaperCast</span> πŸŽ™οΈ</h1>
<p class="hero-subtitle">
Experience the future of knowledge consumption. <br>
An autonomous agentic system that transforms complex research papers into engaging, studio-quality audio experiences.
</p>
""")
with gr.Tabs():
# --- Tab 1: Create ---
with gr.Tab("✨ Create Podcast"):
with gr.Row():
# Left Col: Inputs
with gr.Column(scale=4, elem_classes="glass-panel"):
gr.Markdown("### πŸ“₯ Source Material")
with gr.Tabs(selected=0) as input_tabs:
with gr.Tab("πŸ”— URL", id=0):
url_input = gr.Textbox(
label="Paper URL",
placeholder="https://arxiv.org/abs/...",
show_label=False,
container=False
)
with gr.Tab("πŸ“„ PDF Upload"):
pdf_upload = gr.File(
label="Upload PDF",
file_types=[".pdf"],
container=False
)
with gr.Tab("πŸ” Search (PAD)"):
gr.Markdown("**Paper Auto-Discovery** β€” Search across Semantic Scholar & arXiv")
with gr.Row():
search_query = gr.Textbox(
label="Search Query",
placeholder="e.g., 'diffusion models', 'Grok reasoning', 'transformer attention'...",
show_label=False,
container=False,
scale=4,
lines=1,
max_lines=1
)
search_btn = gr.Button("πŸ”Ž Search", variant="primary", scale=1)
search_status = gr.Markdown("", visible=True)
# Container for search results (always visible)
with gr.Column(visible=True) as search_results_container:
search_results = gr.Radio(
label="πŸ“‹ Select a Paper",
choices=[],
interactive=True,
show_label=True,
)
use_selected_btn = gr.Button(
"βœ… Use Selected Paper",
variant="primary",
size="lg"
)
# Hidden state to store selected PDF URL from search
selected_pdf_url = gr.State(value=None)
selected_search_query = gr.State(value=None)
# Wire search functionality
def handle_search(query):
"""Handle search button click"""
if not query or not query.strip():
return (
gr.update(choices=[], value=None),
"⚠️ Please enter a search query",
query
)
try:
# Search using PAD
results = search_papers(query.strip(), max_results=5)
if not results:
return (
gr.update(choices=[], value=None),
"❌ No papers found. Try a different query.",
query
)
# Format results for Radio display
choices = []
for i, paper in enumerate(results, 1):
authors_str = ", ".join(paper.authors[:2])
if len(paper.authors) > 2:
authors_str += " et al."
year_str = f" ({paper.year})" if paper.year else ""
source_emoji = "πŸ“š" if paper.source == "semantic_scholar" else "πŸ”¬"
# Create display label
label = f"{i}. {source_emoji} {paper.title}{year_str} | {authors_str}"
choices.append(label)
first_choice = choices[0] if choices else None
status_msg = f"βœ… Found {len(results)} papers from Semantic Scholar & arXiv"
status_msg += "\n\n**➑️ Next:** Select a paper from the list below, then click 'Use Selected Paper'"
print(f"[DEBUG] handle_search - found {len(choices)} papers")
print(f"[DEBUG] choices: {choices[:2]}...")
return (
gr.update(choices=choices, value=first_choice),
status_msg,
query
)
except Exception as e:
print(f"[ERROR] Search failed: {e}")
return (
gr.update(choices=[], value=None),
f"❌ Search failed: {str(e)}",
query
)
search_btn.click(
fn=handle_search,
inputs=[search_query],
outputs=[search_results, search_status, selected_search_query]
)
def handle_use_selected(selected_idx, query):
"""Handle 'Use Selected Paper' button click"""
pdf_url, status_msg = on_paper_select(selected_idx, query)
# Add instruction to the status message
if pdf_url:
status_msg += "\n\n➑️ **Next:** Switch to the 'πŸ”— URL' tab to see the paper URL, then click 'πŸŽ™οΈ Generate Podcast'"
return pdf_url, status_msg, pdf_url # Update url_input with PDF URL
use_selected_btn.click(
fn=handle_use_selected,
inputs=[search_results, selected_search_query],
outputs=[selected_pdf_url, search_status, url_input]
)
with gr.Accordion("βš™οΈ Advanced Options", open=False, visible=True) as advanced_accordion:
advanced_mode = gr.Checkbox(label="Batch Mode (Multiple Papers)")
# Warning message (only visible in batch mode)
batch_warning = gr.Markdown(
"""
> **⚠️ Experimental Feature**
>
> Batch mode is currently experimental and may not work reliably in all cases.
> Some attempts may fail due to model limitations or processing errors.
> If you experience issues, try processing papers individually.
""",
visible=False
)
with gr.Group(visible=False) as batch_inputs:
multi_url_input = gr.Textbox(label="Multiple URLs (one per line)", lines=3)
multi_pdf_upload = gr.File(label="Multiple PDFs", file_count="multiple")
gr.Markdown("---")
gr.Markdown("### πŸ“Š Context Settings")
# Context limit slider (only visible in batch mode)
context_limit_slider = gr.Slider(
minimum=50000,
maximum=500000,
value=80000,
step=10000,
label="Max Context Limit (characters)",
info="⚠️ Warning: Increasing this limit will increase token costs and processing time."
)
def toggle_advanced(adv):
return {
batch_warning: gr.update(visible=adv),
batch_inputs: gr.update(visible=adv),
url_input: gr.update(visible=not adv),
pdf_upload: gr.update(visible=not adv)
}
advanced_mode.change(toggle_advanced, advanced_mode, [batch_warning, batch_inputs, url_input, pdf_upload])
# Hide Advanced Options when Search (PAD) tab is selected
def on_tab_select(evt: gr.SelectData):
"""Handle tab selection - hide batch mode for Search tab"""
# Tab indices: 0=URL, 1=PDF Upload, 2=Search (PAD)
is_search_tab = (evt.index == 2)
return gr.update(visible=not is_search_tab)
input_tabs.select(
fn=on_tab_select,
outputs=[advanced_accordion]
)
generate_btn = gr.Button(
"πŸŽ™οΈ Generate Podcast",
variant="primary",
elem_classes="primary-btn",
size="lg"
)
# Right Col: Output
with gr.Column(scale=5, elem_classes="glass-panel"):
gr.Markdown("### πŸ“‘ Live Feed")
# Progress Steps
progress_html = gr.HTML(visible=False)
# Terminal Log
status_output = gr.Code(
label="System Logs",
language="shell",
interactive=False,
lines=12,
elem_classes="terminal-window"
)
# Audio Player
audio_output = gr.Audio(
label="🎧 Final Podcast",
type="filepath",
interactive=False,
visible=False
)
# Wiring
generate_btn.click(
fn=validated_generate_agent,
inputs=[
url_input, pdf_upload, advanced_mode, multi_url_input, multi_pdf_upload,
user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
user_host_voice, user_guest_voice, user_podcast_length, context_limit_slider,
user_persona_mode
],
outputs=[progress_html, status_output, audio_output]
)
# --- Tab 2: Library ---
with gr.Tab("πŸ“š Library"):
with gr.Row(elem_classes="glass-panel"):
with gr.Column():
refresh_btn = gr.Button("πŸ”„ Refresh Library", size="sm", variant="secondary")
history_table = gr.Dataframe(
headers=["Date", "Source", "Audio Path"],
datatype=["str", "str", "str"],
value=get_history_data(),
interactive=False,
label="Recent Podcasts"
)
with gr.Column():
history_player = gr.Audio(label="Playback")
refresh_btn.click(lambda: get_history_data(), None, history_table)
history_table.select(on_history_select, history_table, history_player)
# --- Tab 3: Settings ---
with gr.Tab("βš™οΈ Settings"):
with gr.Row(elem_classes="glass-panel"):
with gr.Column():
gr.Markdown("### πŸ€– Model Configuration")
llm_choice = gr.Radio(
["Own Inference", "OpenAI"],
value="Own Inference",
label="Provider"
)
# Own Inference
with gr.Group(visible=True) as own_group:
own_base = gr.Textbox(label="Base URL", placeholder="http://localhost:1234/v1")
own_key = gr.Textbox(label="API Key", type="password")
own_model = gr.Textbox(label="Model Name", placeholder="llama-3.1-8b")
# OpenAI
with gr.Group(visible=False) as openai_group:
openai_key = gr.Textbox(label="OpenAI Key", type="password")
openai_model = gr.Textbox(label="Model", value="gpt-4o-mini")
def toggle_llm(choice):
return [
gr.update(visible=choice=="Own Inference"), # own_group
gr.update(visible=choice=="OpenAI") # openai_group
]
llm_choice.change(toggle_llm, llm_choice, [own_group, openai_group])
with gr.Column():
gr.Markdown("### πŸ—£οΈ Voice Settings")
tts_choice = gr.Radio(
["ElevenLabs", "Supertonic (CPU)"],
value="ElevenLabs",
label="TTS Provider",
info="Supertonic runs on CPU (no API key required, but may be slower than cloud-based TTS)"
)
# ElevenLabs Settings
with gr.Group(visible=True) as elevenlabs_group:
eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password")
host_voice_eleven = gr.Dropdown(
choices=list(ELEVENLABS_VOICES.keys()),
value="Antoni (Male - Well-rounded)",
label="Host Voice"
)
guest_voice_eleven = gr.Dropdown(
choices=list(ELEVENLABS_VOICES.keys()),
value="Bella (Female - Soft)",
label="Guest Voice"
)
# Supertonic Settings
with gr.Group(visible=False) as supertonic_group:
gr.Markdown("**CPU-based TTS** (no API key required)\n\n⚠️ *Note: CPU processing may be slower than cloud-based services*")
host_voice_supertonic = gr.Dropdown(
choices=list(SUPERTONIC_VOICES.keys()),
value="M1 (Male 1)",
label="Host Voice"
)
guest_voice_supertonic = gr.Dropdown(
choices=list(SUPERTONIC_VOICES.keys()),
value="F1 (Female 1)",
label="Guest Voice"
)
length_slider = gr.Dropdown(
choices=list(PODCAST_LENGTH_PRESETS.keys()),
value="πŸ“„ Medium (14-16 exchanges, ~5-6 min)",
label="Podcast Length"
)
gr.Markdown("### 🎭 Podcast Persona Framework (PPF)")
persona_dropdown = gr.Dropdown(
choices=[
"🀝 Friendly Explainer (Default)",
"βš”οΈ Academic Debate",
"πŸ”₯ Savage Roast",
"πŸŽ“ Pedagogical",
"🌐 Interdisciplinary Clash"
],
value="🀝 Friendly Explainer (Default)",
label="Conversation Style",
info="Choose the podcast conversation style and character personalities"
)
gr.Markdown("""
**Persona Descriptions:**
- **🀝 Friendly Explainer** β€” *Alex & Jamie*
Two friends casually discussing the paper. Accessible, warm, ideal for general audiences. (Default mode)
- **βš”οΈ Academic Debate** β€” *Dr. Morgan & Prof. Rivera*
Dr. Morgan defends the paper, Prof. Rivera politely challenges claims and methodology.
*"This claim is strong, but Table 2's baseline seems weak..."*
- **πŸ”₯ Savage Roast** β€” *The Critic & The Defender*
The Critic brutally roasts the paper, The Defender stubbornly fights back.
*"This ablation is an absolute clown show!", "Figure 4 is just statistical noise!"*
Fun and bold approach!
- **πŸŽ“ Pedagogical** β€” *Professor Chen & Student Sam*
Professor teaches step-by-step, Student constantly asks questions.
Perfect for learning complex concepts from scratch.
- **🌐 Interdisciplinary Clash** β€” *Domain Expert & The Outsider*
Domain Expert explains technical details, Outsider critiques from a completely different field perspective.
*"This neuron analogy makes zero biological sense!"*
""")
def toggle_tts_provider(choice):
is_elevenlabs = choice == "ElevenLabs"
return [
gr.update(visible=is_elevenlabs), # elevenlabs_group
gr.update(visible=not is_elevenlabs) # supertonic_group
]
def update_voices_on_tts_change(choice):
"""Update voice IDs when TTS provider changes"""
if choice == "ElevenLabs":
# Return ElevenLabs default voices
return "ErXwobaYiN019PkySvjV", "EXAVITQu4vr4xnSDxMaL"
else: # Supertonic
# Return Supertonic default voices (M1, F1)
return "M1", "F1"
tts_choice.change(toggle_tts_provider, tts_choice, [elevenlabs_group, supertonic_group])
tts_choice.change(update_voices_on_tts_change, tts_choice, [user_host_voice, user_guest_voice])
# Bind settings to state
llm_choice.change(lambda x: x, llm_choice, user_llm_choice)
own_base.change(lambda x: x, own_base, user_own_base_url)
own_key.change(lambda x: x, own_key, user_own_api_key)
own_model.change(lambda x: x, own_model, user_own_model)
openai_key.change(lambda x: x, openai_key, user_openai_key)
openai_model.change(lambda x: x, openai_model, user_openai_model)
# TTS Provider binding
def update_tts_provider(choice):
return "elevenlabs" if choice == "ElevenLabs" else "supertonic"
tts_choice.change(update_tts_provider, tts_choice, user_tts_provider)
# Voice bindings - need to handle both providers
def update_host_voice(tts_provider, eleven_voice, super_voice):
if tts_provider == "ElevenLabs":
return ELEVENLABS_VOICES.get(eleven_voice, "ErXwobaYiN019PkySvjV")
else:
return SUPERTONIC_VOICES.get(super_voice, "M1")
def update_guest_voice(tts_provider, eleven_voice, super_voice):
if tts_provider == "ElevenLabs":
return ELEVENLABS_VOICES.get(eleven_voice, "EXAVITQu4vr4xnSDxMaL")
else:
return SUPERTONIC_VOICES.get(super_voice, "F1")
eleven_key.change(lambda x: x, eleven_key, user_elevenlabs_key)
# Update voice states when either provider's voice changes
host_voice_eleven.change(
lambda v: ELEVENLABS_VOICES.get(v, "ErXwobaYiN019PkySvjV"),
host_voice_eleven,
user_host_voice
)
guest_voice_eleven.change(
lambda v: ELEVENLABS_VOICES.get(v, "EXAVITQu4vr4xnSDxMaL"),
guest_voice_eleven,
user_guest_voice
)
host_voice_supertonic.change(
lambda v: SUPERTONIC_VOICES.get(v, "M1"),
host_voice_supertonic,
user_host_voice
)
guest_voice_supertonic.change(
lambda v: SUPERTONIC_VOICES.get(v, "F1"),
guest_voice_supertonic,
user_guest_voice
)
length_slider.change(lambda x: x, length_slider, user_podcast_length)
# Persona binding
def map_persona_to_key(display_name):
    """Translate a persona display label into its internal persona key.

    Labels that are not in the table below resolve to "friendly_explainer".
    """
    # Keys are compared verbatim against the incoming label, so they are kept
    # exactly as written.
    persona_keys = {
        "🀝 Friendly Explainer (Default)": "friendly_explainer",
        "βš”οΈ Academic Debate": "academic_debate",
        "πŸ”₯ Savage Roast": "savage_roast",
        "πŸŽ“ Pedagogical": "pedagogical",
        "🌐 Interdisciplinary Clash": "interdisciplinary_clash"
    }
    if display_name in persona_keys:
        return persona_keys[display_name]
    return "friendly_explainer"
# Store the internal persona key returned by map_persona_to_key in
# user_persona_mode whenever the persona dropdown changes.
persona_dropdown.change(map_persona_to_key, persona_dropdown, user_persona_mode)
# --- Tab 4: About ---
with gr.Tab("ℹ️ About"):
with gr.Row(elem_classes="glass-panel"):
with gr.Column(scale=1):
pass
with gr.Column(scale=3):
gr.Markdown(f"""
<div style="text-align: center;">
# About PaperCast
**The world's first adaptive persona-driven academic podcast platform with intelligent paper discovery.**
Transform any research paper into engaging audio conversations with your choice of style β€” from casual explanations to brutal critiques. Powered by our revolutionary **Podcast Persona Framework (PPF)**, **Paper Auto-Discovery (PAD)** engine, MCP tools, and studio-quality TTS.
---
## πŸš€ Revolutionary Frameworks
### **PAD** β€” Paper Auto-Discovery Engine
**The world's first intelligent multi-source paper discovery system built specifically for podcast generation.**
Finding the right research paper shouldn't be a chore. We built **PAD (Paper Auto-Discovery)** from the ground up β€” a custom-engineered search system that goes beyond simple keyword matching.
**What makes PAD revolutionary:**
πŸ” **Multi-Source Intelligence** β€” Searches across multiple academic databases simultaneously:
- **Semantic Scholar Graph API** - Access to 200M+ papers with semantic understanding
- **arXiv** - Latest preprints and cutting-edge research
- Parallel execution for lightning-fast results (under 2 seconds)
🧠 **Smart Result Aggregation** β€” Built from scratch with advanced deduplication:
- Intelligent title matching across sources
- Eliminates duplicates while preserving metadata quality
- Prioritizes papers with open-access PDFs
⚑ **Seamless Integration** β€” No copy-paste, no manual URL hunting:
- Search directly within PaperCast interface
- One-click paper selection
- Automatic PDF URL extraction and validation
- Instant transition to podcast generation
🎯 **Research-Grade Quality** β€” Enterprise-level reliability:
- Graceful handling of API rate limits
- Fallback strategies when one source fails
- Comprehensive error handling and user feedback
- Extracts full metadata (authors, year, abstract, citations)
**Why we built PAD from scratch:**
Existing search tools are designed for reading papers, not generating podcasts. We needed:
- **Speed**: Parallel API calls return results in under 2 seconds
- **Reliability**: Custom retry logic and fallback strategies
- **Integration**: Direct pipeline from search β†’ PDF β†’ podcast
- **User Experience**: No context switching, no tab juggling
**Technical Innovation:**
- Custom Python engine using `ThreadPoolExecutor` for concurrent API calls
- Smart result ranking combining relevance scores from multiple sources
- Automatic PDF URL construction for arXiv papers
- State-of-the-art deduplication using fuzzy title matching
---
### **PPF** β€” Podcast Persona Framework
**The world's first adaptive persona system for AI-generated academic podcasts.**
Every other podcast generator treats all papers the same way: bland, generic conversations that put you to sleep. We solved the **one-size-fits-all problem** by inventing the **Podcast Persona Framework (PPF)** β€” a groundbreaking system that adapts conversation style, character dynamics, and educational approach to **your** preference.
**What makes PPF revolutionary:**
🎭 **5 Distinct Persona Modes** β€” Not just voice changes, but fundamentally different conversation dynamics:
- 🀝 **Friendly Explainer** β€” Two colleagues casually discussing research over coffee
- βš”οΈ **Academic Debate** β€” Rigorous defense vs. constructive criticism (perfect for critical analysis)
- πŸ”₯ **Savage Roast** β€” Brutally entertaining critique meets passionate defense (most engaging!)
- πŸŽ“ **Pedagogical** β€” Patient professor teaching eager student (best for learning complex topics)
- 🌐 **Interdisciplinary Clash** β€” Domain expert vs. outsider perspective (reveals hidden assumptions)
🧠 **Dynamic Character Intelligence** β€” Each persona features unique characters with distinct personalities:
- Not generic "Host" and "Guest" β€” real names like **Dr. Morgan**, **The Critic**, **Professor Chen**
- Characters maintain consistent perspectives throughout entire podcast
- Authentic reactions, natural interruptions, genuine debates
⚑ **Zero Overhead** β€” Works seamlessly with any TTS provider (ElevenLabs, Supertonic, etc.)
- First speaker β†’ Host voice
- Second speaker β†’ Guest voice
- Automatic voice mapping regardless of character names
🎯 **Universal Compatibility** β€” PPF is provider-agnostic:
- Works with any LLM (OpenAI, local models, reasoning models)
- Compatible with all TTS engines
- No special configuration required
**Why this matters:**
Traditional podcast generators produce the same monotonous style for every paper. A groundbreaking ML paper gets the same treatment as a medical study. A complex theoretical physics paper sounds identical to an introductory survey.
**PPF changes everything.** Now you choose how you want to consume research:
- Need to learn? β†’ **Pedagogical mode**
- Want entertainment? β†’ **Savage Roast**
- Seeking critical analysis? β†’ **Academic Debate**
- Quick overview? β†’ **Friendly Explainer**
- Fresh perspective? β†’ **Interdisciplinary Clash**
**Built from scratch, perfected for you.** We didn't just add a "tone" parameter β€” we architected an entire persona system with character-aware prompts, dynamic speaker mapping, and adaptive conversation strategies.
---
## 🎯 How It Works
Our intelligent agent orchestrates a **dual-innovation pipeline** combining PAD and PPF:
1. **πŸ” Discovery (PAD)** - Search across Semantic Scholar & arXiv simultaneously, get results in <2 seconds
2. **πŸ“₯ Input** - Select paper from PAD results, or use URL/PDF upload
3. **πŸ“„ Extraction** - PyMuPDF intelligently extracts paper structure
4. **🎭 Persona Selection** - Choose from 5 unique conversation modes (PPF)
5. **🎬 Script Generation** - LLM generates character-specific dialogue with distinct personalities
6. **πŸ—£οΈ Dynamic Mapping** - Automatic voice assignment based on persona characters
7. **🎀 Voice Synthesis** - Studio-quality audio with ElevenLabs Turbo v2.5 or Supertonic
8. **βœ… Delivery** - Listen, download, share your personalized podcast
**What makes this special:** Unlike generic converters, we built **two groundbreaking systems from scratch** β€” PAD for intelligent discovery and PPF for adaptive personas.
---
## 🌟 Key Features
πŸ” **PAD - Paper Auto-Discovery** β€” Custom-built multi-source search engine (Semantic Scholar + arXiv) with parallel execution
🎭 **5 Revolutionary Persona Modes** β€” First-of-its-kind adaptive conversation system (PPF)
🧠 **Dynamic Character Intelligence** β€” Real personalities, not generic voices
⚑ **Lightning-Fast Search** β€” Get 5 relevant papers in under 2 seconds with intelligent deduplication
πŸŽ™οΈ **Studio-Quality Audio** β€” ElevenLabs Turbo v2.5 (250ms latency, cinematic quality)
πŸ”§ **Universal Compatibility** β€” Works with any LLM (OpenAI, local models, reasoning models)
πŸ“š **Complete History** β€” All podcasts saved locally with metadata
πŸ”„ **Multi-Paper Support** β€” Batch process multiple papers into comprehensive discussions
🎯 **Provider Agnostic** β€” Bring your own API keys, use local models, total flexibility
πŸš€ **Zero Friction Workflow** β€” From search query to podcast in 60 seconds
---
## πŸ”§ Technology Stack
**Core Innovations**:
- **PAD (Paper Auto-Discovery)** β€” Custom multi-source search engine built from scratch
- **PPF (Podcast Persona Framework)** β€” Proprietary adaptive conversation system
**LLM**: Universal support (OpenAI GPT-4o/o1, local LLMs, reasoning models)
**TTS**: ElevenLabs Turbo v2.5 (premium) or Supertonic (free CPU-based)
**PDF Processing**: PyMuPDF for fast, accurate text extraction
**UI Framework**: Gradio 6 with custom glass-morphism design
**Agent Architecture**: Custom Python orchestrator with MCP tools
---
## πŸŽ“ Built For
**MCP 1st Birthday Hackathon** - Track 2: MCP in Action (Consumer)
*Tag: `mcp-in-action-track-consumer`*
**What we're showcasing:**
- πŸ” **PAD Innovation** - First-ever custom multi-source paper discovery engine built for podcast generation
- 🎭 **PPF Innovation** - First-ever adaptive persona system for academic podcasts
- πŸ€– **Autonomous Agent** - Intelligent planning, reasoning, and persona-aware execution
- πŸ”§ **MCP Integration** - Tools as cognitive extensions for the agent
- 🎨 **Gradio 6 UX** - Glass-morphism design with intuitive search & persona controls
- πŸš€ **Real Impact** - Making research accessible and engaging for everyone
**Why PAD + PPF matter for this hackathon:** We didn't just build a tool β€” we invented **two new paradigms**. PAD solves the discovery problem (finding papers), PPF solves the consumption problem (understanding papers). Together, they create a **zero-friction pipeline** from curiosity to knowledge.
---
## πŸ“ About the Agent
PaperCast's **discovery-aware, persona-driven autonomous agent** makes intelligent decisions at every step:
- **πŸ” Discovery Intelligence** - Orchestrates parallel API calls to multiple paper sources, ranks and deduplicates results
- **🧠 Persona Analysis** - Evaluates paper complexity and matches optimal persona mode
- **πŸ“‹ Strategic Planning** - Determines conversation flow based on selected persona (debate-style vs. teaching-style)
- **🎭 Character Orchestration** - Generates distinct personalities for each persona (Dr. Morgan β‰  The Critic β‰  Professor Chen)
- **πŸ’¬ Adaptive Dialogue** - Adjusts technical depth, humor level, and interaction style per persona
- **πŸ—£οΈ Dynamic Synthesis** - Maps persona characters to voice IDs automatically
- **πŸ”„ Multi-Paper Intelligence** - Synthesizes insights across papers while maintaining persona consistency
**The key insight:** The agent doesn't just process papers β€” it **discovers and performs** them. PAD finds the perfect paper, PPF delivers it in your perfect style.
---
## πŸ’‘ Use Cases
### 🎧 **Learning & Education**
- **PAD Search** β†’ Find "transformer attention mechanisms" β†’ Get 5 papers instantly
- **Pedagogical mode** for complex topics you want to master
- **Friendly Explainer** for quick overviews during commutes
- **Interdisciplinary Clash** to understand papers outside your field
### πŸ”¬ **Research & Analysis**
- **PAD Search** β†’ Discover latest papers on your research topic
- **Academic Debate** for critical evaluation of methodologies
- **Savage Roast** to identify weak points and overstated claims
- Quick paper screening before deep reading (60 seconds from search to audio)
### 🌍 **Accessibility**
- **Zero barrier to entry** β€” No URLs, no downloads, just search and listen
- Make cutting-edge research understandable for non-experts
- Bridge knowledge gaps between disciplines
- Learn through conversation, not dry text
### 🎭 **Entertainment**
- **PAD + Savage Roast combo** β€” Find trending papers and roast them
- Host paper "debate clubs" with Academic Debate mode
- Share entertaining takes on research with Savage Roast clips
---
## πŸ† What Makes Us Different
πŸ” **We built PAD from scratch** β€” First custom multi-source academic search engine designed for podcast generation. Parallel API orchestration, smart deduplication, zero-friction UX.
🎭 **We invented PPF** β€” The Podcast Persona Framework is a **world-first innovation**. No other platform offers adaptive conversation personas.
⚑ **End-to-end innovation** β€” Most tools stop at URL β†’ podcast. We solved **discovery + consumption** with two custom-built systems.
🧠 **Real characters, not voices** β€” Other tools change tone. We create **distinct personalities** with names, perspectives, and consistent behavior.
πŸš€ **60-second pipeline** β€” From search query ("diffusion models") to finished podcast in under a minute. No other platform comes close.
πŸ”§ **Built for flexibility** β€” Provider-agnostic design works with any LLM, any TTS, any infrastructure.
🎯 **User empowerment** β€” You choose what to listen to (PAD) and how to listen (PPF). Complete control over discovery and consumption.
**The bottom line:** Every other podcast generator is a one-trick pony. PaperCast is a **research discovery platform + repertory theater company** β€” we find papers you love and perform them your way.
---
## πŸ™ Special Thanks
This project was made possible by the incredible support from:
<div style="display: flex; justify-content: center; align-items: center; gap: 80px; margin: 50px 0; flex-wrap: wrap;">
<div style="text-align: center;">
<a href="https://modal.com" target="_blank">
<img src="https://images.prismic.io/contrary-research/aDnorSdWJ-7kSv6V_ModalLabs_Cover.png?auto=format,compress" alt="Modal" style="height: 140px; width: auto; display: block; margin: 0 auto;">
</a>
</div>
<div style="text-align: center;">
<a href="https://elevenlabs.io" target="_blank">
<img src="https://eleven-public-cdn.elevenlabs.io/payloadcms/9trrmnj2sj8-logo-logo.svg" alt="ElevenLabs" style="height: 100px; width: auto; display: block; margin: 0 auto;">
</a>
</div>
</div>
**Why we chose these partners:**
πŸš€ **Modal** - Serverless AI infrastructure that gives us instant access to powerful GPUs (A100, H100) with sub-second cold starts. Their platform handles automatic scaling, letting us process papers efficiently without managing infrastructure. Perfect for variable workloads and rapid iteration.
πŸŽ™οΈ **ElevenLabs** - We use their **Turbo v2.5** model for studio-quality voice synthesis. This model delivers incredibly natural, emotionally expressive voices with low latency (~250-300ms) and 50% lower cost. The voice quality makes our podcasts truly engaging and professional.
---
Made with ❀️ using Anthropic, OpenAI, Modal, ElevenLabs, Gradio, and MCP
</div>
""")
with gr.Column(scale=1):
pass
# Start the app, passing the theme object and the CUSTOM_CSS stylesheet
# to launch().
demo.launch(
theme=theme,
css=CUSTOM_CSS,
mcp_server=True # Enable MCP support
)
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
main()