Spaces:
Running
Running
| import os | |
| from datetime import datetime | |
| import gradio as gr | |
| from agents.podcast_agent import PodcastAgent | |
| from synthesis.tts_engine import ELEVENLABS_VOICES | |
| from synthesis.supertonic_tts import SUPERTONIC_VOICES | |
| from utils.config import ( | |
| OUTPUT_DIR, | |
| SCRIPT_GENERATION_MODEL, | |
| ) | |
| from utils.history import get_history_items, load_history | |
| from processing.paper_discovery import search_papers, PaperDiscoveryEngine | |
# Ensure output directory exists
# (runs at import time; exist_ok=True makes it a no-op when the directory is already present)
os.makedirs(OUTPUT_DIR, exist_ok=True)
# --- Configuration & Constants ---
# Maps each podcast-length label to a 2-tuple. get_podcast_length_params() looks
# the label up here, and run_agent() unpacks the tuple as
# (target_exchanges, max_tokens) before handing both values to PodcastAgent.
# The labels double as the choices of the "Podcast Length" dropdown, so a key
# must match the dropdown text exactly to be found.
PODCAST_LENGTH_PRESETS = {
    "β‘ Very Short (6-8 exchanges, ~2-3 min)": (7, 2000),
    "π Short (10-12 exchanges, ~3-4 min)": (11, 3000),
    "π Medium (14-16 exchanges, ~5-6 min)": (15, 4000),
    "π Medium-Long (18-20 exchanges, ~7-8 min)": (19, 5000),
    "π Long (22-25 exchanges, ~9-11 min)": (23, 6000),
    "π Very Long (28-32 exchanges, ~12-15 min)": (30, 8000),
}
# Stylesheet for the UI, kept as one string. It defines the classes that the
# layout attaches through elem_classes ("hero-container", "glass-panel",
# "primary-btn", "terminal-window") and the .step-* classes emitted by
# generate_progress_html().
# NOTE(review): where this string is handed to Gradio is not visible in this
# part of the file -- confirm it is actually applied.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600;800&family=Inter:wght@300;400;500;600&display=swap');
:root {
--primary-gradient: linear-gradient(135deg, #6366f1 0%, #a855f7 50%, #ec4899 100%);
--glass-bg: rgba(17, 24, 39, 0.7);
--glass-border: rgba(255, 255, 255, 0.1);
}
body, .gradio-container {
font-family: 'Inter', sans-serif !important;
background-color: #0f172a !important; /* Dark slate background */
}
h1, h2, h3, h4, h5, h6 {
font-family: 'Outfit', sans-serif !important;
}
/* Hero Section */
.hero-container {
text-align: center;
padding: 40px 20px;
margin-bottom: 20px;
position: relative;
overflow: hidden;
}
.hero-title {
font-size: 4rem !important;
font-weight: 800 !important;
margin-bottom: 10px;
letter-spacing: -0.02em;
color: white;
}
.hero-title span {
background: linear-gradient(135deg, #6366f1 0%, #a855f7 50%, #ec4899 100%);
-webkit-background-clip: text;
background-clip: text;
-webkit-text-fill-color: transparent;
color: #a855f7; /* Fallback */
}
.hero-subtitle {
font-size: 1.2rem;
color: #94a3b8;
max-width: 600px;
margin: 0 auto;
line-height: 1.6;
}
/* Cards & Containers */
.glass-panel {
background: var(--glass-bg) !important;
backdrop-filter: blur(12px);
border: 1px solid var(--glass-border) !important;
border-radius: 16px !important;
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
padding: 20px;
}
/* Buttons */
.primary-btn {
background: var(--primary-gradient) !important;
border: none !important;
color: white !important;
font-weight: 600 !important;
transition: all 0.3s ease !important;
box-shadow: 0 10px 15px -3px rgba(168, 85, 247, 0.4) !important;
}
.primary-btn:hover {
transform: translateY(-2px);
box-shadow: 0 20px 25px -5px rgba(168, 85, 247, 0.5) !important;
}
/* Inputs */
input, textarea, select {
background-color: rgba(30, 41, 59, 0.8) !important;
border: 1px solid rgba(71, 85, 105, 0.5) !important;
color: #e2e8f0 !important;
}
/* Progress Steps */
.step-container {
display: flex;
justify-content: space-between;
margin-bottom: 20px;
position: relative;
}
.step-line {
position: absolute;
top: 15px;
left: 0;
right: 0;
height: 2px;
background: #334155;
z-index: 0;
}
.step-item {
position: relative;
z-index: 1;
display: flex;
flex-direction: column;
align-items: center;
width: 25%;
}
.step-circle {
width: 32px;
height: 32px;
border-radius: 50%;
background: #1e293b;
border: 2px solid #475569;
display: flex;
align-items: center;
justify-content: center;
font-weight: bold;
color: #94a3b8;
transition: all 0.3s ease;
margin-bottom: 8px;
}
.step-item.active .step-circle {
background: #a855f7;
border-color: #a855f7;
color: white;
box-shadow: 0 0 15px rgba(168, 85, 247, 0.5);
}
.step-item.completed .step-circle {
background: #10b981;
border-color: #10b981;
color: white;
}
.step-label {
font-size: 0.8rem;
color: #64748b;
font-weight: 500;
}
.step-item.active .step-label {
color: #e2e8f0;
}
/* Terminal Output */
.terminal-window {
background: #0f172a !important;
border: 1px solid #334155 !important;
border-radius: 8px !important;
font-family: 'JetBrains Mono', monospace !important;
color: #22c55e !important;
padding: 15px !important;
}
"""
| # --- Helper Functions --- | |
def get_podcast_length_params(length_choice):
    """Resolve a podcast-length label to its preset tuple.

    Looks ``length_choice`` up in ``PODCAST_LENGTH_PRESETS``. Any value that is
    not one of the preset labels yields the fallback ``(15, 4000)``.
    """
    try:
        return PODCAST_LENGTH_PRESETS[length_choice]
    except KeyError:
        # Unknown label: fall back to the default pair.
        return (15, 4000)
def validate_settings_for_generation(llm_choice, own_base_url, own_api_key, openai_key, tts_provider, elevenlabs_key):
    """Check the LLM and TTS settings before a generation run.

    Returns a ``(is_valid, message)`` pair. ``message`` is the empty string when
    everything is acceptable; otherwise it holds one line per problem, joined
    with newlines.

    Rules applied:
    - "Own Inference" needs a base URL beginning with http:// or https://.
    - "OpenAI" needs an API key beginning with 'sk-'.
    - The "elevenlabs" TTS provider needs an API key beginning with 'sk_'.
      Any other TTS provider is accepted without a key.

    ``own_api_key`` is accepted for signature compatibility but not inspected.
    """
    problems = []

    if llm_choice == "Own Inference":
        if not own_base_url:
            problems.append("β **Own Inference**: Base URL is required")
        elif not own_base_url.startswith(("http://", "https://")):
            problems.append("β **Own Inference**: Base URL must start with http:// or https://")
    elif llm_choice == "OpenAI":
        if not openai_key:
            problems.append("β **OpenAI**: API key is required")
        elif not openai_key.startswith("sk-"):
            problems.append("β **OpenAI**: API key must start with 'sk-'")

    # Only ElevenLabs needs a key; Supertonic runs locally on CPU.
    if tts_provider == "elevenlabs":
        if not elevenlabs_key:
            problems.append("β **ElevenLabs TTS**: API key is required")
        elif not elevenlabs_key.startswith("sk_"):
            problems.append("β **ElevenLabs TTS**: API key must start with 'sk_'")

    # An empty problem list joins to "", which is the success message.
    return (not problems), "\n".join(problems)
def get_stats():
    """Return a one-line summary containing the number of entries from load_history()."""
    count = len(load_history())
    return f"π Total Podcasts: {count}"
def generate_progress_html(current_step):
    """Render the four-step progress tracker as an HTML string.

    Steps are numbered from 1. A step numbered below ``current_step`` gets the
    "completed" class and a check icon, the step equal to ``current_step`` gets
    the "active" class, and later steps carry no status class. Passing 0 shows
    every step as pending; passing 5 shows every step as completed.
    """
    labels = ("Fetch", "Extract", "Script", "Audio")
    pieces = ['<div class="step-container"><div class="step-line"></div>']

    for number, name in enumerate(labels, start=1):
        if number < current_step:
            status_class, icon = "completed", "β"
        elif number == current_step:
            status_class, icon = "active", str(number)
        else:
            status_class, icon = "", str(number)

        pieces.append(
            f'\n<div class="step-item {status_class}">\n'
            f'<div class="step-circle">{icon}</div>\n'
            f'<div class="step-label">{name}</div>\n'
            '</div>\n'
        )

    pieces.append('</div>')
    return "".join(pieces)
def validated_generate_agent(
    url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
    user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
    user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
    user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
    user_persona_mode,
    progress=gr.Progress()
):
    """Validate the settings, then stream generation updates to the UI.

    Generator wired to the generate button. Every yielded value is a 3-tuple
    matching the button's outputs: (progress HTML update, log text, audio
    player update).

    Items produced by ``run_agent`` are handled in two ways:
    - a tuple is treated as the final ``(audio_path, final_logs)`` result;
    - anything else is treated as a log line, appended to the running log and
      scanned for keywords that indicate which pipeline step is in progress.

    Raises:
        gr.Error: when the settings fail validation, or when anything raised
            during generation is caught (reported as "System Error: ...").
    """
    is_valid, error_message = validate_settings_for_generation(
        user_llm_choice, user_own_base_url, user_own_api_key,
        user_openai_key, user_tts_provider, user_elevenlabs_key
    )
    if not is_valid:
        raise gr.Error(error_message)
    # Show progress container
    yield gr.update(visible=True, value=generate_progress_html(0)), "π Initializing...", gr.update(visible=False)
    try:
        # Run the generator
        iterator = run_agent(
            url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
            user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
            user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
            user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
            user_persona_mode, progress
        )
        logs_history = ""
        current_step = 0
        for item in iterator:
            if isinstance(item, tuple):
                # Final result
                audio_path, final_logs = item
                # Writes the transcript file; the returned path is not used here.
                generate_transcript(audio_path, final_logs)
                progress(1.0, desc="Done!")
                # Step 5 is past the last step, so all four render as completed.
                yield gr.update(value=generate_progress_html(5)), final_logs + "\n\n⨠DONE!", gr.update(value=audio_path, visible=True)
            else:
                # Log update
                log_entry = item
                logs_history += log_entry + "\n"
                # Determine step from keywords in the log line
                new_step = current_step
                step_desc = "Processing..."
                if "fetch_paper" in log_entry or "downloaded" in log_entry:
                    new_step = 1
                    step_desc = "Fetching Paper..."
                elif "Extracted" in log_entry or "read_pdf" in log_entry:
                    new_step = 2
                    step_desc = "Extracting Text..."
                elif "generate_script" in log_entry or "Generated script" in log_entry:
                    new_step = 3
                    step_desc = "Generating Script..."
                elif "synthesize_podcast" in log_entry or "Synthesizing" in log_entry:
                    new_step = 4
                    step_desc = "Synthesizing Audio..."
                if new_step != current_step:
                    current_step = new_step
                    # Map step to progress (1-4 -> 0.2-0.8)
                    prog_val = 0.2 * current_step
                    progress(prog_val, desc=step_desc)
                    yield gr.update(value=generate_progress_html(current_step)), logs_history, gr.update(visible=False)
                else:
                    # Same step: refresh only the log text, leave the tracker as is.
                    yield gr.update(), logs_history, gr.update(visible=False)
    except Exception as e:
        # Chain explicitly so the server-side traceback shows the root cause
        # as the direct cause of the UI-facing error.
        raise gr.Error(f"System Error: {str(e)}") from e
def run_agent(
    url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
    user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
    user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
    user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
    user_persona_mode,
    progress=gr.Progress()
):
    """Build a PodcastAgent from the user's settings and stream what it yields.

    Generator. It first yields a "Starting Agent..." line, then delegates to
    ``agent.process`` (single paper) or ``agent.process_multiple`` (batch mode,
    preceded by a summary line with the item counts).

    Empty setting values are passed to the agent as ``None``; the TTS provider
    and persona fall back to "elevenlabs" and "friendly_explainer".

    Raises:
        Exception: when no URL/PDF input is supplied for the selected mode.
    """
    provider_mode = "own_inference" if user_llm_choice == "Own Inference" else "openai"
    target_exchanges, max_tokens = get_podcast_length_params(user_podcast_length)

    agent = PodcastAgent(
        provider_mode=provider_mode,
        own_base_url=user_own_base_url or None,
        own_api_key=user_own_api_key or None,
        own_model=user_own_model or None,
        openai_key=user_openai_key or None,
        openai_model=user_openai_model or None,
        tts_provider=user_tts_provider or "elevenlabs",
        elevenlabs_key=user_elevenlabs_key or None,
        host_voice=user_host_voice or None,
        guest_voice=user_guest_voice or None,
        max_tokens=max_tokens,
        target_dialogue_count=target_exchanges,
        context_limit=user_context_limit,
        persona_mode=user_persona_mode or "friendly_explainer",
    )
    yield f"Starting Agent... [Mode: {provider_mode}]"

    # Single-paper path
    if not advanced_mode:
        if not url and not pdf_file:
            raise Exception("Please provide a URL or PDF")
        yield from agent.process(url=url or None, pdf_file=pdf_file)
        return

    # Batch path: one URL per non-blank line
    urls = None
    if multi_urls and multi_urls.strip():
        urls = [line.strip() for line in multi_urls.strip().split("\n") if line.strip()]

    # A single uploaded file is wrapped so the agent always receives a list
    pdfs = None
    if multi_pdfs:
        pdfs = multi_pdfs if isinstance(multi_pdfs, list) else [multi_pdfs]

    if not (urls or pdfs):
        raise Exception("No input provided for advanced mode")

    url_count = len(urls) if urls else 0
    pdf_count = len(pdfs) if pdfs else 0
    total = url_count + pdf_count
    yield f"Processing {total} items ({url_count} URLs + {pdf_count} PDFs)..."
    yield from agent.process_multiple(urls=urls, pdf_files=pdfs)
def generate_transcript(audio_path, logs):
    """Write the run's logs to a transcript file next to the other outputs.

    The file is created in ``OUTPUT_DIR`` and named after the audio file:
    ``<audio basename without extension>_transcript.txt``. It starts with a
    header line carrying the current timestamp, then a rule of 30 '='
    characters, then ``logs``.

    Args:
        audio_path: Path of the generated audio file; only its base name is used.
        logs: Text to store below the header.

    Returns:
        The transcript path, or ``None`` when ``audio_path`` is empty/None
        (nothing is written in that case).
    """
    if not audio_path:
        return None
    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    transcript_path = os.path.join(OUTPUT_DIR, f"{base_name}_transcript.txt")
    # Explicit UTF-8: the logs can contain emoji and other non-ASCII text, and
    # the platform default encoding (e.g. cp1252) would fail to encode them.
    with open(transcript_path, "w", encoding="utf-8") as f:
        f.write(f"PAPERCAST TRANSCRIPT - {datetime.now()}\n{'='*30}\n\n{logs}")
    return transcript_path
def get_history_data():
    """Return the history as table rows: [timestamp, source, audio path].

    Each item from ``get_history_items()`` becomes one three-element list.
    A missing timestamp shows as "N/A", a missing or empty URL shows as
    "PDF Upload", and a missing audio path becomes "". With no items the
    result is an empty list.
    """
    entries = get_history_items()
    if not entries:
        return []

    rows = []
    for entry in entries:
        timestamp = entry.get("timestamp", "N/A")
        # `or` also covers an entry whose "url" key exists but is empty/None.
        source = entry.get("url", "PDF Upload") or "PDF Upload"
        audio = entry.get("audio_path", "")
        rows.append([timestamp, source, audio])
    return rows
def on_history_select(evt: gr.SelectData, data):
    """Return the audio path of the history row that was clicked.

    Args:
        evt: Selection event; ``evt.index[0]`` is used as the row position.
        data: The table contents, accessed positionally through ``.iloc``.

    Returns:
        The value in the third column (audio path) of the selected row, or
        ``None`` when the lookup fails for any reason (no data, index out of
        range, unexpected event shape).
    """
    try:
        return data.iloc[evt.index[0]].iloc[2]  # Audio path is column 2
    except Exception:
        # Best-effort lookup: a failed selection just clears the player.
        # Deliberately not a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        return None
def perform_paper_search(query: str, progress=gr.Progress()):
    """
    PAD (Paper Auto-Discovery): search for papers and format the hits for the UI.

    Returns a pair ``(choices update, status message)``. On success the update
    carries one numbered label per paper, pre-selects the first one and makes
    the component visible. For a blank query, no results, or any error, it
    clears and hides the component and the message explains why.
    """

    def _hidden_update():
        # Cleared, hidden result list used by every non-success outcome.
        return gr.update(choices=[], value=None, visible=False)

    def _label(i, paper):
        # "<n>. <source emoji> <title> (<year>) | <first two authors> [et al.]"
        authors_str = ", ".join(paper.authors[:2])
        if len(paper.authors) > 2:
            authors_str += " et al."
        year_str = f" ({paper.year})" if paper.year else ""
        source_emoji = "π" if paper.source == "semantic_scholar" else "π¬"
        return f"{i}. {source_emoji} {paper.title}{year_str} | {authors_str}"

    if not (query and query.strip()):
        return _hidden_update(), "β οΈ Please enter a search query"

    progress(0.2, desc="Searching Semantic Scholar & arXiv...")
    try:
        # Search using PAD
        results = search_papers(query.strip(), max_results=5)
        if not results:
            return _hidden_update(), "β No papers found. Try a different query."

        progress(0.8, desc=f"Found {len(results)} papers")
        # Only the labels are needed for display
        choices = [_label(i, paper) for i, paper in enumerate(results, 1)]
        progress(1.0, desc="Search complete!")

        print(f"[DEBUG] Search found {len(results)} papers")
        print(f"[DEBUG] Choices created: {len(choices)}")
        print(f"[DEBUG] First choice: {choices[0] if choices else 'NONE'}")

        success_msg = f"β Found {len(results)} papers from Semantic Scholar & arXiv"
        # Pre-select the first entry so the list is immediately usable
        first_choice = choices[0] if choices else None
        update = gr.update(choices=choices, value=first_choice, visible=True, interactive=True)
        return update, success_msg
    except Exception as e:
        return _hidden_update(), f"β Search failed: {str(e)}"
def on_paper_select(selected_label, query):
    """
    Turn a selected search-result label into a PDF URL.

    The label's leading number ("1. ...") identifies the result position. The
    search is executed again with ``query`` to obtain the paper objects, and the
    paper at that position is asked for its PDF URL.

    Returns ``(pdf_url, message)``. ``pdf_url`` is ``None`` whenever nothing is
    selected, the position is out of range, no PDF is available, or an error
    occurs; ``message`` describes the outcome in each case.
    """
    if not selected_label:
        return None, "β οΈ Please select a paper from the search results"

    try:
        # Label format: "1. emoji title..." -> zero-based position
        number_text = selected_label.split(".")[0]
        selected_index = int(number_text) - 1

        # Re-run the search: the labels alone do not carry the paper objects
        results = search_papers(query.strip(), max_results=5)
        if not results or not (0 <= selected_index < len(results)):
            return None, "β Invalid selection"

        selected_paper = results[selected_index]
        pdf_url = PaperDiscoveryEngine().get_pdf_url(selected_paper)
        if not pdf_url:
            return None, f"β No PDF available for: {selected_paper.title}"

        # Up to three authors, then "et al."
        authors_str = ", ".join(selected_paper.authors[:3])
        if len(selected_paper.authors) > 3:
            authors_str += " et al."

        success_msg = f"β Selected: **{selected_paper.title}**\n\nπ₯ {authors_str}\nπ {selected_paper.year or 'N/A'}\nπ {pdf_url}"
        return pdf_url, success_msg
    except Exception as e:
        return None, f"β Selection failed: {str(e)}"
| # --- Main UI --- | |
| def main(): | |
| # Use a dark theme base but override heavily with CSS | |
| theme = gr.themes.Soft( | |
| primary_hue="violet", | |
| secondary_hue="slate", | |
| neutral_hue="slate", | |
| font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"], | |
| ).set( | |
| body_background_fill="#0f172a", | |
| block_background_fill="#1e293b", | |
| block_border_width="1px", | |
| block_border_color="rgba(255,255,255,0.1)", | |
| ) | |
| with gr.Blocks(title="PaperCast") as demo: | |
| # Session State | |
| user_llm_choice = gr.State(value="Own Inference") | |
| user_own_base_url = gr.State(value="") | |
| user_own_api_key = gr.State(value="") | |
| user_own_model = gr.State(value="") | |
| user_openai_key = gr.State(value="") | |
| user_openai_model = gr.State(value="") | |
| user_tts_provider = gr.State(value="elevenlabs") | |
| user_elevenlabs_key = gr.State(value="") | |
| user_host_voice = gr.State(value="ErXwobaYiN019PkySvjV") # ElevenLabs default | |
| user_guest_voice = gr.State(value="EXAVITQu4vr4xnSDxMaL") # ElevenLabs default | |
| user_podcast_length = gr.State(value=4096) | |
| user_persona_mode = gr.State(value="friendly_explainer") # PPF default | |
| # Hero Section | |
| with gr.Row(elem_classes="hero-container"): | |
| gr.HTML(""" | |
| <h1 class="hero-title"><span>PaperCast</span> ποΈ</h1> | |
| <p class="hero-subtitle"> | |
| Experience the future of knowledge consumption. <br> | |
| An autonomous agentic system that transforms complex research papers into engaging, studio-quality audio experiences. | |
| </p> | |
| """) | |
| with gr.Tabs(): | |
| # --- Tab 1: Create --- | |
| with gr.Tab("β¨ Create Podcast"): | |
| with gr.Row(): | |
| # Left Col: Inputs | |
| with gr.Column(scale=4, elem_classes="glass-panel"): | |
| gr.Markdown("### π₯ Source Material") | |
| with gr.Tabs(selected=0) as input_tabs: | |
| with gr.Tab("π URL", id=0): | |
| url_input = gr.Textbox( | |
| label="Paper URL", | |
| placeholder="https://arxiv.org/abs/...", | |
| show_label=False, | |
| container=False | |
| ) | |
| with gr.Tab("π PDF Upload"): | |
| pdf_upload = gr.File( | |
| label="Upload PDF", | |
| file_types=[".pdf"], | |
| container=False | |
| ) | |
| with gr.Tab("π Search (PAD)"): | |
| gr.Markdown("**Paper Auto-Discovery** β Search across Semantic Scholar & arXiv") | |
| with gr.Row(): | |
| search_query = gr.Textbox( | |
| label="Search Query", | |
| placeholder="e.g., 'diffusion models', 'Grok reasoning', 'transformer attention'...", | |
| show_label=False, | |
| container=False, | |
| scale=4, | |
| lines=1, | |
| max_lines=1 | |
| ) | |
| search_btn = gr.Button("π Search", variant="primary", scale=1) | |
| search_status = gr.Markdown("", visible=True) | |
| # Container for search results (always visible) | |
| with gr.Column(visible=True) as search_results_container: | |
| search_results = gr.Radio( | |
| label="π Select a Paper", | |
| choices=[], | |
| interactive=True, | |
| show_label=True, | |
| ) | |
| use_selected_btn = gr.Button( | |
| "β Use Selected Paper", | |
| variant="primary", | |
| size="lg" | |
| ) | |
| # Hidden state to store selected PDF URL from search | |
| selected_pdf_url = gr.State(value=None) | |
| selected_search_query = gr.State(value=None) | |
| # Wire search functionality | |
| def handle_search(query): | |
| """Handle search button click""" | |
| if not query or not query.strip(): | |
| return ( | |
| gr.update(choices=[], value=None), | |
| "β οΈ Please enter a search query", | |
| query | |
| ) | |
| try: | |
| # Search using PAD | |
| results = search_papers(query.strip(), max_results=5) | |
| if not results: | |
| return ( | |
| gr.update(choices=[], value=None), | |
| "β No papers found. Try a different query.", | |
| query | |
| ) | |
| # Format results for Radio display | |
| choices = [] | |
| for i, paper in enumerate(results, 1): | |
| authors_str = ", ".join(paper.authors[:2]) | |
| if len(paper.authors) > 2: | |
| authors_str += " et al." | |
| year_str = f" ({paper.year})" if paper.year else "" | |
| source_emoji = "π" if paper.source == "semantic_scholar" else "π¬" | |
| # Create display label | |
| label = f"{i}. {source_emoji} {paper.title}{year_str} | {authors_str}" | |
| choices.append(label) | |
| first_choice = choices[0] if choices else None | |
| status_msg = f"β Found {len(results)} papers from Semantic Scholar & arXiv" | |
| status_msg += "\n\n**β‘οΈ Next:** Select a paper from the list below, then click 'Use Selected Paper'" | |
| print(f"[DEBUG] handle_search - found {len(choices)} papers") | |
| print(f"[DEBUG] choices: {choices[:2]}...") | |
| return ( | |
| gr.update(choices=choices, value=first_choice), | |
| status_msg, | |
| query | |
| ) | |
| except Exception as e: | |
| print(f"[ERROR] Search failed: {e}") | |
| return ( | |
| gr.update(choices=[], value=None), | |
| f"β Search failed: {str(e)}", | |
| query | |
| ) | |
| search_btn.click( | |
| fn=handle_search, | |
| inputs=[search_query], | |
| outputs=[search_results, search_status, selected_search_query] | |
| ) | |
| def handle_use_selected(selected_idx, query): | |
| """Handle 'Use Selected Paper' button click""" | |
| pdf_url, status_msg = on_paper_select(selected_idx, query) | |
| # Add instruction to the status message | |
| if pdf_url: | |
| status_msg += "\n\nβ‘οΈ **Next:** Switch to the 'π URL' tab to see the paper URL, then click 'ποΈ Generate Podcast'" | |
| return pdf_url, status_msg, pdf_url # Update url_input with PDF URL | |
| use_selected_btn.click( | |
| fn=handle_use_selected, | |
| inputs=[search_results, selected_search_query], | |
| outputs=[selected_pdf_url, search_status, url_input] | |
| ) | |
| with gr.Accordion("βοΈ Advanced Options", open=False, visible=True) as advanced_accordion: | |
| advanced_mode = gr.Checkbox(label="Batch Mode (Multiple Papers)") | |
| # Warning message (only visible in batch mode) | |
| batch_warning = gr.Markdown( | |
| """ | |
| > **β οΈ Experimental Feature** | |
| > | |
| > Batch mode is currently experimental and may not work reliably in all cases. | |
| > Some attempts may fail due to model limitations or processing errors. | |
| > If you experience issues, try processing papers individually. | |
| """, | |
| visible=False | |
| ) | |
| with gr.Group(visible=False) as batch_inputs: | |
| multi_url_input = gr.Textbox(label="Multiple URLs (one per line)", lines=3) | |
| multi_pdf_upload = gr.File(label="Multiple PDFs", file_count="multiple") | |
| gr.Markdown("---") | |
| gr.Markdown("### π Context Settings") | |
| # Context limit slider (only visible in batch mode) | |
| context_limit_slider = gr.Slider( | |
| minimum=50000, | |
| maximum=500000, | |
| value=80000, | |
| step=10000, | |
| label="Max Context Limit (characters)", | |
| info="β οΈ Warning: Increasing this limit will increase token costs and processing time." | |
| ) | |
| def toggle_advanced(adv): | |
| return { | |
| batch_warning: gr.update(visible=adv), | |
| batch_inputs: gr.update(visible=adv), | |
| url_input: gr.update(visible=not adv), | |
| pdf_upload: gr.update(visible=not adv) | |
| } | |
| advanced_mode.change(toggle_advanced, advanced_mode, [batch_warning, batch_inputs, url_input, pdf_upload]) | |
| # Hide Advanced Options when Search (PAD) tab is selected | |
| def on_tab_select(evt: gr.SelectData): | |
| """Handle tab selection - hide batch mode for Search tab""" | |
| # Tab indices: 0=URL, 1=PDF Upload, 2=Search (PAD) | |
| is_search_tab = (evt.index == 2) | |
| return gr.update(visible=not is_search_tab) | |
| input_tabs.select( | |
| fn=on_tab_select, | |
| outputs=[advanced_accordion] | |
| ) | |
| generate_btn = gr.Button( | |
| "ποΈ Generate Podcast", | |
| variant="primary", | |
| elem_classes="primary-btn", | |
| size="lg" | |
| ) | |
| # Right Col: Output | |
| with gr.Column(scale=5, elem_classes="glass-panel"): | |
| gr.Markdown("### π‘ Live Feed") | |
| # Progress Steps | |
| progress_html = gr.HTML(visible=False) | |
| # Terminal Log | |
| status_output = gr.Code( | |
| label="System Logs", | |
| language="shell", | |
| interactive=False, | |
| lines=12, | |
| elem_classes="terminal-window" | |
| ) | |
| # Audio Player | |
| audio_output = gr.Audio( | |
| label="π§ Final Podcast", | |
| type="filepath", | |
| interactive=False, | |
| visible=False | |
| ) | |
| # Wiring | |
| generate_btn.click( | |
| fn=validated_generate_agent, | |
| inputs=[ | |
| url_input, pdf_upload, advanced_mode, multi_url_input, multi_pdf_upload, | |
| user_llm_choice, user_own_base_url, user_own_api_key, user_own_model, | |
| user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key, | |
| user_host_voice, user_guest_voice, user_podcast_length, context_limit_slider, | |
| user_persona_mode | |
| ], | |
| outputs=[progress_html, status_output, audio_output] | |
| ) | |
| # --- Tab 2: Library --- | |
| with gr.Tab("π Library"): | |
| with gr.Row(elem_classes="glass-panel"): | |
| with gr.Column(): | |
| refresh_btn = gr.Button("π Refresh Library", size="sm", variant="secondary") | |
| history_table = gr.Dataframe( | |
| headers=["Date", "Source", "Audio Path"], | |
| datatype=["str", "str", "str"], | |
| value=get_history_data(), | |
| interactive=False, | |
| label="Recent Podcasts" | |
| ) | |
| with gr.Column(): | |
| history_player = gr.Audio(label="Playback") | |
| refresh_btn.click(lambda: get_history_data(), None, history_table) | |
| history_table.select(on_history_select, history_table, history_player) | |
| # --- Tab 3: Settings --- | |
| with gr.Tab("βοΈ Settings"): | |
| with gr.Row(elem_classes="glass-panel"): | |
| with gr.Column(): | |
| gr.Markdown("### π€ Model Configuration") | |
| llm_choice = gr.Radio( | |
| ["Own Inference", "OpenAI"], | |
| value="Own Inference", | |
| label="Provider" | |
| ) | |
| # Own Inference | |
| with gr.Group(visible=True) as own_group: | |
| own_base = gr.Textbox(label="Base URL", placeholder="http://localhost:1234/v1") | |
| own_key = gr.Textbox(label="API Key", type="password") | |
| own_model = gr.Textbox(label="Model Name", placeholder="llama-3.1-8b") | |
| # OpenAI | |
| with gr.Group(visible=False) as openai_group: | |
| openai_key = gr.Textbox(label="OpenAI Key", type="password") | |
| openai_model = gr.Textbox(label="Model", value="gpt-4o-mini") | |
| def toggle_llm(choice): | |
| return [ | |
| gr.update(visible=choice=="Own Inference"), # own_group | |
| gr.update(visible=choice=="OpenAI") # openai_group | |
| ] | |
| llm_choice.change(toggle_llm, llm_choice, [own_group, openai_group]) | |
| with gr.Column(): | |
| gr.Markdown("### π£οΈ Voice Settings") | |
| tts_choice = gr.Radio( | |
| ["ElevenLabs", "Supertonic (CPU)"], | |
| value="ElevenLabs", | |
| label="TTS Provider", | |
| info="Supertonic runs on CPU (no API key required, but may be slower than cloud-based TTS)" | |
| ) | |
| # ElevenLabs Settings | |
| with gr.Group(visible=True) as elevenlabs_group: | |
| eleven_key = gr.Textbox(label="ElevenLabs API Key", type="password") | |
| host_voice_eleven = gr.Dropdown( | |
| choices=list(ELEVENLABS_VOICES.keys()), | |
| value="Antoni (Male - Well-rounded)", | |
| label="Host Voice" | |
| ) | |
| guest_voice_eleven = gr.Dropdown( | |
| choices=list(ELEVENLABS_VOICES.keys()), | |
| value="Bella (Female - Soft)", | |
| label="Guest Voice" | |
| ) | |
| # Supertonic Settings | |
| with gr.Group(visible=False) as supertonic_group: | |
| gr.Markdown("**CPU-based TTS** (no API key required)\n\nβ οΈ *Note: CPU processing may be slower than cloud-based services*") | |
| host_voice_supertonic = gr.Dropdown( | |
| choices=list(SUPERTONIC_VOICES.keys()), | |
| value="M1 (Male 1)", | |
| label="Host Voice" | |
| ) | |
| guest_voice_supertonic = gr.Dropdown( | |
| choices=list(SUPERTONIC_VOICES.keys()), | |
| value="F1 (Female 1)", | |
| label="Guest Voice" | |
| ) | |
| length_slider = gr.Dropdown( | |
| choices=list(PODCAST_LENGTH_PRESETS.keys()), | |
| value="π Medium (14-16 exchanges, ~5-6 min)", | |
| label="Podcast Length" | |
| ) | |
| gr.Markdown("### π Podcast Persona Framework (PPF)") | |
| persona_dropdown = gr.Dropdown( | |
| choices=[ | |
| "π€ Friendly Explainer (Default)", | |
| "βοΈ Academic Debate", | |
| "π₯ Savage Roast", | |
| "π Pedagogical", | |
| "π Interdisciplinary Clash" | |
| ], | |
| value="π€ Friendly Explainer (Default)", | |
| label="Conversation Style", | |
| info="Choose the podcast conversation style and character personalities" | |
| ) | |
| gr.Markdown(""" | |
| **Persona Descriptions:** | |
| - **π€ Friendly Explainer** β *Alex & Jamie* | |
| Two friends casually discussing the paper. Accessible, warm, ideal for general audiences. (Default mode) | |
| - **βοΈ Academic Debate** β *Dr. Morgan & Prof. Rivera* | |
| Dr. Morgan defends the paper, Prof. Rivera politely challenges claims and methodology. | |
| *"This claim is strong, but Table 2's baseline seems weak..."* | |
| - **π₯ Savage Roast** β *The Critic & The Defender* | |
| The Critic brutally roasts the paper, The Defender stubbornly fights back. | |
| *"This ablation is an absolute clown show!", "Figure 4 is just statistical noise!"* | |
| Fun and bold approach! | |
| - **π Pedagogical** β *Professor Chen & Student Sam* | |
| Professor teaches step-by-step, Student constantly asks questions. | |
| Perfect for learning complex concepts from scratch. | |
| - **π Interdisciplinary Clash** β *Domain Expert & The Outsider* | |
| Domain Expert explains technical details, Outsider critiques from a completely different field perspective. | |
| *"This neuron analogy makes zero biological sense!"* | |
| """) | |
| def toggle_tts_provider(choice): | |
| is_elevenlabs = choice == "ElevenLabs" | |
| return [ | |
| gr.update(visible=is_elevenlabs), # elevenlabs_group | |
| gr.update(visible=not is_elevenlabs) # supertonic_group | |
| ] | |
| def update_voices_on_tts_change(choice): | |
| """Update voice IDs when TTS provider changes""" | |
| if choice == "ElevenLabs": | |
| # Return ElevenLabs default voices | |
| return "ErXwobaYiN019PkySvjV", "EXAVITQu4vr4xnSDxMaL" | |
| else: # Supertonic | |
| # Return Supertonic default voices (M1, F1) | |
| return "M1", "F1" | |
| tts_choice.change(toggle_tts_provider, tts_choice, [elevenlabs_group, supertonic_group]) | |
| tts_choice.change(update_voices_on_tts_change, tts_choice, [user_host_voice, user_guest_voice]) | |
| # Bind settings to state | |
| llm_choice.change(lambda x: x, llm_choice, user_llm_choice) | |
| own_base.change(lambda x: x, own_base, user_own_base_url) | |
| own_key.change(lambda x: x, own_key, user_own_api_key) | |
| own_model.change(lambda x: x, own_model, user_own_model) | |
| openai_key.change(lambda x: x, openai_key, user_openai_key) | |
| openai_model.change(lambda x: x, openai_model, user_openai_model) | |
| # TTS Provider binding | |
| def update_tts_provider(choice): | |
| return "elevenlabs" if choice == "ElevenLabs" else "supertonic" | |
| tts_choice.change(update_tts_provider, tts_choice, user_tts_provider) | |
| # Voice bindings - need to handle both providers | |
| def update_host_voice(tts_provider, eleven_voice, super_voice): | |
| if tts_provider == "ElevenLabs": | |
| return ELEVENLABS_VOICES.get(eleven_voice, "ErXwobaYiN019PkySvjV") | |
| else: | |
| return SUPERTONIC_VOICES.get(super_voice, "M1") | |
def update_guest_voice(tts_provider, eleven_voice, super_voice):
    """Resolve the guest voice ID from the active provider's selection.

    Args:
        tts_provider: Provider label; "ElevenLabs" or any other value for
            Supertonic.
        eleven_voice: Key looked up in ``ELEVENLABS_VOICES``.
        super_voice: Key looked up in ``SUPERTONIC_VOICES``.

    Returns:
        The voice ID found in the active provider's table, or that provider's
        default guest voice when the key is not present.
    """
    elevenlabs_active = tts_provider == "ElevenLabs"
    return (
        ELEVENLABS_VOICES.get(eleven_voice, "EXAVITQu4vr4xnSDxMaL")
        if elevenlabs_active
        else SUPERTONIC_VOICES.get(super_voice, "F1")
    )
# Keep the stored ElevenLabs key in step with the key field.
eleven_key.change(fn=lambda x: x, inputs=eleven_key, outputs=user_elevenlabs_key)

# Each provider-specific voice dropdown writes the resolved voice ID into the
# shared host/guest voice state. A label missing from the provider's voice
# table falls back to that provider's default voice.
host_voice_eleven.change(
    fn=lambda v: ELEVENLABS_VOICES.get(v, "ErXwobaYiN019PkySvjV"),
    inputs=host_voice_eleven,
    outputs=user_host_voice,
)
guest_voice_eleven.change(
    fn=lambda v: ELEVENLABS_VOICES.get(v, "EXAVITQu4vr4xnSDxMaL"),
    inputs=guest_voice_eleven,
    outputs=user_guest_voice,
)
host_voice_supertonic.change(
    fn=lambda v: SUPERTONIC_VOICES.get(v, "M1"),
    inputs=host_voice_supertonic,
    outputs=user_host_voice,
)
guest_voice_supertonic.change(
    fn=lambda v: SUPERTONIC_VOICES.get(v, "F1"),
    inputs=guest_voice_supertonic,
    outputs=user_guest_voice,
)

# Track the selected podcast length.
length_slider.change(fn=lambda x: x, inputs=length_slider, outputs=user_podcast_length)
| # Persona binding | |
def map_persona_to_key(display_name):
    """Map a persona dropdown label to its internal persona key.

    Matching is done on the persona's plain-text name rather than on the
    complete label. Decoration around the name (emoji prefix, emoji variation
    selectors, the "(Default)" suffix) therefore cannot cause a silent
    fallback to the default persona when the label text drifts from the
    literals kept here.

    Args:
        display_name: Label chosen in the persona dropdown.

    Returns:
        The persona key whose name appears in ``display_name``, or
        "friendly_explainer" when the value is not a string or names no known
        persona.
    """
    default_key = "friendly_explainer"
    # A cleared dropdown may deliver a non-string; treat it as "no choice".
    if not isinstance(display_name, str):
        return default_key

    persona_keys = (
        ("Friendly Explainer", "friendly_explainer"),
        ("Academic Debate", "academic_debate"),
        ("Savage Roast", "savage_roast"),
        ("Pedagogical", "pedagogical"),
        ("Interdisciplinary Clash", "interdisciplinary_clash"),
    )
    for persona_name, persona_key in persona_keys:
        if persona_name in display_name:
            return persona_key
    return default_key
# Store the internal persona key each time the persona dropdown changes.
persona_dropdown.change(
    fn=map_persona_to_key,
    inputs=persona_dropdown,
    outputs=user_persona_mode,
)
| # --- Tab 4: About --- | |
| with gr.Tab("βΉοΈ About"): | |
| with gr.Row(elem_classes="glass-panel"): | |
| with gr.Column(scale=1): | |
| pass | |
| with gr.Column(scale=3): | |
| gr.Markdown(f""" | |
| <div style="text-align: center;"> | |
| # About PaperCast | |
| **The world's first adaptive persona-driven academic podcast platform with intelligent paper discovery.** | |
| Transform any research paper into engaging audio conversations with your choice of style — from casual explanations to brutal critiques. Powered by our revolutionary **Podcast Persona Framework (PPF)**, **Paper Auto-Discovery (PAD)** engine, MCP tools, and studio-quality TTS. | |
| --- | |
| ## π Revolutionary Frameworks | |
| ### **PAD** β Paper Auto-Discovery Engine | |
| **The world's first intelligent multi-source paper discovery system built specifically for podcast generation.** | |
| Finding the right research paper shouldn't be a chore. We built **PAD (Paper Auto-Discovery)** from the ground up — a custom-engineered search system that goes beyond simple keyword matching. | |
| **What makes PAD revolutionary:** | |
| π **Multi-Source Intelligence** β Searches across multiple academic databases simultaneously: | |
| - **Semantic Scholar Graph API** - Access to 200M+ papers with semantic understanding | |
| - **arXiv** - Latest preprints and cutting-edge research | |
| - Parallel execution for lightning-fast results (under 2 seconds) | |
| π§ **Smart Result Aggregation** β Built from scratch with advanced deduplication: | |
| - Intelligent title matching across sources | |
| - Eliminates duplicates while preserving metadata quality | |
| - Prioritizes papers with open-access PDFs | |
| β‘ **Seamless Integration** β No copy-paste, no manual URL hunting: | |
| - Search directly within PaperCast interface | |
| - One-click paper selection | |
| - Automatic PDF URL extraction and validation | |
| - Instant transition to podcast generation | |
| π― **Research-Grade Quality** β Enterprise-level reliability: | |
| - Graceful handling of API rate limits | |
| - Fallback strategies when one source fails | |
| - Comprehensive error handling and user feedback | |
| - Extracts full metadata (authors, year, abstract, citations) | |
| **Why we built PAD from scratch:** | |
| Existing search tools are designed for reading papers, not generating podcasts. We needed: | |
| - **Speed**: Parallel API calls return results in under 2 seconds | |
| - **Reliability**: Custom retry logic and fallback strategies | |
| - **Integration**: Direct pipeline from search β PDF β podcast | |
| - **User Experience**: No context switching, no tab juggling | |
| **Technical Innovation:** | |
| - Custom Python engine using `ThreadPoolExecutor` for concurrent API calls | |
| - Smart result ranking combining relevance scores from multiple sources | |
| - Automatic PDF URL construction for arXiv papers | |
| - State-of-the-art deduplication using fuzzy title matching | |
| --- | |
| ### **PPF** β Podcast Persona Framework | |
| **The world's first adaptive persona system for AI-generated academic podcasts.** | |
| Every other podcast generator treats all papers the same way: bland, generic conversations that put you to sleep. We solved the **one-size-fits-all problem** by inventing the **Podcast Persona Framework (PPF)** — a groundbreaking system that adapts conversation style, character dynamics, and educational approach to **your** preference. | |
| **What makes PPF revolutionary:** | |
| π **5 Distinct Persona Modes** β Not just voice changes, but fundamentally different conversation dynamics: | |
| - π€ **Friendly Explainer** β Two colleagues casually discussing research over coffee | |
| - βοΈ **Academic Debate** β Rigorous defense vs. constructive criticism (perfect for critical analysis) | |
| - π₯ **Savage Roast** β Brutally entertaining critique meets passionate defense (most engaging!) | |
| - π **Pedagogical** β Patient professor teaching eager student (best for learning complex topics) | |
| - π **Interdisciplinary Clash** β Domain expert vs. outsider perspective (reveals hidden assumptions) | |
| π§ **Dynamic Character Intelligence** β Each persona features unique characters with distinct personalities: | |
| - Not generic "Host" and "Guest" β real names like **Dr. Morgan**, **The Critic**, **Professor Chen** | |
| - Characters maintain consistent perspectives throughout entire podcast | |
| - Authentic reactions, natural interruptions, genuine debates | |
| β‘ **Zero Overhead** β Works seamlessly with any TTS provider (ElevenLabs, Supertonic, etc.) | |
| - First speaker β Host voice | |
| - Second speaker β Guest voice | |
| - Automatic voice mapping regardless of character names | |
| π― **Universal Compatibility** β PPF is provider-agnostic: | |
| - Works with any LLM (OpenAI, local models, reasoning models) | |
| - Compatible with all TTS engines | |
| - No special configuration required | |
| **Why this matters:** | |
| Traditional podcast generators produce the same monotonous style for every paper. A groundbreaking ML paper gets the same treatment as a medical study. A complex theoretical physics paper sounds identical to an introductory survey. | |
| **PPF changes everything.** Now you choose how you want to consume research: | |
| - Need to learn? β **Pedagogical mode** | |
| - Want entertainment? β **Savage Roast** | |
| - Seeking critical analysis? β **Academic Debate** | |
| - Quick overview? β **Friendly Explainer** | |
| - Fresh perspective? β **Interdisciplinary Clash** | |
| **Built from scratch, perfected for you.** We didn't just add a "tone" parameter — we architected an entire persona system with character-aware prompts, dynamic speaker mapping, and adaptive conversation strategies. | |
| --- | |
| ## π― How It Works | |
| Our intelligent agent orchestrates a **dual-innovation pipeline** combining PAD and PPF: | |
| 1. **π Discovery (PAD)** - Search across Semantic Scholar & arXiv simultaneously, get results in <2 seconds | |
| 2. **π₯ Input** - Select paper from PAD results, or use URL/PDF upload | |
| 3. **π Extraction** - PyMuPDF intelligently extracts paper structure | |
| 4. **π Persona Selection** - Choose from 5 unique conversation modes (PPF) | |
| 5. **π¬ Script Generation** - LLM generates character-specific dialogue with distinct personalities | |
| 6. **π£οΈ Dynamic Mapping** - Automatic voice assignment based on persona characters | |
| 7. **π€ Voice Synthesis** - Studio-quality audio with ElevenLabs Turbo v2.5 or Supertonic | |
| 8. **β Delivery** - Listen, download, share your personalized podcast | |
| **What makes this special:** Unlike generic converters, we built **two groundbreaking systems from scratch** — PAD for intelligent discovery and PPF for adaptive personas. | |
| --- | |
| ## π Key Features | |
| π **PAD - Paper Auto-Discovery** β Custom-built multi-source search engine (Semantic Scholar + arXiv) with parallel execution | |
| π **5 Revolutionary Persona Modes** β First-of-its-kind adaptive conversation system (PPF) | |
| π§ **Dynamic Character Intelligence** β Real personalities, not generic voices | |
| β‘ **Lightning-Fast Search** β Get 5 relevant papers in under 2 seconds with intelligent deduplication | |
| ποΈ **Studio-Quality Audio** β ElevenLabs Turbo v2.5 (250ms latency, cinematic quality) | |
| π§ **Universal Compatibility** β Works with any LLM (OpenAI, local models, reasoning models) | |
| π **Complete History** β All podcasts saved locally with metadata | |
| π **Multi-Paper Support** β Batch process multiple papers into comprehensive discussions | |
| π― **Provider Agnostic** β Bring your own API keys, use local models, total flexibility | |
| π **Zero Friction Workflow** β From search query to podcast in 60 seconds | |
| --- | |
| ## π§ Technology Stack | |
| **Core Innovations**: | |
| - **PAD (Paper Auto-Discovery)** β Custom multi-source search engine built from scratch | |
| - **PPF (Podcast Persona Framework)** β Proprietary adaptive conversation system | |
| **LLM**: Universal support (OpenAI GPT-4o/o1, local LLMs, reasoning models) | |
| **TTS**: ElevenLabs Turbo v2.5 (premium) or Supertonic (free CPU-based) | |
| **PDF Processing**: PyMuPDF for fast, accurate text extraction | |
| **UI Framework**: Gradio 6 with custom glass-morphism design | |
| **Agent Architecture**: Custom Python orchestrator with MCP tools | |
| --- | |
| ## π Built For | |
| **MCP 1st Birthday Hackathon** - Track 2: MCP in Action (Consumer) | |
| *Tag: `mcp-in-action-track-consumer`* | |
| **What we're showcasing:** | |
| - π **PAD Innovation** - First-ever custom multi-source paper discovery engine built for podcast generation | |
| - π **PPF Innovation** - First-ever adaptive persona system for academic podcasts | |
| - π€ **Autonomous Agent** - Intelligent planning, reasoning, and persona-aware execution | |
| - π§ **MCP Integration** - Tools as cognitive extensions for the agent | |
| - π¨ **Gradio 6 UX** - Glass-morphism design with intuitive search & persona controls | |
| - π **Real Impact** - Making research accessible and engaging for everyone | |
| **Why PAD + PPF matter for this hackathon:** We didn't just build a tool — we invented **two new paradigms**. PAD solves the discovery problem (finding papers), PPF solves the consumption problem (understanding papers). Together, they create a **zero-friction pipeline** from curiosity to knowledge. | |
| --- | |
| ## π About the Agent | |
| PaperCast's **discovery-aware, persona-driven autonomous agent** makes intelligent decisions at every step: | |
| - **π Discovery Intelligence** - Orchestrates parallel API calls to multiple paper sources, ranks and deduplicates results | |
| - **π§ Persona Analysis** - Evaluates paper complexity and matches optimal persona mode | |
| - **π Strategic Planning** - Determines conversation flow based on selected persona (debate-style vs. teaching-style) | |
| - **π Character Orchestration** - Generates distinct personalities for each persona (Dr. Morgan β The Critic β Professor Chen) | |
| - **π¬ Adaptive Dialogue** - Adjusts technical depth, humor level, and interaction style per persona | |
| - **π£οΈ Dynamic Synthesis** - Maps persona characters to voice IDs automatically | |
| - **π Multi-Paper Intelligence** - Synthesizes insights across papers while maintaining persona consistency | |
| **The key insight:** The agent doesn't just process papers — it **discovers and performs** them. PAD finds the perfect paper, PPF delivers it in your perfect style. | |
| --- | |
| ## π‘ Use Cases | |
| ### π§ **Learning & Education** | |
| - **PAD Search** β Find "transformer attention mechanisms" β Get 5 papers instantly | |
| - **Pedagogical mode** for complex topics you want to master | |
| - **Friendly Explainer** for quick overviews during commutes | |
| - **Interdisciplinary Clash** to understand papers outside your field | |
| ### π¬ **Research & Analysis** | |
| - **PAD Search** β Discover latest papers on your research topic | |
| - **Academic Debate** for critical evaluation of methodologies | |
| - **Savage Roast** to identify weak points and overstated claims | |
| - Quick paper screening before deep reading (60 seconds from search to audio) | |
| ### π **Accessibility** | |
| - **Zero barrier to entry** β No URLs, no downloads, just search and listen | |
| - Make cutting-edge research understandable for non-experts | |
| - Bridge knowledge gaps between disciplines | |
| - Learn through conversation, not dry text | |
| ### π **Entertainment** | |
| - **PAD + Savage Roast combo** β Find trending papers and roast them | |
| - Host paper "debate clubs" with Academic Debate mode | |
| - Share entertaining takes on research with Savage Roast clips | |
| --- | |
| ## π What Makes Us Different | |
| π **We built PAD from scratch** β First custom multi-source academic search engine designed for podcast generation. Parallel API orchestration, smart deduplication, zero-friction UX. | |
| π **We invented PPF** β The Podcast Persona Framework is a **world-first innovation**. No other platform offers adaptive conversation personas. | |
| β‘ **End-to-end innovation** β Most tools stop at URL β podcast. We solved **discovery + consumption** with two custom-built systems. | |
| π§ **Real characters, not voices** β Other tools change tone. We create **distinct personalities** with names, perspectives, and consistent behavior. | |
| π **60-second pipeline** β From search query ("diffusion models") to finished podcast in under a minute. No other platform comes close. | |
| π§ **Built for flexibility** β Provider-agnostic design works with any LLM, any TTS, any infrastructure. | |
| π― **User empowerment** β You choose what to listen to (PAD) and how to listen (PPF). Complete control over discovery and consumption. | |
| **The bottom line:** Every other podcast generator is a one-trick pony. PaperCast is a **research discovery platform + repertory theater company** — we find papers you love and perform them your way. | |
| --- | |
| ## π Special Thanks | |
| This project was made possible by the incredible support from: | |
| <div style="display: flex; justify-content: center; align-items: center; gap: 80px; margin: 50px 0; flex-wrap: wrap;"> | |
| <div style="text-align: center;"> | |
| <a href="https://modal.com" target="_blank"> | |
| <img src="https://images.prismic.io/contrary-research/aDnorSdWJ-7kSv6V_ModalLabs_Cover.png?auto=format,compress" alt="Modal" style="height: 140px; width: auto; display: block; margin: 0 auto;"> | |
| </a> | |
| </div> | |
| <div style="text-align: center;"> | |
| <a href="https://elevenlabs.io" target="_blank"> | |
| <img src="https://eleven-public-cdn.elevenlabs.io/payloadcms/9trrmnj2sj8-logo-logo.svg" alt="ElevenLabs" style="height: 100px; width: auto; display: block; margin: 0 auto;"> | |
| </a> | |
| </div> | |
| </div> | |
| **Why we chose these partners:** | |
| π **Modal** - Serverless AI infrastructure that gives us instant access to powerful GPUs (A100, H100) with sub-second cold starts. Their platform handles automatic scaling, letting us process papers efficiently without managing infrastructure. Perfect for variable workloads and rapid iteration. | |
| ποΈ **ElevenLabs** - We use their **Turbo v2.5** model for studio-quality voice synthesis. This model delivers incredibly natural, emotionally expressive voices with low latency (~250-300ms) and 50% lower cost. The voice quality makes our podcasts truly engaging and professional. | |
| --- | |
| Made with ❤️ using Anthropic, OpenAI, Modal, ElevenLabs, Gradio, and MCP | |
| </div> | |
| """) | |
| with gr.Column(scale=1): | |
| pass | |
# Start the app: the theme and custom stylesheet are supplied at launch time,
# and MCP support is switched on alongside the web UI.
launch_options = {
    "theme": theme,
    "css": CUSTOM_CSS,
    "mcp_server": True,  # Enable MCP support
}
demo.launch(**launch_options)
# Build and launch the UI only when this file is executed as a script, so
# importing the module has no launch side effect.
if __name__ == "__main__":
    main()