"""Second Opinion with Naval — Gradio app.

Three features behind an invite-code gate:
  * Resume Matcher  — scores a resume against pasted job descriptions with an
    ensemble of sentence-transformer models, plus agreement metrics.
  * Question Gen    — asks Llama-3-8B-Instruct (via the HF router) for
    strict-JSON interview questions tailored to the resume + JD.
  * Live Interview  — streams microphone audio through Whisper and keeps a
    per-question transcript.
"""

import gradio as gr
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import os
import json
import re
import threading
import time
import uuid
from datetime import datetime, timezone
import nltk
from transformers import pipeline
from huggingface_hub import HfApi, HfFolder
import requests

# --- NLTK Setup ---
# sent_tokenize needs 'punkt' and (on newer NLTK) 'punkt_tab'.  Check each
# resource independently — the original only fetched punkt_tab when punkt
# itself was missing, so an old punkt install could still break tokenizing.
for _pkg in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_pkg}")
    except LookupError:
        nltk.download(_pkg)

# --- Logging Setup ---
log_dir = "./logs"
os.makedirs(log_dir, exist_ok=True)
log_file_path = os.path.join(log_dir, "Second_Opinion_Logs.log")


def _utc_timestamp():
    """Current UTC time formatted for log entries (datetime.utcnow is deprecated)."""
    return datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")


def upload_logs_to_hf():
    """Best-effort upload of the local log file to the private HF Space.

    Never raises: logging infrastructure must not take the app down.
    """
    try:
        api = HfApi()
        token = HfFolder.get_token() or os.getenv("HUGGINGFACE_HUB_TOKEN")
        if not token:
            return  # no credentials — skip silently
        api.upload_file(
            path_or_fileobj=log_file_path,
            path_in_repo="logs/Second_Opinion_Logs.log",
            repo_id="singhn9/privateSOWN",
            repo_type="space",
            token=token,
        )
    except Exception as e:
        print(f"Log upload failed: {e}")


def background_log_uploader(interval=300):
    """Daemon loop: push the log file to the Hub every `interval` seconds."""
    while True:
        time.sleep(interval)
        if os.path.exists(log_file_path):
            upload_logs_to_hf()


if os.getenv("ENABLE_LOG_UPLOAD") == "1":
    threading.Thread(target=background_log_uploader, daemon=True).start()


def log_action(action, request: gr.Request = None):
    """Append a one-line audit entry (UTC timestamp, client IP, action)."""
    user_ip = request.client.host if request else "Unknown IP"
    log_entry = f"{_utc_timestamp()} (GMT) - IP: {user_ip} - {action}\n"
    with open(log_file_path, 'a') as f:
        f.write(log_entry)


def log_input_text(resume_text, job_list, request: gr.Request = None):
    """Append the full resume and job-description inputs to the log file."""
    user_ip = request.client.host if request else "Unknown IP"
    jobs_str = "\n---\n".join(job_list) if isinstance(job_list, list) else str(job_list)
    log_entry = (
        f"{_utc_timestamp()} (GMT) - IP: {user_ip}\n"
        f"--- Resume Input ---\n{resume_text}\n\n"
        f"--- Job Descriptions Input ---\n{jobs_str}\n"
        "---------------------------------------------\n"
    )
    with open(log_file_path, 'a') as f:
        f.write(log_entry)


# --- ASR & Models ---
asr = None  # lazily-built Whisper pipeline; see get_asr()


def get_asr():
    """Build the Whisper ASR pipeline (CPU, tiny.en) on first use and cache it."""
    global asr
    if asr is None:
        asr = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny.en",
            chunk_length_s=30,
            device=-1,
        )
    return asr


# Embedding-model ensemble used by compute_similarity(); loaded eagerly at
# startup so the first "Match" click is fast.
_EMBEDDING_MODEL_NAMES = [
    "all-MiniLM-L6-v2",
    "paraphrase-MiniLM-L6-v2",
    "multi-qa-MiniLM-L6-cos-v1",
    "all-mpnet-base-v2",
    "paraphrase-mpnet-base-v2",
    "all-distilroberta-v1",
    "paraphrase-albert-small-v2",
    "multi-qa-distilbert-cos-v1",
    "distiluse-base-multilingual-cased-v2",
    "all-MiniLM-L12-v2",
]
models = {name: SentenceTransformer(name) for name in _EMBEDDING_MODEL_NAMES}

# --- LLM Integration ---
# Kept for backward compatibility with any external reader of these names;
# call_llama() re-reads the token from the environment on every call.
HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN")
API_URL = "https://router.huggingface.co/v1/chat/completions"


def llm_error(message):
    """Uniform error payload understood by the question-generation UI."""
    return {
        "questions": [],
        "status": "error",
        "message": message
    }


def call_llama(prompt, max_tokens=3000):
    """Send `prompt` to Llama-3-8B-Instruct through the HF router.

    Returns the reply text (str) on success, or an llm_error() dict on any
    failure (missing token, HTTP error, empty reply, network exception).
    """
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
    if not token:
        return llm_error("HF token missing")
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a senior interviewer generating structured interview questions."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.3,
        "max_tokens": max_tokens,
        "stream": False
    }
    try:
        resp = requests.post(API_URL, headers=headers, json=payload, timeout=90)
        if resp.status_code != 200:
            return llm_error(f"HF Router error {resp.status_code}: {resp.text}")
        data = resp.json()
        msg = (
            data.get("choices", [{}])[0]
            .get("message", {})
            .get("content", "")
            .strip()
        )
        if not msg:
            return llm_error("Empty LLM response")
        return msg
    except Exception as e:
        return llm_error(str(e))


def copy_snapshot(notes, transcript):
    """Render notes + transcript as a markdown snapshot.

    Fix: the output component (copy_status) is created with visible=False,
    so returning a bare string never showed anything; return a gr.update
    that also reveals the panel.
    """
    snapshot = f"""
### 📋 Interview Snapshot

**Notes**
{notes}

**Transcript**
{transcript}
"""
    return gr.update(value=snapshot, visible=True)


def build_question_prompt(resume, jd):
    """Strict-JSON question prompt; resume and JD truncated to 3000 chars."""
    return f"""
You are a senior interviewer.

STRICT RULES:
- Output VALID JSON only
- No markdown
- No explanations
- Every field MUST be present
- If unsure, use empty strings ""
- If JSON would be invalid, return exactly: {{ "questions": [] }}

Schema:
{{
  "questions": [
    {{
      "id": "Q1",
      "type": "Technical",
      "question": "string",
      "strong_answer": ""
    }}
  ]
}}

Resume:
\"\"\"{resume[:3000]}\"\"\"

Job Description:
\"\"\"{jd[:3000]}\"\"\"
"""


def gradio_generate_questions(resume, jd_text, jd_list, job_idx):
    """Generate interview questions for the active JD.

    Priority: the current text box (jd_text); if empty, fall back to the
    job at `job_idx` in jd_list.  Returns a dict:
    {"questions": [...], "status": "ok"|"error"[, "message": ...]}.
    """
    print(" gradio_generate_questions CALLED")
    if not resume:
        resume = "No resume provided."
    if jd_text.strip():
        active_jd = jd_text
    elif jd_list and 0 <= job_idx < len(jd_list):
        active_jd = jd_list[job_idx]
        # Fix: this message belongs here (list fallback), not in the
        # no-JD branch where the original printed it.
        print("Using first job from list for generation.")
    else:
        active_jd = "No JD provided."
    if not active_jd.strip():
        active_jd = "No JD provided."
    try:
        raw = call_llama(build_question_prompt(resume, active_jd))
        # call_llama signals its own failures with an error dict.
        if isinstance(raw, dict) and raw.get("status") == "error":
            return raw
        try:
            parsed = safe_extract_json(raw)
            return {
                "questions": parsed.get("questions", []),
                "status": "ok"
            }
        except Exception as e:
            print("JSON parse failure:", e)
            return {
                "questions": [],
                "status": "error",
                "message": "LLM returned truncated or invalid JSON. Please retry."
            }
    except Exception as e:
        print(f"Gen Q Error: {e}")
        return {
            "questions": [],
            "status": "error",
            "message": str(e)
        }


# --- Similarity Computation ---
def compute_similarity(resume_text, job_list, request: gr.Request = None):
    """Score the resume against every JD with each embedding model.

    Returns (reasoning_text, html_table) on success, or (error_text, None).
    """
    try:
        log_action("Compute Similarity Triggered", request)
        if not resume_text.strip():
            return "Error: Resume cannot be empty.", None
        # Drop blank entries before scoring.
        job_list = [j for j in job_list if j.strip()]
        if len(job_list) == 0:
            return "Error: Please add at least one job description.", None

        results = {}
        for model_name, model in models.items():
            documents = [resume_text] + job_list
            embeddings = model.encode(documents)
            resume_embedding = embeddings[0]
            job_embeddings = embeddings[1:]
            similarities = cosine_similarity([resume_embedding], job_embeddings).flatten()
            results[model_name] = similarities

        # Rows = models, columns = jobs.
        df = pd.DataFrame(results, index=[f"Job {i+1}" for i in range(len(job_list))]).T

        # Cross-model agreement metrics, appended as extra rows.
        metrics = {
            "Average": df.mean(axis=0),
            "Variance": df.var(axis=0),
            "Median": df.median(axis=0),
            "Standard Deviation": df.std(axis=0),
            # epsilon avoids 0/0 when every model agrees exactly
            "Certainty Score": 1 - (df.var(axis=0) / (df.var(axis=0).max() + 1e-9)),
        }
        for metric_name, values in metrics.items():
            df.loc[metric_name] = values

        model_rows = df.iloc[:-5]    # per-model similarity scores
        metrics_rows = df.iloc[-5:]  # the 5 metric rows appended above
        styled_df = model_rows.style.background_gradient(cmap="Greens", axis=None).to_html()
        styled_df += metrics_rows.to_html(header=False)

        best_job = metrics["Average"].idxmax()
        reasoning = f"The best job match is {best_job} based on the highest average similarity score."
        # NOTE(review): any original HTML tags around this blurb appear to
        # have been lost in transit; plain text still renders in gr.HTML.
        description = """
Explanation: Higher 'Certainty Score' means models agree more.
"""
        return reasoning, styled_df + description
    except Exception as e:
        return f"Error during computation: {str(e)}", None


def explain_scores_by_sentences(model, resume_text, job_text, top_k=3):
    """Return HTML/text listing the top_k most similar (resume, job) sentence pairs."""
    from nltk.tokenize import sent_tokenize
    resume_sents = sent_tokenize(resume_text)
    job_sents = sent_tokenize(job_text)
    if not resume_sents or not job_sents:
        return "No sentences found."
    resume_embeddings = model.encode(resume_sents)
    job_embeddings = model.encode(job_sents)
    # sim_matrix shape: (n_resume_sents, n_job_sents)
    sim_matrix = cosine_similarity(resume_embeddings, job_embeddings)
    flat_sim = sim_matrix.flatten()
    top_k_indices = np.argsort(flat_sim)[::-1][:top_k]
    explanation_html = """
Top Similar Sentence Pairs
"""
    n_job = job_embeddings.shape[0]  # column count of sim_matrix
    for rank, idx in enumerate(top_k_indices, start=1):
        row = idx // n_job
        col = idx % n_job
        score = sim_matrix[row, col]
        explanation_html += f"""
#{rank} (Score {score:.4f}):
Resume: {resume_sents[row]}
Job: {job_sents[col]}
"""
    return explanation_html


def set_active_question(label):
    """Map a radio label like 'Q2: ...' to its question id ('Q2')."""
    if not label:
        return "General"
    return label.split(":")[0]


def enable_audio(question):
    """Enable the microphone component only once a question is selected."""
    if question:
        return gr.update(interactive=True)
    return gr.update(interactive=False)


def safe_extract_json(text: str):
    """Extract and parse the JSON object embedded in LLM output.

    Returns the parsed dict, or raises ValueError/json.JSONDecodeError when
    no object is found or the model's output was truncated.
    """
    # Grab everything from the first '{' onward.
    match = re.search(r"\{[\s\S]*", text)
    if not match:
        raise ValueError("No JSON object found")
    json_text = match.group(0)
    # Hard stop: the model was cut off mid-key or mid-value.
    if json_text.strip().endswith(":"):
        raise ValueError("JSON truncated at key")
    return json.loads(json_text)


def explain_model_scores(model_name, resume, job_list, selected_job_idx):
    """UI wrapper: sentence-level explanation for one model / one job."""
    try:
        model = models[model_name]
        if not job_list:
            return gr.update(value="Error: No jobs added.", visible=True)
        idx = int(selected_job_idx)
        if idx < 0 or idx >= len(job_list):
            return gr.update(value="Error: Invalid job index.", visible=True)
        return gr.update(
            value=explain_scores_by_sentences(model, resume, job_list[idx]),
            visible=True,
        )
    except Exception as e:
        return gr.update(value=f"Error: {str(e)}", visible=True)


# --- Interview Logic ---
def transcribe_chunk(audio, current_transcript, active_question, epoch):
    """Streaming handler: transcribe one mic chunk and append to the transcript.

    Returns (transcript_box_value, transcript_state_value) — always equal,
    so the visible box and the state stay in sync.
    """
    # Skip missing or very short chunks (< 8000 samples).
    if audio is None or audio[1] is None or len(audio[1]) < 8000:
        return current_transcript, current_transcript
    sr, y = audio
    y = y.astype(np.float32)
    # Silence guard: don't normalize (divide by ~0) or transcribe silence.
    if np.max(np.abs(y)) < 0.01:
        return current_transcript, current_transcript
    y /= np.max(np.abs(y))
    filename = f"temp_{uuid.uuid4().hex}.wav"
    import scipy.io.wavfile as wavfile
    wavfile.write(filename, sr, y)
    try:
        result = get_asr()(filename)
        text = result.get("text", "").strip()
        # Duplicate guard: streaming can re-deliver overlapping audio.
        if not text or text in current_transcript:
            return current_transcript, current_transcript
        prefix = f"[{active_question}] " if active_question else ""
        new_seg = f"{prefix}{text}"
        updated = current_transcript + "\n" + new_seg if current_transcript else new_seg
        return updated, updated
    finally:
        # Always clean up the temp wav, even if ASR raised.
        if os.path.exists(filename):
            os.remove(filename)


# --- UI ---
INVITE_CODE = "INDIAMBA"


def add_job_to_list(current_job, job_list):
    """Append the current JD and refresh display, input box, and dropdown.

    Always returns 4 values matching the click wiring
    (job_list_state, job_display_area, single_job_input, job_selector).
    Fix: the original returned only 3 values on the empty-input path,
    breaking the callback, and blanked the display area on success.
    """
    if not current_job.strip():
        # No-op updates keep all four outputs untouched.
        return job_list, gr.update(), gr.update(), gr.update()
    updated = job_list + [current_job]
    # Show what has been added so far (truncated previews).
    display = "<br>".join(
        f"<b>Job {i + 1}:</b> {jd[:120]}{'…' if len(jd) > 120 else ''}"
        for i, jd in enumerate(updated)
    )
    choices = [f"Job {i+1}" for i in range(len(updated))]
    return updated, display, "", gr.update(choices=choices, value=choices[-1])


def clear_jobs():
    """Reset the job list, display area, input box, and dropdown."""
    return [], "No jobs...", "", gr.update(choices=[])


def check_invite(user_code):
    """Swap login UI for the main UI when the invite code matches."""
    if user_code.strip() == INVITE_CODE:
        return gr.update(visible=False), gr.update(visible=True)
    return gr.update(visible=True), gr.update(visible=False)


def generate_questions_ui(resume, jd_text, jd_list, job_idx):
    """Click handler: always returns exactly 3 values
    (questions_radio update, status_msg update, transcript reset)."""
    result = gradio_generate_questions(resume, jd_text, jd_list, job_idx)
    if result.get("status") == "error":
        return (
            gr.update(choices=[]),                                      # questions_radio
            gr.update(value=f"❌ {result.get('message')}", visible=True),  # status_msg
            ""                                                          # transcript reset
        )
    qs = result.get("questions", [])
    labels = [f"{q['id']}: {q['question']}" for q in qs]
    return (
        gr.update(choices=labels),                                      # questions_radio
        gr.update(value="✅ Questions generated", visible=True),         # status_msg
        ""                                                              # transcript reset
    )


# CSS to make the app look cleaner
custom_css = """
.job-display { border: 1px solid #ddd; padding: 10px; background: #f4f4f4; border-radius: 5px; }
"""

with gr.Blocks(css=custom_css) as app:
    # Shared state
    transcript_state = gr.State("")
    active_question_state = gr.State("")
    job_list_state = gr.State([])
    audio_epoch = gr.State(0)
    selected_job_index = gr.State(0)

    # --- Login UI (Top Level) ---
    with gr.Column(visible=True) as login_ui:
        gr.Markdown("# 🔐 Second Opinion with Naval")
        code_input = gr.Textbox(label="Invite Code", type="password")
        access_button = gr.Button("Enter")

    # --- Main UI (Top Level, Hidden) ---
    with gr.Column(visible=False) as main_ui:
        with gr.Tabs():
            # TAB 1: Matcher
            with gr.Tab("📄 Resume Matcher"):
                gr.Markdown("### ✍️ Input")
                with gr.Row():
                    resume_input = gr.Textbox(label="Paste Resume", lines=10)
                    with gr.Column():
                        single_job_input = gr.Textbox(label="Job Description", lines=5)
                        with gr.Row():
                            add_job_btn = gr.Button("➕ Add Job")
                            clear_jobs_btn = gr.Button("🗑️ Clear")
                        job_display_area = gr.HTML("No jobs added yet...", elem_classes="job-display")
                match_button = gr.Button("Match", variant="primary")
                recommendation_output = gr.HTML()
                table_output = gr.HTML()
                nerd_button = gr.Button("Papa Please Preach More (Details)", visible=False)
                explanation_output = gr.HTML(visible=False)
                with gr.Row(visible=False) as explain_controls:
                    job_index_to_explain = gr.Number(label="Job Index (0=Job 1)", value=0, precision=0)
                    # One explain button per model (first three models only).
                    for m in list(models.keys())[:3]:
                        btn = gr.Button(f"Explain {m}")
                        btn.click(
                            explain_model_scores,
                            [gr.State(m), resume_input, job_list_state, job_index_to_explain],
                            explanation_output,
                        )
                nerd_button.click(
                    lambda: (gr.update(visible=True), gr.update(visible=True)),
                    None,
                    [table_output, explain_controls],
                )

            # TAB 2: Interviewer
            with gr.Tab("🎙️ Live Interview"):
                gr.Markdown("### 📌 Interview Questions")
                job_selector = gr.Dropdown(
                    label="Select Job Description",
                    choices=[],
                    interactive=True
                )
                selected_jd_preview = gr.Markdown()
                # "Job N" label -> zero-based index.
                job_selector.change(
                    lambda x: int(x.split()[-1]) - 1 if x else 0,
                    inputs=job_selector,
                    outputs=selected_job_index
                )
                job_selector.change(
                    lambda i, jds: jds[i] if jds and 0 <= i < len(jds) else "",
                    [selected_job_index, job_list_state],
                    selected_jd_preview
                )
                gen_questions_btn = gr.Button("Generate Questions")
                status_msg = gr.Markdown(visible=False)
                questions_radio = gr.Radio(
                    label="Select Question",
                    choices=[],
                    interactive=True
                )
                gr.Markdown("### 📝 Live Transcript")
                transcript_box = gr.Textbox(
                    lines=8,
                    interactive=False,
                    placeholder="Waiting for speech..."
                )
                gr.Markdown("### 🧠 Interviewer Notes")
                notes_box = gr.Textbox(lines=3)
                copy_btn = gr.Button("Copy Interview Snapshot")
                copy_status = gr.Markdown(visible=False)
                copy_btn.click(
                    copy_snapshot,
                    inputs=[notes_box, transcript_box],
                    outputs=[copy_status]
                )
                audio_bridge = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    streaming=True,
                    interactive=False  # start disabled; enabled on question select
                )
                audio_bridge.stream(
                    transcribe_chunk,
                    [audio_bridge, transcript_state, active_question_state, audio_epoch],
                    [transcript_box, transcript_state]
                )

        # --- Wire Events ---
        gen_questions_btn.click(
            generate_questions_ui,
            inputs=[resume_input, single_job_input, job_list_state, selected_job_index],
            outputs=[questions_radio, status_msg, transcript_state]
        )
        # Selecting a question resets the transcript and bumps the audio epoch.
        questions_radio.change(
            lambda q, e: (set_active_question(q), "", e + 1),
            inputs=[questions_radio, audio_epoch],
            outputs=[active_question_state, transcript_state, audio_epoch]
        ).then(
            lambda: "", None, transcript_box
        )
        questions_radio.change(
            enable_audio,
            inputs=questions_radio,
            outputs=audio_bridge
        )
        add_job_btn.click(
            add_job_to_list,
            [single_job_input, job_list_state],
            [job_list_state, job_display_area, single_job_input, job_selector]
        )
        clear_jobs_btn.click(
            clear_jobs,
            [],
            [job_list_state, job_display_area, single_job_input, job_selector]
        )
        match_button.click(
            compute_similarity,
            [resume_input, job_list_state],
            [recommendation_output, table_output]
        ).then(lambda: gr.update(visible=True), None, nerd_button)

    access_button.click(check_invite, [code_input], [login_ui, main_ui])

# Warm up ASR on startup so the first transcription chunk isn't slow.
if asr is None:
    try:
        get_asr()
    except Exception as e:
        print("ASR warmup failed:", e)

app.queue().launch()