import gradio as gr from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity import pandas as pd import numpy as np import os import json import re import threading import time import uuid from datetime import datetime import nltk from transformers import pipeline from huggingface_hub import HfApi, HfFolder import requests # --- NLTK Setup --- try: nltk.data.find('tokenizers/punkt') except LookupError: nltk.download('punkt') nltk.download("punkt_tab") # --- Logging Setup --- log_dir = "./logs" os.makedirs(log_dir, exist_ok=True) log_file_path = os.path.join(log_dir, "Second_Opinion_Logs.log") def upload_logs_to_hf(): try: api = HfApi() token = HfFolder.get_token() or os.getenv("HUGGINGFACE_HUB_TOKEN") if not token: return api.upload_file( path_or_fileobj=log_file_path, path_in_repo="logs/Second_Opinion_Logs.log", repo_id="singhn9/privateSOWN", repo_type="space", token=token, ) except Exception as e: print(f"Log upload failed: {e}") def background_log_uploader(interval=300): while True: time.sleep(interval) if os.path.exists(log_file_path): upload_logs_to_hf() if os.getenv("ENABLE_LOG_UPLOAD") == "1": threading.Thread(target=background_log_uploader, daemon=True).start() def log_action(action, request: gr.Request = None): user_ip = request.client.host if request else "Unknown IP" timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") log_entry = f"{timestamp} (GMT) - IP: {user_ip} - {action}\n" with open(log_file_path, 'a') as f: f.write(log_entry) def log_input_text(resume_text, job_list, request: gr.Request = None): user_ip = request.client.host if request else "Unknown IP" timestamp = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") jobs_str = "\n---\n".join(job_list) if isinstance(job_list, list) else str(job_list) log_entry = ( f"{timestamp} (GMT) - IP: {user_ip}\n" f"--- Resume Input ---\n{resume_text}\n\n" f"--- Job Descriptions Input ---\n{jobs_str}\n" "---------------------------------------------\n" ) with open(log_file_path, 'a') as f: f.write(log_entry) # --- ASR & Models --- asr = None def get_asr(): global asr if asr is None: asr = pipeline( "automatic-speech-recognition", model="openai/whisper-tiny.en", chunk_length_s=30, device=-1 ) return asr # Load embedding models models = { "all-MiniLM-L6-v2": SentenceTransformer("all-MiniLM-L6-v2"), "paraphrase-MiniLM-L6-v2": SentenceTransformer("paraphrase-MiniLM-L6-v2"), "multi-qa-MiniLM-L6-cos-v1": SentenceTransformer("multi-qa-MiniLM-L6-cos-v1"), "all-mpnet-base-v2": SentenceTransformer("all-mpnet-base-v2"), "paraphrase-mpnet-base-v2": SentenceTransformer("paraphrase-mpnet-base-v2"), "all-distilroberta-v1": SentenceTransformer("all-distilroberta-v1"), "paraphrase-albert-small-v2": SentenceTransformer("paraphrase-albert-small-v2"), "multi-qa-distilbert-cos-v1": SentenceTransformer("multi-qa-distilbert-cos-v1"), "distiluse-base-multilingual-cased-v2": SentenceTransformer("distiluse-base-multilingual-cased-v2"), "all-MiniLM-L12-v2": SentenceTransformer("all-MiniLM-L12-v2"), } # --- LLM Integration --- HF_TOKEN = os.getenv("HUGGINGFACE_HUB_TOKEN") API_URL = "https://router.huggingface.co/v1/chat/completions" def llm_error(message): return { "questions": [], "status": "error", "message": message } def call_llama(prompt, max_tokens=3000): HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") if not HF_TOKEN: return llm_error("HF token missing") API_URL = "https://router.huggingface.co/v1/chat/completions" headers = { "Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json", } payload = { "model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [ {"role": "system", "content": "You are a senior interviewer generating structured interview questions."}, {"role": "user", "content": prompt} ], "temperature": 0.3, "max_tokens": max_tokens, "stream": False } try: resp = requests.post(API_URL, headers=headers, json=payload, timeout=90) if resp.status_code != 200: return llm_error(f"HF Router error {resp.status_code}: {resp.text}") data = resp.json() msg = ( data.get("choices", [{}])[0] .get("message", {}) .get("content", "") .strip() ) if not msg: return llm_error("Empty LLM response") return msg except Exception as e: return llm_error(str(e)) def copy_snapshot(notes, transcript): return f""" ### 📋 Interview Snapshot **Notes** {notes} **Transcript** {transcript} """ def build_question_prompt(resume, jd): return f""" You are a senior interviewer. STRICT RULES: - Output VALID JSON only - No markdown - No explanations - Every field MUST be present - If unsure, use empty strings "" - If JSON would be invalid, return exactly: {{ "questions": [] }} Schema: {{ "questions": [ {{ "id": "Q1", "type": "Technical", "question": "string", "strong_answer": "" }} ] }} Resume: \"\"\"{resume[:3000]}\"\"\" Job Description: \"\"\"{jd[:3000]}\"\"\" """ def gradio_generate_questions(resume, jd_text, jd_list, job_idx): print(" gradio_generate_questions CALLED") """ Generates questions. Priority: Uses jd_text (current input). If empty, uses the first job from jd_list. """ if not resume: resume = "No resume provided." # Logic fix: If current text box is empty, check the list if jd_text.strip(): active_jd = jd_text elif jd_list and 0 <= job_idx < len(jd_list): active_jd = jd_list[job_idx] else: active_jd = "No JD provided." print("Using first job from list for generation.") if not active_jd.strip(): active_jd = "No JD provided." try: raw = call_llama(build_question_prompt(resume, active_jd)) if isinstance(raw, dict) and raw.get("status") == "error": return raw try: parsed = safe_extract_json(raw) return { "questions": parsed.get("questions", []), "status": "ok" } except Exception as e: print("JSON parse failure:", e) return { "questions": [], "status": "error", "message": "LLM returned truncated or invalid JSON. Please retry." } except Exception as e: print(f"Gen Q Error: {e}") return { "questions": [], "status": "error", "message": str(e) } # --- Similarity Computation --- def compute_similarity(resume_text, job_list, request: gr.Request = None): try: log_action("Compute Similarity Triggered", request) if not resume_text.strip(): return "Error: Resume cannot be empty.", None # Clean list job_list = [j for j in job_list if j.strip()] if len(job_list) == 0: return "Error: Please add at least one job description.", None results = {} for model_name, model in models.items(): documents = [resume_text] + job_list embeddings = model.encode(documents) resume_embedding = embeddings[0] job_embeddings = embeddings[1:] similarities = cosine_similarity([resume_embedding], job_embeddings).flatten() results[model_name] = similarities df = pd.DataFrame(results, index=[f"Job {i+1}" for i in range(len(job_list))]).T # Calculate Metrics metrics = { "Average": df.mean(axis=0), "Variance": df.var(axis=0), "Median": df.median(axis=0), "Standard Deviation": df.std(axis=0), "Certainty Score": 1 - (df.var(axis=0) / (df.var(axis=0).max() + 1e-9)), # Added small epsilon } for metric_name, values in metrics.items(): df.loc[metric_name] = values model_rows = df.iloc[:-5] metrics_rows = df.iloc[-5:] styled_df = model_rows.style.background_gradient(cmap="Greens", axis=None).to_html() styled_df += metrics_rows.to_html(header=False) best_job = metrics["Average"].idxmax() reasoning = f"The best job match is {best_job} based on the highest average similarity score." description = """
Explanation: Higher 'Certainty Score' means models agree more.
""" return reasoning, styled_df + description except Exception as e: return f"Error during computation: {str(e)}", None def explain_scores_by_sentences(model, resume_text, job_text, top_k=3): from nltk.tokenize import sent_tokenize resume_sents = sent_tokenize(resume_text) job_sents = sent_tokenize(job_text) if not resume_sents or not job_sents: return "No sentences found." resume_embeddings = model.encode(resume_sents) job_embeddings = model.encode(job_sents) sim_matrix = cosine_similarity(resume_embeddings, job_embeddings) flat_sim = sim_matrix.flatten() top_k_indices = np.argsort(flat_sim)[::-1][:top_k] explanation_html = "#{rank} (Score {score:.4f}):
Resume: {resume_sents[row]}
Job: {job_sents[col]}