# app.py
"""
Jajabor – SEBA Assamese Class 10 Tutor
Hugging Face Spaces-ready Gradio app (single file)

This file contains a working, lightweight adaptation of your Colab notebook
so it can run on Hugging Face Spaces (CPU-friendly demo).

IMPORTANT notes for deployment:
- Spaces has limited CPU/GPU. Large models (Qwen2.5, BAAI/bge-m3) won't run
  locally in most Spaces. This app uses smaller models for a working demo.
- For production-quality behavior, switch embeddings/LLM calls to the
  Hugging Face Inference API (use your HF token) or host on Colab/VM with GPU.
- OCR needs the Tesseract binary. On Spaces, add a `packages.txt` file with
  `tesseract-ocr` (and the Assamese language pack, e.g. `tesseract-ocr-asm`)
  so pytesseract can work.

Create a `requirements.txt` with these entries (add to your repo):

    gradio==4.44.0
    pymupdf
    sentence-transformers
    faiss-cpu
    transformers
    accelerate
    torch
    pytesseract
    pillow
    sympy
    huggingface_hub

Place your SEBA Class 10 PDFs in the repository under `pdfs/class10/`.

Usage on Spaces:
- Upload the repo (app.py + requirements.txt + pdfs/class10/*).
- If you want higher-quality LLMs/embeddings, set a repo secret HF_TOKEN and
  configure the INFERENCE_* model names below.
"""

import os
import io
import sqlite3
from datetime import datetime
import threading

import fitz  # PyMuPDF
import numpy as np
from PIL import Image
import gradio as gr
import faiss
import pytesseract
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import sympy as sp
from huggingface_hub import InferenceApi

# ---------------------- Configuration ----------------------
APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor (Spaces demo)"
BASE_DIR = os.path.abspath(".")
PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")

# Lightweight defaults for the Spaces demo. Replace with heavier models via
# the Inference API.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_LOCAL = "sshleifer/tiny-gpt2"  # very small demo model (optional local)

# If you set HF_TOKEN as a repo secret / environment variable, the app will
# use the Inference API models below for better results.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
INFERENCE_EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"  # example
INFERENCE_LLM_MODEL = "bigscience/bloomz-1b1"  # example remote model

CHUNK_SIZE = 600
CHUNK_OVERLAP = 120
TOP_K = 5

# Global variables initialized later
embedding_model = None
index = None
corpus_chunks = []
corpus_metas = []

# If HF_TOKEN is provided, create Inference API clients. Note that
# `inference_embed_client` is only set up for future use; retrieval below
# still uses the local SentenceTransformer model.
inference_embed_client = None
inference_llm_client = None
if HF_TOKEN:
    try:
        inference_embed_client = InferenceApi(repo_id=INFERENCE_EMBED_MODEL, token=HF_TOKEN)
        inference_llm_client = InferenceApi(repo_id=INFERENCE_LLM_MODEL, token=HF_TOKEN)
    except Exception:
        inference_embed_client = None
        inference_llm_client = None
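# NOTE: `InferenceApi` is deprecated in recent huggingface_hub releases in
# favour of `InferenceClient`. A minimal sketch of the replacement call,
# assuming a hosted text-generation model (shown for reference, not wired in):
#
#     from huggingface_hub import InferenceClient
#     client = InferenceClient(model=INFERENCE_LLM_MODEL, token=HF_TOKEN)
#     out_text = client.text_generation(prompt, max_new_tokens=512)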
# ---------------------- Database ----------------------
def init_db(db_path=DB_PATH):
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS users (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            username TEXT UNIQUE,
            created_at TEXT
        )
        """
    )
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS interactions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id INTEGER,
            timestamp TEXT,
            query TEXT,
            answer TEXT,
            is_math INTEGER,
            FOREIGN KEY(user_id) REFERENCES users(id)
        )
        """
    )
    conn.commit()
    conn.close()


def get_or_create_user(username: str):
    username = username.strip()
    if not username:
        return None
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute("SELECT id FROM users WHERE username=?", (username,))
    row = cur.fetchone()
    if row:
        user_id = row[0]
    else:
        cur.execute(
            "INSERT INTO users (username, created_at) VALUES (?, ?)",
            (username, datetime.utcnow().isoformat()),
        )
        conn.commit()
        user_id = cur.lastrowid
    conn.close()
    return user_id


def log_interaction(user_id, query, answer, is_math: bool):
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute(
        """
        INSERT INTO interactions (user_id, timestamp, query, answer, is_math)
        VALUES (?, ?, ?, ?, ?)
        """,
        (user_id, datetime.utcnow().isoformat(), query, answer, 1 if is_math else 0),
    )
    conn.commit()
    conn.close()


def get_user_stats(user_id):
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute("SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?", (user_id,))
    row = cur.fetchone()
    conn.close()
    total = row[0] or 0
    math_count = row[1] or 0
    return total, math_count


init_db()


# ---------------------- PDF loading + RAG ----------------------
def extract_text_from_pdf(pdf_path: str) -> str:
    try:
        doc = fitz.open(pdf_path)
    except Exception:
        return ""
    pages = []
    for page in doc:
        txt = page.get_text("text")
        if txt:
            pages.append(txt)
    return "\n".join(pages)


def load_all_pdfs(pdf_dir: str):
    texts = []
    metas = []
    if not os.path.exists(pdf_dir):
        print("PDF_DIR does not exist:", pdf_dir)
        return texts, metas
    for fname in sorted(os.listdir(pdf_dir)):
        if fname.lower().endswith(".pdf"):
            path = os.path.join(pdf_dir, fname)
            print("Reading:", path)
            text = extract_text_from_pdf(path)
            if text:
                texts.append(text)
                metas.append({"source": fname})
    return texts, metas


# Note: with CHUNK_SIZE=600 and CHUNK_OVERLAP=120 consecutive chunks share
# 120 characters, so a 1000-character text yields two chunks of 600 and 520
# characters (the second starting at offset 480).
def split_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    chunks = []
    start = 0
    L = len(text)
    while start < L:
        end = min(start + chunk_size, L)
        chunk = text[start:end]
        if chunk.strip():
            chunks.append(chunk)
        if end == L:
            break
        start = end - overlap
    return chunks


def build_embedding_index():
    global embedding_model, index, corpus_chunks, corpus_metas
    print("Loading embedding model:", EMBEDDING_MODEL_NAME)
    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    all_texts, all_metas = load_all_pdfs(PDF_DIR)
    corpus_chunks = []
    corpus_metas = []
    for text, meta in zip(all_texts, all_metas):
        chs = split_text(text)
        corpus_chunks.extend(chs)
        corpus_metas.extend([meta] * len(chs))
    if not corpus_chunks:
        print("No document chunks found - RAG will be empty.")
        index = None
        return
    print("Encoding", len(corpus_chunks), "chunks...")
    embs = embedding_model.encode(corpus_chunks, batch_size=32, show_progress_bar=False).astype("float32")
    dim = embs.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embs)
    print("FAISS index ready with dim", dim)


# Build in a background thread so Spaces can start quickly
threading.Thread(target=build_embedding_index, daemon=True).start()


def rag_search(query: str, k: int = TOP_K):
    if index is None or embedding_model is None:
        return []
    q_vec = embedding_model.encode([query]).astype("float32")
    D, I = index.search(q_vec, k)
    results = []
    for dist, idx in zip(D[0], I[0]):
        if idx == -1:
            continue
        results.append({
            "score": float(dist),
            "text": corpus_chunks[idx],
            "meta": corpus_metas[idx],
        })
    return results
# ---------------------- LLM + RAG prompt building ----------------------
# Try to create a small local LLM pipeline for the demo; if not present,
# fall back to the Inference API.
local_llm = None
try:
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_LOCAL)
    model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_LOCAL)
    local_llm = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
    )
    print("Local tiny LLM loaded for demo.")
except Exception:
    local_llm = None
    print("Local LLM not available; will use Inference API if HF_TOKEN is set.")

SYSTEM_PROMPT = """
You are "Jajabor", an expert SEBA Assamese tutor for Class 10.
Always prefer to answer in Assamese. If the student clearly asks for English,
you may reply in English.

Rules:
- Use ONLY the given textbook context when requested.
- If you are not sure, say: "এই প্ৰশ্নটো পাঠ্যপুথিৰ অংশত স্পষ্টকৈ নাই, সেয়েহে মই নিশ্চিত নহয়।"
- বুজিব পৰা সহজ ভাষাত ব্যাখ্যা কৰক, উদাহৰণ দিয়ক।
- If it is a maths question, explain step-by-step clearly.
"""


def build_rag_prompt(context_blocks, question, chat_history):
    ctx = ""
    for i, block in enumerate(context_blocks, start=1):
        src = block["meta"].get("source", "textbook")
        ctx += f"\n[Context {i} – {src}]\n{block['text']}\n"
    hist = ""
    for role, msg in chat_history:
        hist += f"{role}: {msg}\n"
    prompt = (
        f"{SYSTEM_PROMPT}\n\nপূৰ্বৰ বাৰ্তাসমূহ:\n{hist}\n"
        f"শিক্ষাৰ্থীৰ প্ৰশ্ন:\n{question}\n\n"
        f"সম্পৰ্কিত পাঠ্যপুথিৰ অংশ:\n{ctx}\n\n"
        f"এতিয়া একেবাৰে সহায়ক আৰু বুজিবলৈ সহজ উত্তৰ দিয়া।"
    )
    return prompt


def llm_answer_with_rag(question: str, chat_history):
    retrieved = rag_search(question, TOP_K)
    prompt = build_rag_prompt(retrieved, question, chat_history)

    # Prefer the Inference API if available
    if inference_llm_client is not None:
        try:
            resp = inference_llm_client(inputs=prompt, params={"max_new_tokens": 512})
            # InferenceApi returns a dict or string depending on the model
            if isinstance(resp, dict) and "generated_text" in resp:
                out_text = resp["generated_text"]
            elif isinstance(resp, str):
                out_text = resp
            else:
                out_text = str(resp)
            # Some remote models echo the prompt; try to strip it
            if out_text.startswith(prompt):
                answer = out_text[len(prompt):].strip()
            else:
                answer = out_text.strip()
            return answer
        except Exception:
            pass

    # Fall back to the local tiny model
    if local_llm is not None:
        out = local_llm(prompt, num_return_sequences=1)[0]["generated_text"]
        if out.startswith(prompt):
            return out[len(prompt):].strip()
        return out

    # If nothing is available, return a safe fallback
    return (
        "দুঃখিত—এই Spaces ইনষ্টলেশ্যনটোৱে ব্যৱহাৰ কৰিব পৰা কোনো LLM নাপালে।"
        " যদি আপুনি HF_TOKEN হিচাপে এক্সেস টোকেন যোগ কৰে, মই অনলাইন Inference API ব্যৱহাৰ কৰি উত্তৰ দিম।"
    )
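# Design note: the answer path degrades gracefully (remote Inference API
# first, then the tiny local pipeline, then a static Assamese apology), so
# the UI never hard-fails when no model is available.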
# ---------------------- OCR + math helpers ----------------------
def ocr_from_image(img: Image.Image):
    if img is None:
        return ""
    try:
        img = img.convert("RGB")
    except Exception:
        pass
    try:
        text = pytesseract.image_to_string(img, lang="asm+eng")
    except Exception:
        try:
            text = pytesseract.image_to_string(img)
        except Exception:
            text = ""
    return text.strip()


def is_likely_math(text: str) -> bool:
    # Crude heuristic: any digit or operator character marks the query as
    # maths. Note this over-triggers on queries that merely mention a number.
    math_chars = set("0123456789+-*/=^()%")
    if any(ch in text for ch in math_chars):
        return True
    kws = ["গণিত", "সমীকৰণ", "উদাহৰণ", "প্ৰশ্ন", "বীজগণিত"]
    return any(k in text for k in kws)


def solve_math_expression(expr: str):
    try:
        expr = expr.replace("^", "**")
        if "=" in expr:
            left, right = expr.split("=", 1)
            left_s = sp.sympify(left)
            right_s = sp.sympify(right)
            eq = sp.Eq(left_s, right_s)
            sol = sp.solve(eq)
            steps = []
            steps.append("প্ৰথমে সমীকৰণ লওঁ:")
            steps.append(f"{sp.pretty(eq)}")
            steps.append("Sympy ৰ সহায়ত সমাধান পোৱা যায়:")
            steps.append(str(sol))
            explanation = "ধাপ-ধাপে সমাধান (সংক্ষেপে):\n" + "\n".join(f"- {s}" for s in steps)
            explanation += f"\n\nসেয়েহে সমাধান: {sol}"
        else:
            expr_s = sp.sympify(expr)
            simp = sp.simplify(expr_s)
            explanation = (
                "প্ৰদত্ত গণিতীয় অভিব্যক্তি:\n"
                f"{expr}\n\nসৰলীকৰণ কৰাৰ পিছত পোৱা যায়:\n{simp}"
            )
        return explanation
    except Exception:
        return (
            "মই সঠিকভাৱে গণিতীয় অভিব্যক্তি চিনাক্ত কৰিব নোৱাৰিলোঁ। "
            "দয়া কৰি সমীকৰণটো অলপ বেছি স্পষ্টকৈ লিখা: উদাহৰণ – 2*x + 3 = 7"
        )
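# Worked example: solve_math_expression("2*x + 3 = 7") parses both sides with
# sympy, builds Eq(2*x + 3, 7), and sp.solve returns [2], i.e. x = 2. Note
# that sympy's default parser needs the explicit '*' (write "2*x", not "2x"),
# which is why the fallback message above shows that form.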
# ---------------------- Chat logic ----------------------
def login_user(username, user_state):
    username = (username or "").strip()
    if not username:
        return user_state, "⚠️ অনুগ্ৰহ কৰি প্ৰথমে লগিনৰ বাবে এটা নাম লিখক।"
    user_id = get_or_create_user(username)
    user_state = {"username": username, "user_id": user_id}
    total, math_count = get_user_stats(user_id)
    stats = (
        f"👤 ব্যৱহাৰকাৰী: **{username}**\n\n"
        f"📊 মোট প্ৰশ্ন: **{total}**\n"
        f"🧮 গণিত প্ৰশ্ন: **{math_count}**"
    )
    return user_state, stats


def chat_logic(
    username,
    text_input,
    image_input,
    audio_input,
    chat_history,
    user_state,
):
    # Returns (chat_history, user_state, textbox_value); the third value
    # preserves the typed question on error paths and clears it on success.
    if not user_state or not user_state.get("user_id"):
        sys_msg = "⚠️ প্ৰথমে ওপৰত আপোনাৰ নাম লিখি **Login / লগিন** টিপক।"
        chat_history = chat_history + [[text_input or "", sys_msg]]
        return chat_history, user_state, text_input

    user_id = user_state["user_id"]
    final_query_parts = []

    # audio_input not handled in this demo
    ocr_text = ""
    img = None  # ensure `img` is always bound before the checks below
    if image_input is not None:
        try:
            # Handle gradio image types: filepath (string), PIL Image, bytes/file-like
            if isinstance(image_input, str):
                img = Image.open(image_input)
            elif hasattr(image_input, "name") and isinstance(image_input.name, str):
                # uploaded file-like with .name
                img = Image.open(image_input.name)
            elif isinstance(image_input, (bytes, bytearray)):
                img = Image.open(io.BytesIO(image_input))
            else:
                img = image_input
        except Exception:
            try:
                if hasattr(image_input, "read"):
                    img = Image.open(io.BytesIO(image_input.read()))
                else:
                    img = None
            except Exception:
                img = None
        if img is not None:
            ocr_text = ocr_from_image(img)
            if ocr_text:
                final_query_parts.append(ocr_text)

    if text_input:
        final_query_parts.append(text_input)

    if not final_query_parts:
        sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক, নাইবা ছবি আপলোড কৰক।"
        chat_history = chat_history + [["", sys_msg]]
        return chat_history, user_state, text_input

    full_query = "\n".join(final_query_parts)

    conv = []
    for u, b in chat_history:
        if u:
            conv.append(("Student", u))
        if b:
            conv.append(("Tutor", b))

    is_math = is_likely_math(full_query)
    if is_math:
        math_answer = solve_math_expression(full_query)
        combined_question = (
            full_query
            + "\n\nগণিত প্ৰোগ্ৰামে এই ফলাফল দিছে:\n"
            + math_answer
            + "\n\nঅনুগ্ৰহ কৰি শ্ৰেণী ১০ ৰ শিক্ষাৰ্থীৰ বাবে সহজ ভাষাত ব্যাখ্যা কৰক।"
        )
        final_answer = llm_answer_with_rag(combined_question, conv)
    else:
        final_answer = llm_answer_with_rag(full_query, conv)

    log_interaction(user_id, full_query, final_answer, is_math)
    display_question = text_input or ocr_text or "(empty)"
    chat_history = chat_history + [[display_question, final_answer]]
    return chat_history, user_state, None


# ---------------------- Gradio UI ----------------------
# Wrap UI creation + launch in try/except so runtime errors are logged clearly
import traceback

try:
    with gr.Blocks(title=APP_NAME, theme="soft") as demo:
        gr.Markdown(
            """
            # 🧭 জাজাবৰ – SEBA অসমীয়া ক্লাছ ১০ AI Tutor

            - 📘 SEBA ক্লাছ ১০ পাঠ্যপুথিৰ ওপৰত ভিত্তি কৰি উত্তৰ
            - 🗣️ টেক্স্ট + ছবি (OCR) ইনপুট
            - 🧮 গণিত প্ৰশ্নৰ ধাপ-ধাপে সমাধান
            - 👤 ইউজাৰ লগিন + প্ৰগতি (progress) সংৰক্ষণ
            """
        )

        user_state = gr.State({})

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 👤 লগিন")
                username_inp = gr.Textbox(
                    label="নাম / ইউজাৰ আইডি",
                    placeholder="উদাহৰণ: abu10, student01 ...",
                )
                login_btn = gr.Button("✅ Login / লগিন")
                stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।", elem_classes="stats-box")
                gr.Markdown(
                    """
                    ### 💡 টিপছ
                    - "ক্লাছ ১০ গণিত: উদাহৰণ ৩.১ প্ৰশ্ন ২" – এই ধৰণৰ প্ৰশ্ন ভাল
                    - ফটো আপলোড কৰিলে টেক্স্টটো OCR কৰি পঢ়িব চেষ্টা কৰা হয়
                    - সম্ভৱ হ'লে প্ৰশ্নটো অসমীয়াত সোধক 🙂
                    """
                )
            with gr.Column(scale=3):
                chat = gr.Chatbot(label="জাজাবৰ সৈতে কথোপকথন", height=500)
                text_inp = gr.Textbox(
                    label="আপোনাৰ প্ৰশ্ন লিখক",
                    placeholder="উদাহৰণ: \"ক্লাছ ১০ অসমীয়া: অনুচ্ছেদ পাঠ ১ ৰ মূল বিষয় কি?\"",
                    lines=2,
                )
                with gr.Row():
                    # Use a gr.Image type compatible with this Gradio version:
                    # 'filepath', 'pil' or 'numpy'. 'filepath' returns a string
                    # path in Spaces; chat_logic above handles it.
                    image_inp = gr.Image(label="📷 প্ৰশ্নৰ ছবি (Optional)", type="filepath")
                    audio_inp = gr.Audio(label="🎙️ কণ্ঠস্বৰ প্ৰশ্ন (Stub — not used now)", type="numpy")
                with gr.Row():
                    ask_btn = gr.Button("🤖 জাজাবৰক সোধক")

        login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])

        def wrapped_chat(text, image, audio, history, user_state_inner, username_inner):
            if user_state_inner and username_inner and not user_state_inner.get("username"):
                user_state_inner["username"] = username_inner
            return chat_logic(username_inner, text, image, audio, history, user_state_inner)

        ask_btn.click(
            wrapped_chat,
            inputs=[text_inp, image_inp, audio_inp, chat, user_state, username_inp],
            # chat_logic returns three values; the third drives text_inp
            # (cleared on success, preserved on error paths).
            outputs=[chat, user_state, text_inp],
            concurrency_limit=4,
        )

    # On Spaces the app is served automatically; share=True is only for
    # tunneling out of a local/Colab run and is ignored on Spaces, so a plain
    # launch() works in both environments.
    demo.launch()
except Exception:
    # Write the full traceback to a file for debugging in Spaces logs and print to stdout
    tb = traceback.format_exc()
    print("--- Exception during UI startup ---")
    print(tb)
    with open("startup_error.log", "w") as f:
        f.write(tb)
    # Re-raise so the container shows the failure (useful for CI/Spaces logs)
    raise
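# Local development note: `pip install -r requirements.txt && python app.py`
# starts the same app on http://localhost:7860 (Gradio's default port).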