# app.py
"""
Jajabor – SEBA Assamese Class 10 Tutor
Hugging Face Spaces-ready Gradio app (single file)

This file is a working, lightweight adaptation of the original Colab notebook
so that it can run on Hugging Face Spaces (CPU-friendly demo).

IMPORTANT notes for deployment:
- Spaces has limited CPU/GPU. Large models (Qwen2.5, BAAI/bge-m3) won't run
  locally in most Spaces, so this app uses smaller models for a working demo.
- For production-quality behavior, switch the embedding/LLM calls to the
  Hugging Face Inference API (using your HF token) or host on Colab/a VM with a GPU.

Create a `requirements.txt` with these entries (add it to your repo):

    gradio==4.44.0
    pymupdf
    sentence-transformers
    faiss-cpu
    transformers
    accelerate
    torch
    pytesseract
    pillow
    sympy
    huggingface_hub

Place your SEBA Class 10 PDFs in the repository under `pdfs/class10/`.

Usage on Spaces:
- Upload the repo (app.py + requirements.txt + pdfs/class10/*).
- For higher-quality LLMs/embeddings, set a repo secret HF_TOKEN and configure
  the INFERENCE_*_MODEL names below.
"""
import os
import io
import sqlite3
from datetime import datetime
import threading

import fitz  # PyMuPDF
import numpy as np
from PIL import Image
import gradio as gr
import faiss
import pytesseract
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import sympy as sp
from huggingface_hub import InferenceApi
# ---------------------- Configuration ----------------------
APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor (Spaces demo)"
BASE_DIR = os.path.abspath(".")
PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")

# Lightweight defaults for the Spaces demo. Replace with heavier models via the Inference API.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_LOCAL = "sshleifer/tiny-gpt2"  # very small demo model (optional local)

# If you set HF_TOKEN as a repo secret / environment variable, the app will
# use the Inference API models below for better results.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
INFERENCE_EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"  # example
INFERENCE_LLM_MODEL = "bigscience/bloomz-1b1"  # example remote model

CHUNK_SIZE = 600
CHUNK_OVERLAP = 120
TOP_K = 5
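# With these values each new chunk starts CHUNK_SIZE - CHUNK_OVERLAP = 480
# characters after the previous one, so a 6,000-character chapter yields
# roughly 13 overlapping chunks (see split_text below).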
# Global variables initialized later
embedding_model = None
index = None
corpus_chunks = []
corpus_metas = []

# If HF_TOKEN provided, create inference clients
inference_embed_client = None
inference_llm_client = None
if HF_TOKEN:
    try:
        inference_embed_client = InferenceApi(repo_id=INFERENCE_EMBED_MODEL, token=HF_TOKEN)
        inference_llm_client = InferenceApi(repo_id=INFERENCE_LLM_MODEL, token=HF_TOKEN)
    except Exception:
        inference_embed_client = None
        inference_llm_client = None
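# Note: `InferenceApi` is deprecated in newer huggingface_hub releases. An
# untested sketch of the same calls with the newer client would look like:
#
#   from huggingface_hub import InferenceClient
#   client = InferenceClient(token=HF_TOKEN)
#   out = client.text_generation(prompt, model=INFERENCE_LLM_MODEL, max_new_tokens=512)
#   vecs = client.feature_extraction(text, model=INFERENCE_EMBED_MODEL)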
# ---------------------- Database ----------------------
def init_db(db_path=DB_PATH):
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS users (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            username TEXT UNIQUE,
            created_at TEXT
        )
        """
    )
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS interactions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id INTEGER,
            timestamp TEXT,
            query TEXT,
            answer TEXT,
            is_math INTEGER,
            FOREIGN KEY(user_id) REFERENCES users(id)
        )
        """
    )
    conn.commit()
    conn.close()


def get_or_create_user(username: str):
    username = username.strip()
    if not username:
        return None
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute("SELECT id FROM users WHERE username=?", (username,))
    row = cur.fetchone()
    if row:
        user_id = row[0]
    else:
        cur.execute(
            "INSERT INTO users (username, created_at) VALUES (?, ?)",
            (username, datetime.utcnow().isoformat()),
        )
        conn.commit()
        user_id = cur.lastrowid
    conn.close()
    return user_id


def log_interaction(user_id, query, answer, is_math: bool):
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute(
        """
        INSERT INTO interactions (user_id, timestamp, query, answer, is_math)
        VALUES (?, ?, ?, ?, ?)
        """,
        (user_id, datetime.utcnow().isoformat(), query, answer, 1 if is_math else 0),
    )
    conn.commit()
    conn.close()


def get_user_stats(user_id):
    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute("SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?", (user_id,))
    row = cur.fetchone()
    conn.close()
    total = row[0] or 0
    math_count = row[1] or 0
    return total, math_count
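# Example (illustrative values): get_user_stats(1) might return (12, 5), i.e. the
# user has asked 12 questions, 5 of which went through the maths helper.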
init_db()


# ---------------------- PDF loading + RAG ----------------------
def extract_text_from_pdf(pdf_path: str) -> str:
    try:
        doc = fitz.open(pdf_path)
    except Exception:
        return ""
    pages = []
    for page in doc:
        txt = page.get_text("text")
        if txt:
            pages.append(txt)
    return "\n".join(pages)


def load_all_pdfs(pdf_dir: str):
    texts = []
    metas = []
    if not os.path.exists(pdf_dir):
        print("PDF_DIR does not exist:", pdf_dir)
        return texts, metas
    for fname in sorted(os.listdir(pdf_dir)):
        if fname.lower().endswith(".pdf"):
            path = os.path.join(pdf_dir, fname)
            print("Reading:", path)
            text = extract_text_from_pdf(path)
            if text:
                texts.append(text)
                metas.append({"source": fname})
    return texts, metas


def split_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    chunks = []
    start = 0
    L = len(text)
    while start < L:
        end = min(start + chunk_size, L)
        chunk = text[start:end]
        if chunk.strip():
            chunks.append(chunk)
        if end == L:
            break
        start = end - overlap
    return chunks
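# Example (illustrative): split_text("a" * 1000, chunk_size=600, overlap=120)
# returns two chunks covering characters [0:600] and [480:1000].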
def build_embedding_index():
    global embedding_model, index, corpus_chunks, corpus_metas
    print("Loading embedding model:", EMBEDDING_MODEL_NAME)
    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    all_texts, all_metas = load_all_pdfs(PDF_DIR)
    corpus_chunks = []
    corpus_metas = []
    for text, meta in zip(all_texts, all_metas):
        chs = split_text(text)
        corpus_chunks.extend(chs)
        corpus_metas.extend([meta] * len(chs))
    if not corpus_chunks:
        print("No document chunks found - RAG will be empty.")
        index = None
        return
    print("Encoding", len(corpus_chunks), "chunks...")
    embs = embedding_model.encode(corpus_chunks, batch_size=32, show_progress_bar=False).astype("float32")
    dim = embs.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embs)
    print("FAISS index ready with dim", dim)


# Build in a background thread so Spaces can start quickly
threading.Thread(target=build_embedding_index, daemon=True).start()
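# Until the background build finishes, `index` stays None, so rag_search() below
# returns no context and early questions are answered without RAG.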
def rag_search(query: str, k: int = TOP_K):
    if index is None or embedding_model is None:
        return []
    q_vec = embedding_model.encode([query]).astype("float32")
    D, I = index.search(q_vec, k)
    results = []
    for dist, idx in zip(D[0], I[0]):
        if idx == -1:
            continue
        results.append({
            "score": float(dist),
            "text": corpus_chunks[idx],
            "meta": corpus_metas[idx],
        })
    return results
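# Example result shape (values illustrative):
#   [{"score": 0.83, "text": "...chunk text...", "meta": {"source": "maths_chapter3.pdf"}}]
# "score" is the raw FAISS L2 distance, so smaller means more similar.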
# ---------------------- LLM + RAG prompt building ----------------------
# Try to create a small local LLM pipeline for the demo; if unavailable, fall back to the Inference API
local_llm = None
try:
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_LOCAL)
    model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_LOCAL)
    local_llm = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
    )
    print("Local tiny LLM loaded for demo.")
except Exception:
    local_llm = None
    print("Local LLM not available; will use Inference API if HF_TOKEN is set.")

SYSTEM_PROMPT = """
You are "Jajabor", an expert SEBA Assamese tutor for Class 10.
Always prefer to answer in Assamese. If the student clearly asks for English, you may reply in English.
Rules:
- Use ONLY the given textbook context when requested.
- If you are not sure, say: "এই প্ৰশ্নটো পাঠ্যপুথিৰ অংশত স্পষ্টকৈ নাই, সেয়েহে মই নিশ্চিত নহয়।"
- বোঝাপৰা সহজ ভাষাত ব্যাখ্যা কৰা, উদাহৰণ দিয়ক।
- If it is a maths question, explain step-by-step clearly.
"""
def build_rag_prompt(context_blocks, question, chat_history):
    ctx = ""
    for i, block in enumerate(context_blocks, start=1):
        src = block["meta"].get("source", "textbook")
        ctx += f"\n[Context {i} – {src}]\n{block['text']}\n"
    hist = ""
    for role, msg in chat_history:
        hist += f"{role}: {msg}\n"
    prompt = (
        f"{SYSTEM_PROMPT}\n\n"
        f"পূৰ্বৰ বাৰ্তাসমূহ:\n{hist}\n"
        f"ছাত্ৰৰ প্ৰশ্ন:\n{question}\n\n"
        f"সম্পৰ্কিত পাঠ্যপুথিৰ অংশ:\n{ctx}\n\n"
        "এতিয়া একেদম সহায়ক আৰু বুজিবলৈ সহজ উত্তৰ দিয়া।"
    )
    return prompt


def llm_answer_with_rag(question: str, chat_history):
    retrieved = rag_search(question, TOP_K)
    prompt = build_rag_prompt(retrieved, question, chat_history)
    # Prefer the Inference API if available
    if inference_llm_client is not None:
        try:
            resp = inference_llm_client(inputs=prompt, params={"max_new_tokens": 512})
            # InferenceApi may return a list of dicts, a dict, or a plain string depending on the model
            if isinstance(resp, list) and resp and isinstance(resp[0], dict) and "generated_text" in resp[0]:
                out_text = resp[0]["generated_text"]
            elif isinstance(resp, dict) and "generated_text" in resp:
                out_text = resp["generated_text"]
            elif isinstance(resp, str):
                out_text = resp
            else:
                out_text = str(resp)
            # Some remote models echo the prompt; try to strip it
            if out_text.startswith(prompt):
                answer = out_text[len(prompt):].strip()
            else:
                answer = out_text.strip()
            return answer
        except Exception:
            pass
    # Fall back to the local tiny model
    if local_llm is not None:
        out = local_llm(prompt, num_return_sequences=1)[0]["generated_text"]
        if out.startswith(prompt):
            return out[len(prompt):].strip()
        return out
    # If nothing is available, return a safe fallback message
    return (
        "দুঃখিত—এই Spaces ইনষ্টলেশ্যনটোৱে প্ৰতিস্থাপন কৰিব পৰা কোনো LLM নাপালে।"
        " যদি আপুনি HF_TOKEN হিচাপে এক্সেস টোকেন যোগ কৰে, মই অনলাইন Inference API ব্যৱহাৰ কৰি উত্তৰ দিম।"
    )
# ---------------------- OCR + math helpers ----------------------
def ocr_from_image(img: Image.Image):
    if img is None:
        return ""
    try:
        img = img.convert("RGB")
    except Exception:
        pass
    try:
        text = pytesseract.image_to_string(img, lang="asm+eng")
    except Exception:
        try:
            text = pytesseract.image_to_string(img)
        except Exception:
            text = ""
    return text.strip()
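# Note: pytesseract only wraps the Tesseract binary; on Spaces the binary and the
# Assamese language data have to be installed as system packages (typically via a
# packages.txt listing e.g. tesseract-ocr and the Assamese traineddata package).
# Without the "asm" data the first call fails and the default-language fallback
# above is used instead.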
def is_likely_math(text: str) -> bool:
    math_chars = set("0123456789+-*/=^()%")
    if any(ch in text for ch in math_chars):
        return True
    kws = ["গণিত", "সমীকৰণ", "উদাহৰণ", "প্ৰশ্ন", "বীজগণিত"]
    return any(k in text for k in kws)
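# This heuristic is deliberately loose: any digit, bracket or operator (and even the
# generic word "প্ৰশ্ন", i.e. "question") routes the query through the maths path
# first; chat_logic below then asks the LLM to explain the sympy output.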
def solve_math_expression(expr: str):
    # sympify needs explicit multiplication ("2*x", not "2x"), hence the example in the error message
    try:
        expr = expr.replace("^", "**")
        if "=" in expr:
            left, right = expr.split("=", 1)
            left_s = sp.sympify(left)
            right_s = sp.sympify(right)
            eq = sp.Eq(left_s, right_s)
            sol = sp.solve(eq)
            steps = []
            steps.append("প্ৰথমে সমীকৰণ লওঁ:")
            steps.append(f"{sp.pretty(eq)}")
            steps.append("Sympy ৰ সহায়ত সমাধান পোৱা যায়:")
            steps.append(str(sol))
            explanation = "ধাপ-ধাপে সমাধান (সংক্ষেপে):\n" + "\n".join(f"- {s}" for s in steps)
            explanation += f"\n\nসেয়েহে সমাধান: {sol}"
        else:
            expr_s = sp.sympify(expr)
            simp = sp.simplify(expr_s)
            explanation = (
                "প্ৰদত্ত গণিতীয় অভিব্যক্তি:\n"
                f"{expr}\n\nসৰলীকৰণ কৰাৰ পিছত পোৱা যায়:\n{simp}"
            )
        return explanation
    except Exception:
        return (
            "মই সঠিকভাৱে গণিতীয় অভিব্যক্তি চিনাক্ত কৰিব নোৱাৰিলোঁ। "
            "দয়া কৰি সমীকৰণটো অলপ বেছি স্পষ্টকৈ লিখা: উদাহৰণ – 2*x + 3 = 7"
        )
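# Worked example (illustrative): solve_math_expression("2*x + 3 = 7") builds
# Eq(2*x + 3, 7), sympy.solve returns [2], and the step list ends with "[2]".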
# ---------------------- Chat logic ----------------------
def login_user(username, user_state):
    username = (username or "").strip()
    if not username:
        return user_state, "⚠️ অনুগ্ৰহ কৰি প্ৰথমে লগিনৰ বাবে এটা নাম লিখক।"
    user_id = get_or_create_user(username)
    user_state = {"username": username, "user_id": user_id}
    total, math_count = get_user_stats(user_id)
    stats = (
        f"👤 ব্যৱহাৰকাৰী: **{username}**\n\n"
        f"📊 মোট প্ৰশ্ন: **{total}**\n"
        f"🧮 গণিত প্ৰশ্ন: **{math_count}**"
    )
    return user_state, stats


def chat_logic(
    username,
    text_input,
    image_input,
    audio_input,
    chat_history,
    user_state,
):
    if not user_state or not user_state.get("user_id"):
        sys_msg = "⚠️ প্ৰথমে ওপৰত আপোনাৰ নাম লিখি **Login / লগিন** টিপক।"
        chat_history = chat_history + [[text_input or "", sys_msg]]
        return chat_history, user_state, None

    user_id = user_state["user_id"]
    final_query_parts = []

    # audio_input not handled in this demo
    ocr_text = ""
    if image_input is not None:
        img = None  # ensure the name is defined even if opening fails
        try:
            # Handle Gradio image types: filepath (string), PIL Image, bytes/file-like
            if isinstance(image_input, str):
                img = Image.open(image_input)
            elif hasattr(image_input, "name") and isinstance(image_input.name, str):
                # uploaded file-like object with a .name attribute
                img = Image.open(image_input.name)
            elif isinstance(image_input, (bytes, bytearray)):
                img = Image.open(io.BytesIO(image_input))
            else:
                img = image_input
        except Exception:
            try:
                if hasattr(image_input, "read"):
                    img = Image.open(io.BytesIO(image_input.read()))
                else:
                    img = None
            except Exception:
                img = None
        if img is not None:
            ocr_text = ocr_from_image(img)
            if ocr_text:
                final_query_parts.append(ocr_text)

    if text_input:
        final_query_parts.append(text_input)

    if not final_query_parts:
        sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক, কিম্বা ছবি আপলোড কৰক।"
        chat_history = chat_history + [["", sys_msg]]
        return chat_history, user_state, None

    full_query = "\n".join(final_query_parts)

    conv = []
    for u, b in chat_history:
        if u:
            conv.append(("Student", u))
        if b:
            conv.append(("Tutor", b))

    is_math = is_likely_math(full_query)
    if is_math:
        math_answer = solve_math_expression(full_query)
        combined_question = (
            full_query
            + "\n\nগণিত প্ৰোগ্ৰামে এই ফলাফল দিছে:\n"
            + math_answer
            + "\n\nঅনুগ্ৰহ কৰি শ্ৰেণী ১০ ৰ শিক্ষাৰ্থীৰ বাবে সহজ ভাষাত ব্যাখ্যা কৰক।"
        )
        final_answer = llm_answer_with_rag(combined_question, conv)
    else:
        final_answer = llm_answer_with_rag(full_query, conv)

    log_interaction(user_id, full_query, final_answer, is_math)
    display_question = text_input or ocr_text or "(empty)"
    chat_history = chat_history + [[display_question, final_answer]]
    return chat_history, user_state, None
# ---------------------- Gradio UI ----------------------
# Wrap UI creation + launch in try/except so runtime errors are logged clearly
import traceback

try:
    with gr.Blocks(title=APP_NAME, theme="soft") as demo:
        gr.Markdown(
            """
            # 🧭 জাজাবৰ – SEBA অসমীয়া ক্লাছ ১০ AI Tutor
            - 📘 SEBA ক্লাছ ১০ পাঠ্যপুথিৰ ওপৰত ভিত্তি কৰি উত্তৰ
            - 🗣️ টেক্স্ট + ছবি (OCR) ইনপুট
            - 🧮 গণিত প্ৰশ্নৰ ধাপ-ধাপে সমাধান
            - 👤 ইউজাৰ লগিন + প্ৰগতি (progress) সংৰক্ষণ
            """
        )

        user_state = gr.State({})

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 👤 লগিন")
                username_inp = gr.Textbox(
                    label="নাম / ইউজাৰ আইডি",
                    placeholder="উদাহৰণ: abu10, student01 ...",
                )
                login_btn = gr.Button("✅ Login / লগিন")
                stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।", elem_classes="stats-box")
                gr.Markdown(
                    """
                    ### 💡 টিপছ
                    - "ক্লাছ ১০ গণিত: উদাহৰণ ৩.১ প্ৰশ্ন ২" – এই ধৰণৰ প্ৰশ্ন ভাল
                    - ফটো আপলোড কৰিলে টেক্স্টটো OCR কৰি পঢ়িব চেষ্টা কৰা হয়
                    - সম্ভব হলে প্ৰশ্নটো অসমীয়াত সোধক 🙂
                    """
                )
            with gr.Column(scale=3):
                chat = gr.Chatbot(label="জাজাবৰ সৈতে কথোপকথন", height=500)
                text_inp = gr.Textbox(
                    label="আপোনাৰ প্ৰশ্ন লিখক",
                    placeholder="উদাহৰণ: \"ক্লাছ ১০ অসমীয়া: অনুচ্ছেদ পাঠ ১ ৰ মূল বিষয় কি?\"",
                    lines=2,
                )
                with gr.Row():
                    # Use a gr.Image type compatible with this Gradio version: 'filepath', 'pil' or 'numpy'.
                    # 'filepath' returns a string path on Spaces; chat_logic above handles it.
                    image_inp = gr.Image(label="📷 প্ৰশ্নৰ ছবি (Optional)", type="filepath")
                    audio_inp = gr.Audio(label="🎙️ কণ্ঠস্বৰ প্ৰশ্ন (Stub — not used now)", type="numpy")
                with gr.Row():
                    ask_btn = gr.Button("🤖 জাজাবৰক সোধক")

        login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])

        def wrapped_chat(text, image, audio, history, user_state_inner, username_inner):
            if user_state_inner and username_inner and not user_state_inner.get("username"):
                user_state_inner["username"] = username_inner
            return chat_logic(username_inner, text, image, audio, history, user_state_inner)

        # chat_logic returns three values (history, state, None), so the textbox is
        # listed as a third output and is cleared after every question.
        ask_btn.click(
            wrapped_chat,
            inputs=[text_inp, image_inp, audio_inp, chat, user_state, username_inp],
            outputs=[chat, user_state, text_inp],
            concurrency_limit=4,
        )

    # On Spaces the app is served on the Space's own URL, so no share tunnel is
    # needed; Gradio ignores share=True inside Spaces anyway.
    demo.launch()
except Exception:
    # Write the full traceback to a file for debugging in Spaces logs and print it to stdout
    tb = traceback.format_exc()
    print("--- Exception during UI startup ---")
    print(tb)
    with open("startup_error.log", "w") as f:
        f.write(tb)
    # Re-raise so the container shows the failure (useful for CI/Spaces logs)
    raise