# Captured from the Hugging Face Space "Sazid2/Assamese"
# (Space status at capture time: Sleeping).
# File size: 21,644 Bytes

# app.py
"""
Jajabor – SEBA Assamese Class 10 Tutor
Hugging Face Spaces ready Gradio app (single-file)

This file contains a working, lightweight adaptation of your Colab notebook
so it can run on Hugging Face Spaces (CPU-friendly demo).

IMPORTANT notes for deployment:
- Spaces has limited CPU/GPU. Large models (Qwen2.5, BAAI/bge-m3) won't run
  locally in most Spaces. This app uses smaller models for a working demo.
- For production-quality behavior, switch embeddings/LLM calls to the
  Hugging Face Inference API (use your HF token) or host on Colab/VM with GPU.

Create a `requirements.txt` with these entries (add to your repo):

    gradio==4.44.0
    pymupdf
    sentence-transformers
    faiss-cpu
    transformers
    accelerate
    torch
    pytesseract
    pillow
    sympy
    huggingface_hub

Place your SEBA Class10 PDFs in the repository under `pdfs/class10/`.

Usage on Spaces:
- Upload the repo (app.py + requirements.txt + pdfs/class10/*).
- If you want higher-quality LLMs/embeddings, set a repo secret HF_TOKEN
  and configure INFERENCE_EMBED_MODEL / INFERENCE_LLM_MODEL below.

"""

import io
import os
import re
import sqlite3
import threading
from datetime import datetime

import fitz  # PyMuPDF
import numpy as np
from PIL import Image

import gradio as gr
import faiss
import pytesseract
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import sympy as sp
from huggingface_hub import InferenceApi

# ---------------------- Configuration ----------------------
# Title passed to gr.Blocks when the UI is built.
APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor (Spaces demo)"

# All paths are resolved against the process working directory at import time.
BASE_DIR = os.path.abspath(".")
PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")  # PDFs indexed for retrieval
DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")  # SQLite file: users + interactions

# Lightweight defaults for Spaces demo. Replace with heavier models via Inference API.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_LOCAL = "sshleifer/tiny-gpt2"  # very small demo model (optional local)

# If you set HF_TOKEN as a repo secret / environment variable, the app will
# use the Inference API models below for better results.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
INFERENCE_EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2"  # example
INFERENCE_LLM_MODEL = "bigscience/bloomz-1b1"  # example remote model

# Chunking is done on characters (split_text slices the string by index).
CHUNK_SIZE = 600
CHUNK_OVERLAP = 120
# Number of chunks retrieved per question.
TOP_K = 5

# Retrieval state. These are filled in by build_embedding_index(); until then
# `index` is None and rag_search() returns an empty list.
embedding_model = None
index = None
corpus_chunks = []  # chunk texts, parallel to corpus_metas
corpus_metas = []   # one {"source": <pdf filename>} dict per chunk

# If HF_TOKEN provided, create inference clients.
# NOTE(review): inference_embed_client is created here but is not referenced
# anywhere else in this file; only inference_llm_client is used.
# NOTE(review): InferenceApi is the legacy huggingface_hub client — confirm the
# installed huggingface_hub version still exports it (requirements are unpinned).
inference_embed_client = None
inference_llm_client = None
if HF_TOKEN:
    try:
        inference_embed_client = InferenceApi(repo_id=INFERENCE_EMBED_MODEL, token=HF_TOKEN)
        inference_llm_client = InferenceApi(repo_id=INFERENCE_LLM_MODEL, token=HF_TOKEN)
    except Exception:
        # Any failure disables both remote clients; the local model (if loaded)
        # is then the only generation backend.
        inference_embed_client = None
        inference_llm_client = None

# ---------------------- Database ----------------------

def init_db(db_path=DB_PATH):
    """Create the ``users`` and ``interactions`` tables if they do not exist.

    Args:
        db_path: Path of the SQLite database file. Its parent directory is
            created when missing. A bare filename (no directory part) is
            accepted and resolved against the working directory.
    """
    parent_dir = os.path.dirname(db_path)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    schema_statements = (
        """
        CREATE TABLE IF NOT EXISTS users (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            username TEXT UNIQUE,
            created_at TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS interactions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id INTEGER,
            timestamp TEXT,
            query TEXT,
            answer TEXT,
            is_math INTEGER,
            FOREIGN KEY(user_id) REFERENCES users(id)
        )
        """,
    )

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        for statement in schema_statements:
            cur.execute(statement)
        conn.commit()
    finally:
        # Close even when a statement fails so the file handle is not leaked.
        conn.close()


def get_or_create_user(username: str):
    """Return the row id for ``username``, inserting the user on first use.

    Args:
        username: Display name / login id. Leading and trailing whitespace is
            ignored. ``None`` is treated like an empty string.

    Returns:
        The integer ``users.id`` for the name, or ``None`` when the name is
        empty after stripping.
    """
    from datetime import timezone

    username = (username or "").strip()
    if not username:
        return None

    # Same naive-UTC ISO format that datetime.utcnow().isoformat() produced,
    # without calling the deprecated utcnow().
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        # INSERT OR IGNORE leans on the UNIQUE constraint on users.username,
        # so two concurrent first logins with the same name cannot raise
        # IntegrityError the way a separate SELECT-then-INSERT could.
        cur.execute(
            "INSERT OR IGNORE INTO users (username, created_at) VALUES (?, ?)",
            (username, created_at),
        )
        conn.commit()
        cur.execute("SELECT id FROM users WHERE username=?", (username,))
        row = cur.fetchone()
        return row[0] if row else None
    finally:
        conn.close()


def log_interaction(user_id, query, answer, is_math: bool):
    """Append one question/answer pair to the ``interactions`` table.

    Args:
        user_id: ``users.id`` of the student who asked.
        query: The full question text that was answered.
        answer: The answer text shown to the student.
        is_math: Whether the question was handled as a maths question;
            stored as 1 or 0.
    """
    from datetime import timezone

    # Same naive-UTC ISO format that datetime.utcnow().isoformat() produced,
    # without calling the deprecated utcnow().
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(
            "INSERT INTO interactions (user_id, timestamp, query, answer, is_math) "
            "VALUES (?, ?, ?, ?, ?)",
            (user_id, timestamp, query, answer, 1 if is_math else 0),
        )
        conn.commit()
    finally:
        # Close even when the insert fails so the connection is not leaked.
        conn.close()


def get_user_stats(user_id):
    """Return ``(total_questions, math_questions)`` logged for ``user_id``.

    Both values are 0 when the user has no logged interactions (SQLite's
    ``SUM`` yields NULL on an empty set, which is mapped to 0 here).
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        row = conn.execute(
            "SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?",
            (user_id,),
        ).fetchone()
    finally:
        # Close even when the query fails so the connection is not leaked.
        conn.close()
    total = row[0] or 0
    math_count = row[1] or 0
    return total, math_count

# Create the tables at import time so they exist before any handler queries them.
init_db()

# ---------------------- PDF loading + RAG ----------------------

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts (pages without text are skipped), or an
        empty string when the file cannot be opened.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as exc:
        # Unreadable files are skipped rather than aborting the whole load,
        # but the reason is reported so a bad upload can be diagnosed.
        print("Could not open PDF:", pdf_path, "-", exc)
        return ""

    try:
        page_texts = [page.get_text("text") for page in doc]
    finally:
        # Release the document even if text extraction raises.
        doc.close()
    return "\n".join(txt for txt in page_texts if txt)


def load_all_pdfs(pdf_dir: str):
    """Read every ``*.pdf`` file in ``pdf_dir`` (sorted by name).

    Returns:
        A pair ``(texts, metas)`` of parallel lists: the extracted text of
        each PDF that yielded any text, and a ``{"source": filename}`` dict
        for it. Both lists are empty when the directory does not exist.
    """
    texts, metas = [], []

    if not os.path.exists(pdf_dir):
        print("PDF_DIR does not exist:", pdf_dir)
        return texts, metas

    pdf_names = [
        entry for entry in sorted(os.listdir(pdf_dir))
        if entry.lower().endswith(".pdf")
    ]
    for pdf_name in pdf_names:
        full_path = os.path.join(pdf_dir, pdf_name)
        print("Reading:", full_path)
        content = extract_text_from_pdf(full_path)
        if not content:
            # Nothing extracted (unreadable or image-only file): leave it out.
            continue
        texts.append(content)
        metas.append({"source": pdf_name})

    return texts, metas


def split_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split ``text`` into overlapping character windows.

    Args:
        text: The text to split. Empty or ``None`` yields an empty list.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        The list of chunks in order; whitespace-only windows are dropped.

    Raises:
        ValueError: If the window cannot advance (``overlap`` is not smaller
            than ``chunk_size``). Without this check the loop would never
            terminate and would keep appending chunks.
    """
    if not text:
        return []

    chunks = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + chunk_size, total)
        chunk = text[start:end]
        if chunk.strip():
            chunks.append(chunk)
        if end == total:
            break
        next_start = end - overlap
        if next_start <= start:
            raise ValueError(
                "overlap must be smaller than chunk_size "
                f"(chunk_size={chunk_size}, overlap={overlap})"
            )
        start = next_start
    return chunks


def build_embedding_index():
    """Load the embedding model, chunk every PDF and build the FAISS index.

    Fills the module globals ``embedding_model``, ``corpus_chunks``,
    ``corpus_metas`` and ``index``. When no chunks are found, ``index`` is
    set to ``None`` and nothing is encoded.
    """
    global embedding_model, index, corpus_chunks, corpus_metas

    print("Loading embedding model:", EMBEDDING_MODEL_NAME)
    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

    documents, document_metas = load_all_pdfs(PDF_DIR)

    corpus_chunks = []
    corpus_metas = []
    for document, meta in zip(documents, document_metas):
        pieces = split_text(document)
        corpus_chunks += pieces
        # Every chunk of a document points at that document's metadata dict.
        corpus_metas += [meta for _ in pieces]

    if len(corpus_chunks) == 0:
        print("No document chunks found - RAG will be empty.")
        index = None
        return

    print("Encoding", len(corpus_chunks), "chunks...")
    vectors = embedding_model.encode(
        corpus_chunks, batch_size=32, show_progress_bar=False
    )
    vectors = vectors.astype("float32")
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    print("FAISS index ready with dim", dim)


# Build the index in a daemon thread so the rest of this module (including the
# UI) does not wait for model loading and PDF encoding. Until the build
# finishes, `index` is still None and rag_search() returns no context.
threading.Thread(target=build_embedding_index, daemon=True).start()


def rag_search(query: str, k: int = TOP_K):
    """Return up to ``k`` indexed chunks nearest to ``query``.

    Each hit is a dict with ``score`` (the distance reported by the index,
    as a float), ``text`` (the chunk) and ``meta`` (its source metadata).
    An empty list is returned while the index or model is not available.
    """
    if embedding_model is None or index is None:
        return []

    query_vec = embedding_model.encode([query]).astype("float32")
    distances, positions = index.search(query_vec, k)

    # The index reports -1 for slots it could not fill; those are skipped.
    return [
        {
            "score": float(distance),
            "text": corpus_chunks[position],
            "meta": corpus_metas[position],
        }
        for distance, position in zip(distances[0], positions[0])
        if position != -1
    ]

# ---------------------- LLM + RAG prompt building ----------------------

# Try to create a small local LLM pipeline for the demo. If loading fails for
# any reason, local_llm stays None and llm_answer_with_rag() relies on the
# Inference API client (when HF_TOKEN is set) or its fixed fallback message.
local_llm = None
try:
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_LOCAL)
    model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_LOCAL)
    local_llm = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
    )
    print("Local tiny LLM loaded for demo.")
except Exception:
    # NOTE(review): the underlying error is not reported here, only the
    # generic message below.
    local_llm = None
    print("Local LLM not available; will use Inference API if HF_TOKEN is set.")

# Instruction text placed at the top of every prompt by build_rag_prompt().
SYSTEM_PROMPT = """
You are "Jajabor", an expert SEBA Assamese tutor for Class 10.
Always prefer to answer in Assamese. If the student clearly asks for English, you may reply in English.

Rules:
- Use ONLY the given textbook context when requested.
- If you are not sure, say: "এই প্ৰশ্নটো পাঠ্যপুথিৰ অংশত স্পষ্টকৈ নাই, সেয়েহে মই নিশ্চিত নহয়।"
- বোঝাপৰা সহজ ভাষাত ব্যাখ্যা কৰা, উদাহৰণ দিয়ক।
- If it is a maths question, explain step-by-step clearly.
"""


def build_rag_prompt(context_blocks, question, chat_history):
    """Assemble the full LLM prompt.

    Args:
        context_blocks: Retrieved hits; each needs a ``text`` entry and a
            ``meta`` dict (its ``source`` defaults to ``"textbook"``).
        question: The student's current question.
        chat_history: Iterable of ``(role, message)`` pairs, oldest first.

    Returns:
        One string: system prompt, prior messages, the question, the
        numbered context blocks and the closing instruction.
    """
    context_sections = []
    for number, block in enumerate(context_blocks, start=1):
        source = block["meta"].get("source", "textbook")
        context_sections.append(f"\n[Context {number} – {source}]\n{block['text']}\n")
    ctx = "".join(context_sections)

    hist = "".join(f"{role}: {msg}\n" for role, msg in chat_history)

    return f"{SYSTEM_PROMPT}\n\nপূর্বৰ বাৰ্তাসমূহ:\n{hist}\nসদস্যৰ প্ৰশ্ন:\n{question}\n\nসম্পৰ্কিত পাঠ্যপুথিৰ অংশ:\n{ctx}\n\nএতিয়া একেদম সহায়ক আৰু বুজিবলৈ সহজ উত্তৰ দিয়া।"


def _extract_generated_text(resp):
    """Return the generated string from an Inference API response, else None."""
    # Text-generation endpoints commonly answer with a list of result dicts.
    if isinstance(resp, (list, tuple)) and resp:
        resp = resp[0]
    if isinstance(resp, str):
        return resp
    if isinstance(resp, dict) and "error" not in resp:
        text = resp.get("generated_text")
        if isinstance(text, str):
            return text
    return None


def _strip_prompt_echo(text, prompt):
    """Drop a leading copy of ``prompt`` from ``text`` and trim whitespace."""
    if text.startswith(prompt):
        text = text[len(prompt):]
    return text.strip()


def llm_answer_with_rag(question: str, chat_history):
    """Answer ``question`` with retrieved textbook context.

    The remote Inference API client is tried first when configured, then the
    local pipeline. A fixed Assamese notice is returned when neither backend
    produces text.

    Args:
        question: The student's question (may already include extra
            instructions appended by the caller).
        chat_history: ``(role, message)`` pairs passed to the prompt builder.

    Returns:
        The answer text.
    """
    retrieved = rag_search(question, TOP_K)
    prompt = build_rag_prompt(retrieved, question, chat_history)

    # Prefer Inference API if available
    if inference_llm_client is not None:
        try:
            resp = inference_llm_client(inputs=prompt, params={"max_new_tokens": 512})
        except Exception as exc:
            # Best effort: report the failure and try the local model instead.
            print("Inference API call failed; trying local model:", exc)
        else:
            remote_text = _extract_generated_text(resp)
            if remote_text is not None:
                return _strip_prompt_echo(remote_text, prompt)
            # Error payloads or unknown shapes must not be shown as an answer.
            print("Inference API returned no usable text:", repr(resp)[:200])

    # Fallback to local tiny model
    if local_llm is not None:
        try:
            out = local_llm(prompt, num_return_sequences=1)[0]["generated_text"]
        except Exception as exc:
            # Generation errors should not take down the chat handler.
            print("Local LLM generation failed:", exc)
        else:
            return _strip_prompt_echo(out, prompt)

    # If nothing available, return a safe fallback
    return (
        "দুখঃখিত—এই Spaces ইনষ্টলেশ্যনটোৱে প্ৰতিস্থাপন কৰিব পৰা কোনো LLM নাপালে।"
        " যদি আপুনি HF_TOKEN হিচাপে এক্সেস টোকেন যোগ কৰে, মই অনলাইন Inference API ব্যৱহাৰ কৰি উত্তৰ দিম."
    )

# ---------------------- OCR + math helpers ----------------------

def ocr_from_image(img: Image.Image):
    """Run OCR on ``img`` and return the recognised text, stripped.

    Recognition is attempted with ``lang="asm+eng"`` first and then with the
    default language settings. An empty string is returned for ``None``
    input or when both attempts fail.
    """
    if img is None:
        return ""

    try:
        img = img.convert("RGB")
    except Exception:
        # Conversion is optional; OCR is still attempted on the original object.
        pass

    recognised = ""
    for ocr_options in ({"lang": "asm+eng"}, {}):
        try:
            recognised = pytesseract.image_to_string(img, **ocr_options)
        except Exception:
            recognised = ""
            continue
        break
    return recognised.strip()


# Subject words that mark a question as mathematical (maths, equation, algebra).
_MATH_KEYWORDS = ("গণিত", "সমীকৰণ", "বীজগণিত")

# An operator sitting between two operands. An operand is a digit, a
# parenthesis, or a letter at the edge of a word (so "2x+3" and "x - 1" match
# but the hyphen in "well-known" does not). A sign may follow the operator,
# as in "x = -5".
_MATH_EXPRESSION_RE = re.compile(
    r"(?:\d|(?<![A-Za-z])[A-Za-z]|\))"
    r"\s*[-+*/^=]\s*[-+]?\s*"
    r"(?:\d|[A-Za-z](?![A-Za-z])|\()"
)


def is_likely_math(text: str) -> bool:
    """Heuristically decide whether ``text`` is a maths question.

    A text counts as maths when it names a maths subject keyword or contains
    something shaped like an arithmetic/algebraic expression. A lone digit,
    hyphen or parenthesis (as in "Class 10" or "well-known") is not enough,
    and neither are generic words such as "question" or "example".

    Args:
        text: The student's query; ``None`` and ``""`` are treated as
            non-maths.

    Returns:
        ``True`` when the query should take the maths path.
    """
    if not text:
        return False
    if any(keyword in text for keyword in _MATH_KEYWORDS):
        return True
    return _MATH_EXPRESSION_RE.search(text) is not None


# Characters accepted in a maths input: digits, ASCII letters, whitespace,
# arithmetic operators, parentheses, decimal point and comma.
_SAFE_MATH_INPUT_RE = re.compile(r"[0-9A-Za-z\s+\-*/^=().,]+")

# Multi-letter names allowed in a maths input; any other name must be a
# single-letter variable.
_ALLOWED_MATH_NAMES = frozenset(
    {"sin", "cos", "tan", "sqrt", "log", "ln", "exp", "abs", "pi"}
)


def solve_math_expression(expr: str):
    """Solve an equation or simplify an expression and describe the result.

    Args:
        expr: Text such as ``"2x + 3 = 7"`` or ``"x^2 + 2x + 1"``. ``^`` is
            read as exponentiation and ``2x`` as ``2*x``.

    Returns:
        An Assamese explanation string. When the input is not recognised as
        a safe, parseable maths expression, a fixed message asking the
        student to rewrite it is returned instead.
    """
    failure_message = (
        "মই সঠিকভাৱে গণিতীয় অভিব্যক্তি চিনাক্ত কৰিব নোৱাৰিলোঁ। "
        "দয়া কৰি সমীকৰণটো অলপ বেছি স্পষ্টকৈ লিখা: উদাহৰণ – 2x + 3 = 7"
    )

    # SECURITY: SymPy's string parsers evaluate their input with eval(), and
    # this text comes straight from the user (typed or OCR'd). Only a narrow
    # character set, single-letter variables and a few function names are
    # let through. This is input filtering, not a sandbox.
    if not isinstance(expr, str) or not _SAFE_MATH_INPUT_RE.fullmatch(expr):
        return failure_message
    for name in re.findall(r"[A-Za-z]+", expr):
        if len(name) > 1 and name not in _ALLOWED_MATH_NAMES:
            return failure_message

    try:
        from sympy.parsing.sympy_parser import (
            implicit_multiplication_application,
            parse_expr,
            standard_transformations,
        )

        # Implicit multiplication lets "2x" parse as 2*x; without it the
        # example quoted in the failure message itself cannot be parsed.
        transformations = standard_transformations + (
            implicit_multiplication_application,
        )

        def parse(fragment):
            return parse_expr(fragment, transformations=transformations)

        expr = expr.replace("^", "**")
        if "=" in expr:
            left, right = expr.split("=", 1)
            eq = sp.Eq(parse(left), parse(right))
            sol = sp.solve(eq)
            steps = [
                "প্ৰথমে সমীকৰণ লওঁ:",
                f"{sp.pretty(eq)}",
                "Sympy ৰ সহায়ত সমাধান পোৱা যায়:",
                str(sol),
            ]
            explanation = "ধাপ-ধাপে সমাধান (সংক্ষেপে):\n" + "\n".join(f"- {s}" for s in steps)
            explanation += f"\n\nসেয়েহে সমাধান: {sol}"
        else:
            simp = sp.simplify(parse(expr))
            explanation = (
                "প্ৰদত্ত গণিতীয় অভিব্যক্তি:\n"
                f"{expr}\n\nসরলীকৰণ কৰাৰ পিছত পোৱা যায়:\n{simp}"
            )
        return explanation
    except Exception:
        # Anything SymPy cannot parse or solve becomes the friendly message.
        return failure_message

# ---------------------- Chat logic ----------------------

def login_user(username, user_state):
    """Log a student in and build the stats panel text.

    Args:
        username: Name typed into the login box (may be ``None``).
        user_state: The current session state; returned untouched when the
            name is blank.

    Returns:
        ``(user_state, markdown)``: the new state dict with ``username`` and
        ``user_id`` plus a stats summary, or the unchanged state plus a
        warning when no name was given.
    """
    cleaned = username.strip() if username else ""
    if cleaned == "":
        return user_state, "⚠️ অনুগ্ৰহ কৰি প্ৰথমে লগিনৰ বাবে এটা নাম লিখক।"

    username = cleaned
    user_id = get_or_create_user(username)
    total, math_count = get_user_stats(user_id)

    stats_lines = [
        f"👤 ব্যৱহাৰকাৰী: **{username}**\n\n",
        f"📊 মোট প্ৰশ্ন: **{total}**\n",
        f"🧮 গণিত প্ৰশ্ন: **{math_count}**",
    ]
    return {"username": username, "user_id": user_id}, "".join(stats_lines)


def _open_uploaded_image(image_input):
    """Best-effort conversion of an uploaded image payload to a PIL image.

    Accepts a file path, an object with a string ``.name`` path, raw bytes,
    or an already-decoded image (returned as is). Returns ``None`` when the
    payload cannot be opened.
    """
    try:
        if isinstance(image_input, str):
            return Image.open(image_input)
        if isinstance(getattr(image_input, "name", None), str):
            return Image.open(image_input.name)
        if isinstance(image_input, (bytes, bytearray)):
            return Image.open(io.BytesIO(image_input))
        return image_input
    except Exception as exc:
        print("Could not open uploaded image directly:", exc)

    # Last resort: treat the payload as a readable file-like object.
    if hasattr(image_input, "read"):
        try:
            return Image.open(io.BytesIO(image_input.read()))
        except Exception as exc:
            print("Could not read uploaded image stream:", exc)
    return None


def chat_logic(
    username,
    text_input,
    image_input,
    audio_input,
    chat_history,
    user_state,
):
    """Handle one chat turn: validate, OCR, answer and log.

    Args:
        username: Name from the login box (not used by this function).
        text_input: Typed question; surrounding whitespace is ignored.
        image_input: Optional uploaded image whose OCR text is prepended to
            the question.
        audio_input: Accepted for interface compatibility; not used.
        chat_history: List of ``[user_message, bot_message]`` pairs, or
            ``None`` for an empty conversation. It is not mutated.
        user_state: Session dict holding ``user_id`` once logged in.

    Returns:
        ``(chat_history, user_state, None)`` where ``chat_history`` is a new
        list with this turn appended.
    """
    # The chat component may hand over None before the first message.
    chat_history = list(chat_history or [])
    text_input = (text_input or "").strip()

    if not user_state or not user_state.get("user_id"):
        sys_msg = "⚠️ প্ৰথমে ওপৰত আপোনাৰ নাম লিখি **Login / লগিন** টিপক।"
        return chat_history + [[text_input, sys_msg]], user_state, None

    user_id = user_state["user_id"]

    final_query_parts = []

    # audio_input not handled in this demo

    ocr_text = ""
    if image_input is not None:
        img = _open_uploaded_image(image_input)
        if img is not None:
            ocr_text = ocr_from_image(img)
            if ocr_text:
                final_query_parts.append(ocr_text)

    if text_input:
        final_query_parts.append(text_input)

    if not final_query_parts:
        sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক, কিম্বা ছবি আপলোড কৰক।"
        return chat_history + [["", sys_msg]], user_state, None

    full_query = "\n".join(final_query_parts)

    # Flatten the [user, bot] pairs into (role, message) tuples for the prompt.
    conv = []
    for user_msg, bot_msg in chat_history:
        if user_msg:
            conv.append(("Student", user_msg))
        if bot_msg:
            conv.append(("Tutor", bot_msg))

    is_math = is_likely_math(full_query)

    if is_math:
        math_answer = solve_math_expression(full_query)
        combined_question = (
            full_query
            + "\n\nগণিত প্ৰোগ্ৰামে এই ফলাফল দিছে:\n"
            + math_answer
            + "\n\nঅনুগ্ৰহ কৰি শ্রেণী ১০ ৰ শিক্ষাৰ্থীৰ বাবে সহজ ভাষাত ব্যাখ্যা কৰক।"
        )
        final_answer = llm_answer_with_rag(combined_question, conv)
    else:
        final_answer = llm_answer_with_rag(full_query, conv)

    try:
        log_interaction(user_id, full_query, final_answer, is_math)
    except Exception as exc:
        # Logging is bookkeeping; a database problem must not discard an
        # answer that has already been computed.
        print("Could not log interaction:", exc)

    display_question = text_input or ocr_text or "(empty)"
    chat_history = chat_history + [[display_question, final_answer]]

    return chat_history, user_state, None

# ---------------------- Gradio UI ----------------------

# Build and launch the UI inside try/except so a startup failure is printed
# (and saved to startup_error.log) with its full traceback before re-raising.
import traceback

try:
    with gr.Blocks(title=APP_NAME, theme="soft") as demo:
        gr.Markdown(
            """
            # 🧭 জাজাবৰ – SEBA অসমীয়া ক্লাছ ১০ AI Tutor

            - 📘 SEBA ক্লাছ ১০ পাঠ্যপুথিৰ ওপৰত ভিত্তি কৰি উত্তৰ
            - 🗣️ টেক্স্ট + ছবি (OCR) ইনপুট
            - 🧮 গণিত প্ৰশ্নৰ ধাপ-ধাপে সমাধান
            - 👤 ইউজাৰ লগিন + প্ৰগতি (progress) সংৰক্ষণ
            """
        )

        # Per-session login state: {} until login_user() fills it in.
        user_state = gr.State({})

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 👤 লগিন")
                username_inp = gr.Textbox(
                    label="নাম / ইউজাৰ আইডি",
                    placeholder="উদাহৰণ: abu10, student01 ...",
                )
                login_btn = gr.Button("✅ Login / লগিন")
                stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।", elem_classes="stats-box")

                gr.Markdown(
                    """
                    ### 💡 টিপছ
                    - "ক্লাছ ১০ গণিত: উদাহৰণ ৩.১ প্ৰশ্ন ২" – এই ধৰণৰ প্ৰশ্ন ভাল
                    - ফটো আপলোড করলে টেক্স্টটো OCR কৰি পঢ়িব চেষ্টা কৰা হয়
                    - সম্ভব হলে প্ৰশ্নটো অসমীয়াত সোধক 🙂
                    """
                )

            with gr.Column(scale=3):
                chat = gr.Chatbot(label="জাজাবৰ সৈতে কথোপকথন", height=500)

                text_inp = gr.Textbox(
                    label="আপোনাৰ প্ৰশ্ন লিখক",
                    placeholder="উদাহৰণ: \"ক্লাছ ১০ অসমীয়া: অনুচ্ছেদ পাঠ ১ ৰ মূল বিষয় কি?\"",
                    lines=2,
                )

                with gr.Row():
                    # type="filepath" hands the handler a string path, which
                    # chat_logic() opens itself.
                    image_inp = gr.Image(label="📷 প্ৰশ্নৰ ছবি (Optional)", type="filepath")
                    audio_inp = gr.Audio(label="🎙️ কণ্ঠস্বৰ প্ৰশ্ন (Stub — not used now)", type="numpy")

                with gr.Row():
                    ask_btn = gr.Button("🤖 জাজাবৰক সোধক")

        login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])

        def wrapped_chat(text, image, audio, history, user_state_inner, username_inner):
            """Adapt the click inputs to chat_logic() and its result to the two outputs."""
            if user_state_inner and username_inner and not user_state_inner.get("username"):
                user_state_inner["username"] = username_inner
            result = chat_logic(username_inner, text, image, audio, history, user_state_inner)
            # chat_logic() returns three values, but this event has only two
            # output components, so return just the history and the state.
            return result[0], result[1]

        ask_btn.click(
            wrapped_chat,
            inputs=[text_inp, image_inp, audio_inp, chat, user_state, username_inp],
            outputs=[chat, user_state],
            concurrency_limit=4,
        )

    # Hugging Face Spaces serves the app itself, so no share tunnel is needed
    # there; leaving share off also avoids opening a public link when the
    # file is run on a developer machine.
    demo.launch()

except Exception:
    # Print the full traceback and also save it to a file for later inspection.
    tb = traceback.format_exc()
    print("--- Exception during UI startup ---")
    print(tb)
    try:
        with open("startup_error.log", "w", encoding="utf-8") as f:
            f.write(tb)
    except OSError as log_err:
        # A read-only filesystem must not hide the original startup error.
        print("Could not write startup_error.log:", log_err)
    # Re-raise so the container shows the failure (useful for CI/Spaces logs)
    raise