# app.py
"""
Jajabor – SEBA Assamese Class 10 Tutor
Hugging Face Spaces-ready Gradio app (single file)
This file contains a working, lightweight adaptation of your Colab notebook
so it can run on Hugging Face Spaces (CPU-friendly demo).
IMPORTANT notes for deployment:
- Spaces has limited CPU/GPU. Large models (Qwen2.5, BAAI/bge-m3) won't run
locally in most Spaces. This app uses smaller models for a working demo.
- For production-quality behavior, switch embeddings/LLM calls to the
Hugging Face Inference API (use your HF token) or host on Colab/VM with GPU.
Create a `requirements.txt` with these entries (add to your repo):
gradio==4.44.0
pymupdf
sentence-transformers
faiss-cpu
transformers
accelerate
torch
pytesseract
pillow
sympy
huggingface_hub
Place your SEBA Class10 PDFs in the repository under `pdfs/class10/`.
Usage on Spaces:
- Upload the repo (app.py + requirements.txt + pdfs/class10/*).
- If you want higher-quality LLMs/embeddings, set a repo secret HF_TOKEN
and configure INFERENCE_EMBED_MODEL / INFERENCE_LLM_MODEL below.
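- To test locally: pip install -r requirements.txt, then run: python app.py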
"""
import os
import io
import sqlite3
from datetime import datetime
import threading
import fitz # PyMuPDF
import numpy as np
from PIL import Image
import gradio as gr
import faiss
import pytesseract
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import sympy as sp
from huggingface_hub import InferenceApi  # NOTE: deprecated in newer huggingface_hub releases in favor of InferenceClient
# ---------------------- Configuration ----------------------
APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor (Spaces demo)"
BASE_DIR = os.path.abspath(".")
PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")
# Lightweight defaults for Spaces demo. Replace with heavier models via Inference API.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_LOCAL = "sshleifer/tiny-gpt2" # very small demo model (optional local)
# If you set HF_TOKEN as a repo secret / environment variable, the app will
# use the Inference API models below for better results.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
INFERENCE_EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2" # example
INFERENCE_LLM_MODEL = "bigscience/bloomz-1b1" # example remote model
CHUNK_SIZE = 600
CHUNK_OVERLAP = 120
TOP_K = 5
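# NOTE: chunk sizes are measured in characters, not tokens. Each chunk holds up
# to 600 characters and consecutive chunks share 120, so text at a chunk
# boundary still appears whole in at least one chunk (~20% storage overhead).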
# Global variables initialized later
embedding_model = None
index = None
corpus_chunks = []
corpus_metas = []
# If HF_TOKEN provided, create inference clients
inference_embed_client = None
inference_llm_client = None
if HF_TOKEN:
try:
inference_embed_client = InferenceApi(repo_id=INFERENCE_EMBED_MODEL, token=HF_TOKEN)
inference_llm_client = InferenceApi(repo_id=INFERENCE_LLM_MODEL, token=HF_TOKEN)
except Exception:
inference_embed_client = None
inference_llm_client = None
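# If your huggingface_hub release has dropped InferenceApi, a rough drop-in
# sketch (untested here) using the newer InferenceClient would be:
#     from huggingface_hub import InferenceClient
#     client = InferenceClient(model=INFERENCE_LLM_MODEL, token=HF_TOKEN)
#     text = client.text_generation(prompt, max_new_tokens=512)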
# ---------------------- Database ----------------------
def init_db(db_path=DB_PATH):
os.makedirs(os.path.dirname(db_path), exist_ok=True)
conn = sqlite3.connect(db_path)
cur = conn.cursor()
cur.execute(
"""
CREATE TABLE IF NOT EXISTS users (
id INTEGER PRIMARY KEY AUTOINCREMENT,
username TEXT UNIQUE,
created_at TEXT
)
"""
)
cur.execute(
"""
CREATE TABLE IF NOT EXISTS interactions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER,
timestamp TEXT,
query TEXT,
answer TEXT,
is_math INTEGER,
FOREIGN KEY(user_id) REFERENCES users(id)
)
"""
)
conn.commit()
conn.close()
def get_or_create_user(username: str):
username = username.strip()
if not username:
return None
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute("SELECT id FROM users WHERE username=?", (username,))
row = cur.fetchone()
if row:
user_id = row[0]
else:
cur.execute(
"INSERT INTO users (username, created_at) VALUES (?, ?)",
(username, datetime.utcnow().isoformat()),
)
conn.commit()
user_id = cur.lastrowid
conn.close()
return user_id
def log_interaction(user_id, query, answer, is_math: bool):
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute(
"""
INSERT INTO interactions (user_id, timestamp, query, answer, is_math)
VALUES (?, ?, ?, ?, ?)
""",
(user_id, datetime.utcnow().isoformat(), query, answer, 1 if is_math else 0),
)
conn.commit()
conn.close()
def get_user_stats(user_id):
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute("SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?", (user_id,))
row = cur.fetchone()
conn.close()
total = row[0] or 0
math_count = row[1] or 0
return total, math_count
init_db()
# ---------------------- PDF loading + RAG ----------------------
def extract_text_from_pdf(pdf_path: str) -> str:
    try:
        doc = fitz.open(pdf_path)
    except Exception:
        return ""
    pages = []
    for page in doc:
        txt = page.get_text("text")
        if txt:
            pages.append(txt)
    doc.close()  # release the file handle; the document is not context-managed here
    return "\n".join(pages)
def load_all_pdfs(pdf_dir: str):
texts = []
metas = []
if not os.path.exists(pdf_dir):
print("PDF_DIR does not exist:", pdf_dir)
return texts, metas
for fname in sorted(os.listdir(pdf_dir)):
if fname.lower().endswith(".pdf"):
path = os.path.join(pdf_dir, fname)
print("Reading:", path)
text = extract_text_from_pdf(path)
if text:
texts.append(text)
metas.append({"source": fname})
return texts, metas
def split_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
chunks = []
start = 0
L = len(text)
while start < L:
end = min(start + chunk_size, L)
chunk = text[start:end]
if chunk.strip():
chunks.append(chunk)
if end == L:
break
start = end - overlap
return chunks
def build_embedding_index():
global embedding_model, index, corpus_chunks, corpus_metas
print("Loading embedding model:", EMBEDDING_MODEL_NAME)
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
all_texts, all_metas = load_all_pdfs(PDF_DIR)
corpus_chunks = []
corpus_metas = []
for text, meta in zip(all_texts, all_metas):
chs = split_text(text)
corpus_chunks.extend(chs)
corpus_metas.extend([meta] * len(chs))
if not corpus_chunks:
print("No document chunks found - RAG will be empty.")
index = None
return
print("Encoding", len(corpus_chunks), "chunks...")
embs = embedding_model.encode(corpus_chunks, batch_size=32, show_progress_bar=False).astype("float32")
dim = embs.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embs)
print("FAISS index ready with dim", dim)
# Build in a background thread so Spaces can start quickly
threading.Thread(target=build_embedding_index, daemon=True).start()
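# Until the background build completes, `index` stays None and rag_search()
# returns [], so early questions are answered without textbook context.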
def rag_search(query: str, k: int = TOP_K):
if index is None or embedding_model is None:
return []
q_vec = embedding_model.encode([query]).astype("float32")
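    # IndexFlatL2 scores are squared L2 distances: smaller means more similar.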
D, I = index.search(q_vec, k)
results = []
for dist, idx in zip(D[0], I[0]):
if idx == -1:
continue
results.append({
"score": float(dist),
"text": corpus_chunks[idx],
"meta": corpus_metas[idx],
})
return results
# ---------------------- LLM + RAG prompt building ----------------------
# Try to create a small local LLM pipeline for the demo; if unavailable, fall back to the Inference API
local_llm = None
try:
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_LOCAL)
model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_LOCAL)
local_llm = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
max_new_tokens=256,
do_sample=True,
temperature=0.3,
top_p=0.9,
)
print("Local tiny LLM loaded for demo.")
except Exception:
local_llm = None
print("Local LLM not available; will use Inference API if HF_TOKEN is set.")
SYSTEM_PROMPT = """
You are "Jajabor", an expert SEBA Assamese tutor for Class 10.
Always prefer to answer in Assamese. If the student clearly asks for English, you may reply in English.
Rules:
- Use ONLY the given textbook context when requested.
- If you are not sure, say: "এই প্ৰশ্নটো পাঠ্যপুথিৰ অংশত স্পষ্টকৈ নাই, সেয়েহে মই নিশ্চিত নহয়।"
- বুজিব পৰা সহজ ভাষাত ব্যাখ্যা কৰক, উদাহৰণ দিয়ক।
- If it is a maths question, explain step-by-step clearly.
"""
def build_rag_prompt(context_blocks, question, chat_history):
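    # Concatenates the retrieved chunks (tagged with their source PDF) and the
    # prior turns into one prompt. Note the demo models have short context
    # windows, so very long histories may get truncated by the model.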
ctx = ""
for i, block in enumerate(context_blocks, start=1):
src = block["meta"].get("source", "textbook")
ctx += f"\n[Context {i}{src}]\n{block['text']}\n"
hist = ""
for role, msg in chat_history:
hist += f"{role}: {msg}\n"
prompt = f"{SYSTEM_PROMPT}\n\nপূর্বৰ বাৰ্তাসমূহ:\n{hist}\nসদস্যৰ প্ৰশ্ন:\n{question}\n\nসম্পৰ্কিত পাঠ্যপুথিৰ অংশ:\n{ctx}\n\nএতিয়া একেদম সহায়ক আৰু বুজিবলৈ সহজ উত্তৰ দিয়া।"
return prompt
def llm_answer_with_rag(question: str, chat_history):
retrieved = rag_search(question, TOP_K)
prompt = build_rag_prompt(retrieved, question, chat_history)
# Prefer Inference API if available
if inference_llm_client is not None:
try:
            resp = inference_llm_client(inputs=prompt, params={"max_new_tokens": 512})
            # InferenceApi may return a list of dicts, a dict, or a raw string
            # depending on the model/task
            if isinstance(resp, list) and resp and isinstance(resp[0], dict):
                out_text = resp[0].get("generated_text", str(resp))
            elif isinstance(resp, dict) and "generated_text" in resp:
                out_text = resp["generated_text"]
            elif isinstance(resp, str):
                out_text = resp
            else:
                out_text = str(resp)
# Some remote models echo the prompt; try to strip prompt
if out_text.startswith(prompt):
answer = out_text[len(prompt):].strip()
else:
answer = out_text.strip()
return answer
except Exception:
pass
# Fallback to local tiny model
if local_llm is not None:
out = local_llm(prompt, num_return_sequences=1)[0]["generated_text"]
if out.startswith(prompt):
return out[len(prompt):].strip()
return out
# If nothing available, return a safe fallback
    return (
        "দুঃখিত—এই Spaces ইনষ্টলেশ্যনটোৱে ব্যৱহাৰ কৰিব পৰা কোনো LLM নাপালে।"
        " যদি আপুনি HF_TOKEN হিচাপে এক্সেস টোকেন যোগ কৰে, মই অনলাইন Inference API ব্যৱহাৰ কৰি উত্তৰ দিম।"
    )
# ---------------------- OCR + math helpers ----------------------
def ocr_from_image(img: Image.Image):
if img is None:
return ""
try:
img = img.convert("RGB")
except Exception:
pass
try:
text = pytesseract.image_to_string(img, lang="asm+eng")
except Exception:
try:
text = pytesseract.image_to_string(img)
except Exception:
text = ""
return text.strip()
def is_likely_math(text: str) -> bool:
    """Loose heuristic: any ASCII digit/operator, or an Assamese maths
    keyword, routes the query through sympy before the LLM."""
    math_chars = set("0123456789+-*/=^()%")
    if any(ch in text for ch in math_chars):
        return True
    # "প্ৰশ্ন" (question) is deliberately excluded: it appears in nearly every query.
    kws = ["গণিত", "সমীকৰণ", "উদাহৰণ", "বীজগণিত"]
    return any(k in text for k in kws)
def solve_math_expression(expr: str):
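    # Input containing "=" is parsed as an equation and solved for its free
    # symbols; anything else is sympified and simplified. Assamese words make
    # sympify raise, which falls through to the except branch below.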
try:
expr = expr.replace("^", "**")
if "=" in expr:
left, right = expr.split("=", 1)
left_s = sp.sympify(left)
right_s = sp.sympify(right)
eq = sp.Eq(left_s, right_s)
sol = sp.solve(eq)
steps = []
steps.append("প্ৰথমে সমীকৰণ লওঁ:")
steps.append(f"{sp.pretty(eq)}")
steps.append("Sympy ৰ সহায়ত সমাধান পোৱা যায়:")
steps.append(str(sol))
explanation = "ধাপ-ধাপে সমাধান (সংক্ষেপে):\n" + "\n".join(f"- {s}" for s in steps)
explanation += f"\n\nসেয়েহে সমাধান: {sol}"
else:
expr_s = sp.sympify(expr)
simp = sp.simplify(expr_s)
            explanation = (
                "প্ৰদত্ত গণিতীয় অভিব্যক্তি:\n"
                f"{expr}\n\nসৰলীকৰণ কৰাৰ পিছত পোৱা যায়:\n{simp}"
            )
return explanation
except Exception:
return (
"মই সঠিকভাৱে গণিতীয় অভিব্যক্তি চিনাক্ত কৰিব নোৱাৰিলোঁ। "
"দয়া কৰি সমীকৰণটো অলপ বেছি স্পষ্টকৈ লিখা: উদাহৰণ – 2x + 3 = 7"
)
# ---------------------- Chat logic ----------------------
def login_user(username, user_state):
username = (username or "").strip()
if not username:
return user_state, "⚠️ অনুগ্ৰহ কৰি প্ৰথমে লগিনৰ বাবে এটা নাম লিখক।"
user_id = get_or_create_user(username)
user_state = {"username": username, "user_id": user_id}
total, math_count = get_user_stats(user_id)
stats = (
f"👤 ব্যৱহাৰকাৰী: **{username}**\n\n"
f"📊 মোট প্ৰশ্ন: **{total}**\n"
f"🧮 গণিত প্ৰশ্ন: **{math_count}**"
)
return user_state, stats
def chat_logic(
username,
text_input,
image_input,
audio_input,
chat_history,
user_state,
):
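    """One chat turn: OCR the optional image, merge it with the typed text,
    run the sympy math path if the query looks mathematical, then answer via
    RAG + LLM and log the interaction."""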
if not user_state or not user_state.get("user_id"):
sys_msg = "⚠️ প্ৰথমে ওপৰত আপোনাৰ নাম লিখি **Login / লগিন** টিপক।"
chat_history = chat_history + [[text_input or "", sys_msg]]
return chat_history, user_state, None
user_id = user_state["user_id"]
final_query_parts = []
# audio_input not handled in this demo
ocr_text = ""
if image_input is not None:
try:
# Handle gradio image types: filepath (string), PIL Image, bytes/file-like
if isinstance(image_input, str):
img = Image.open(image_input)
elif hasattr(image_input, "name") and isinstance(image_input.name, str):
# uploaded file-like with .name
img = Image.open(image_input.name)
elif isinstance(image_input, (bytes, bytearray)):
img = Image.open(io.BytesIO(image_input))
else:
img = image_input
except Exception:
try:
if hasattr(image_input, "read"):
img = Image.open(io.BytesIO(image_input.read()))
else:
img = None
except Exception:
img = None
if img is not None:
ocr_text = ocr_from_image(img)
if ocr_text:
final_query_parts.append(ocr_text)
if text_input:
final_query_parts.append(text_input)
if not final_query_parts:
sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক, কিম্বা ছবি আপলোড কৰক।"
chat_history = chat_history + [["", sys_msg]]
return chat_history, user_state, None
full_query = "\n".join(final_query_parts)
conv = []
for u, b in chat_history:
if u:
conv.append(("Student", u))
if b:
conv.append(("Tutor", b))
is_math = is_likely_math(full_query)
if is_math:
math_answer = solve_math_expression(full_query)
combined_question = (
full_query
+ "\n\nগণিত প্ৰোগ্ৰামে এই ফলাফল দিছে:\n"
+ math_answer
+ "\n\nঅনুগ্ৰহ কৰি শ্রেণী ১০ ৰ শিক্ষাৰ্থীৰ বাবে সহজ ভাষাত ব্যাখ্যা কৰক।"
)
final_answer = llm_answer_with_rag(combined_question, conv)
else:
final_answer = llm_answer_with_rag(full_query, conv)
log_interaction(user_id, full_query, final_answer, is_math)
display_question = text_input or ocr_text or "(empty)"
chat_history = chat_history + [[display_question, final_answer]]
return chat_history, user_state, None
# ---------------------- Gradio UI ----------------------
# Wrap UI creation + launch in try/except so runtime errors are logged clearly
import traceback
try:
with gr.Blocks(title=APP_NAME, theme="soft") as demo:
gr.Markdown(
"""
# 🧭 জাজাবৰ – SEBA অসমীয়া ক্লাছ ১০ AI Tutor
- 📘 SEBA ক্লাছ ১০ পাঠ্যপুথিৰ ওপৰত ভিত্তি কৰি উত্তৰ
- 🗣️ টেক্স্ট + ছবি (OCR) ইনপুট
- 🧮 গণিত প্ৰশ্নৰ ধাপে ধাপে সমাধান
- 👤 ইউজাৰ লগিন + প্ৰগতি (progress) সংৰক্ষণ
"""
)
user_state = gr.State({})
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### 👤 লগিন")
username_inp = gr.Textbox(
label="নাম / ইউজাৰ আইডি",
placeholder="উদাহৰণ: abu10, student01 ...",
)
login_btn = gr.Button("✅ Login / লগিন")
stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।", elem_classes="stats-box")
gr.Markdown(
"""
### 💡 টিপছ
- "ক্লাছ ১০ গণিত: উদাহৰণ ৩.১ প্ৰশ্ন ২" – এই ধৰণৰ প্ৰশ্ন ভাল
- ফটো আপলোড কৰিলে টেক্স্টটো OCR কৰি পঢ়িবলৈ চেষ্টা কৰা হয়
- সম্ভৱ হ'লে প্ৰশ্নটো অসমীয়াত সোধক 🙂
"""
)
with gr.Column(scale=3):
chat = gr.Chatbot(label="জাজাবৰ সৈতে কথোপকথন", height=500)
text_inp = gr.Textbox(
label="আপোনাৰ প্ৰশ্ন লিখক",
placeholder="উদাহৰণ: \"ক্লাছ ১০ অসমীয়া: অনুচ্ছেদ পাঠ ১ ৰ মূল বিষয় কি?\"",
lines=2,
)
with gr.Row():
# Use a gr.Image type compatible with this Gradio version: 'filepath' or 'pil' or 'numpy'
# 'filepath' returns a string path in Spaces; code above handles it.
image_inp = gr.Image(label="📷 প্ৰশ্নৰ ছবি (Optional)", type="filepath")
audio_inp = gr.Audio(label="🎙️ কণ্ঠস্বৰ প্ৰশ্ন (Stub — not used now)", type="numpy")
with gr.Row():
ask_btn = gr.Button("🤖 জাজাবৰক সোধক")
login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])
def wrapped_chat(text, image, audio, history, user_state_inner, username_inner):
if user_state_inner and username_inner and not user_state_inner.get("username"):
user_state_inner["username"] = username_inner
return chat_logic(username_inner, text, image, audio, history, user_state_inner)
        ask_btn.click(
            wrapped_chat,
            inputs=[text_inp, image_inp, audio_inp, chat, user_state, username_inp],
            # chat_logic returns three values; the trailing None resets the textbox
            outputs=[chat, user_state, text_inp],
            concurrency_limit=4,
        )
    # On Hugging Face Spaces the app is served automatically; share=True is
    # ignored there (Gradio logs a warning) and only matters for local runs,
    # where it creates a temporary public tunnel.
    demo.launch(share=True)
except Exception as e:
# Write full traceback to a file for debugging in Spaces logs and print to stdout
tb = traceback.format_exc()
print("--- Exception during UI startup ---")
print(tb)
with open("startup_error.log", "w") as f:
f.write(tb)
# Re-raise so the container shows the failure (useful for CI/Spaces logs)
raise