Sazid2 committed on
Commit
36162a4
·
verified ·
1 Parent(s): 86a2d4b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +429 -158
app.py CHANGED
@@ -1,160 +1,359 @@
1
  # app.py
2
  """
3
- Jajabor – Minimal safe version (no FAISS, no torch, no transformers)
4
- - Retrieval: TF-IDF (scikit-learn)
5
- - PDF reading: PyPDF2
6
- - OCR: pytesseract
7
- - Math: sympy
8
- - UI: Gradio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  """
10
 
11
  import os
12
  import io
13
  import sqlite3
14
  from datetime import datetime
15
- import traceback
16
 
17
- from PyPDF2 import PdfReader
18
- from PIL import Image
19
- import gradio as gr
20
  import numpy as np
 
21
 
22
- from sklearn.feature_extraction.text import TfidfVectorizer
23
- from sklearn.metrics.pairwise import linear_kernel
24
-
25
  import pytesseract
 
 
26
  import sympy as sp
 
27
 
28
- # ---------- CONFIG ----------
29
- APP_NAME = "Jajabor – Minimal (TF-IDF retrieval)"
30
- BASE_DIR = os.path.abspath(os.path.dirname(__file__))
 
31
  PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
32
  DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")
33
 
 
 
 
 
 
 
 
 
 
 
34
  CHUNK_SIZE = 600
35
  CHUNK_OVERLAP = 120
36
- TOP_K = 3
 
 
 
 
 
 
37
 
38
- # ---------- DB ----------
39
- def init_db(path=DB_PATH):
40
- os.makedirs(os.path.dirname(path), exist_ok=True)
41
- conn = sqlite3.connect(path)
 
 
 
 
 
 
 
 
 
 
 
 
42
  cur = conn.cursor()
43
- cur.execute("""CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY AUTOINCREMENT, username TEXT UNIQUE, created_at TEXT)""")
44
- cur.execute("""CREATE TABLE IF NOT EXISTS interactions (id INTEGER PRIMARY KEY AUTOINCREMENT, user_id INTEGER, timestamp TEXT, query TEXT, answer TEXT, is_math INTEGER)""")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  conn.commit()
46
  conn.close()
47
 
48
- def get_or_create_user(username):
 
49
  username = username.strip()
50
  if not username:
51
  return None
 
52
  conn = sqlite3.connect(DB_PATH)
53
  cur = conn.cursor()
54
  cur.execute("SELECT id FROM users WHERE username=?", (username,))
55
  row = cur.fetchone()
56
  if row:
57
- uid = row[0]
58
  else:
59
- cur.execute("INSERT INTO users (username, created_at) VALUES (?, ?)", (username, datetime.utcnow().isoformat()))
 
 
 
60
  conn.commit()
61
- uid = cur.lastrowid
62
  conn.close()
63
- return uid
64
 
65
- def log_interaction(user_id, query, answer, is_math):
 
66
  conn = sqlite3.connect(DB_PATH)
67
  cur = conn.cursor()
68
- cur.execute("INSERT INTO interactions (user_id, timestamp, query, answer, is_math) VALUES (?, ?, ?, ?, ?)",
69
- (user_id, datetime.utcnow().isoformat(), query, answer, 1 if is_math else 0))
 
 
 
 
 
70
  conn.commit()
71
  conn.close()
72
 
 
 
 
 
 
 
 
 
 
 
 
73
  init_db()
74
 
75
- # ---------- PDF reading ----------
76
- def extract_text_from_pdf(pdf_path):
77
- pages = []
78
  try:
79
- reader = PdfReader(pdf_path)
80
- for page in reader.pages:
81
- try:
82
- txt = page.extract_text() or ""
83
- pages.append(txt)
84
- except Exception:
85
- continue
86
- except Exception as e:
87
- print("PDF read error:", e)
88
  return "\n".join(pages)
89
 
90
- def load_all_pdfs(pdf_dir):
 
91
  texts = []
92
  metas = []
93
- if not os.path.isdir(pdf_dir):
94
- print("PDF_DIR not found:", pdf_dir)
95
  return texts, metas
96
  for fname in sorted(os.listdir(pdf_dir)):
97
  if fname.lower().endswith(".pdf"):
98
  path = os.path.join(pdf_dir, fname)
99
  print("Reading:", path)
100
  text = extract_text_from_pdf(path)
101
- texts.append(text)
102
- metas.append({"source": fname})
 
103
  return texts, metas
104
 
105
- def split_text_into_chunks(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
106
- if not text:
107
- return []
108
  chunks = []
109
- step = chunk_size - overlap
110
- i = 0
111
- while i < len(text):
112
- chunk = text[i:i+chunk_size]
 
113
  if chunk.strip():
114
  chunks.append(chunk)
115
- i += max(step, 1)
 
 
116
  return chunks
117
 
118
- # ---------- Build TF-IDF index ----------
119
- print("Loading PDFs and building TF-IDF index...")
120
- all_texts, all_metas = load_all_pdfs(PDF_DIR)
121
- corpus_chunks = []
122
- corpus_metas = []
123
- for text, meta in zip(all_texts, all_metas):
124
- chs = split_text_into_chunks(text)
125
- corpus_chunks.extend(chs)
126
- corpus_metas.extend([meta] * len(chs))
127
 
128
- if len(corpus_chunks) == 0:
129
- print("No PDF chunks found. Upload PDFs into pdfs/class10/")
130
 
131
- vectorizer = None
132
- tfidf_matrix = None
133
- if corpus_chunks:
134
- try:
135
- vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)
136
- tfidf_matrix = vectorizer.fit_transform(corpus_chunks)
137
- print("TF-IDF ready. Chunks:", len(corpus_chunks))
138
- except Exception as e:
139
- print("Failed to build TF-IDF:", e)
140
- vectorizer = None
141
- tfidf_matrix = None
142
-
143
- def retrieve_tfidf(query, top_k=TOP_K):
144
- if tfidf_matrix is None or vectorizer is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  return []
146
- qv = vectorizer.transform([query])
147
- sims = linear_kernel(qv, tfidf_matrix).flatten()
148
- idxs = sims.argsort()[::-1][:top_k]
149
  results = []
150
- for idx in idxs:
151
- if sims[idx] <= 0:
152
  continue
153
- results.append({"score": float(sims[idx]), "text": corpus_chunks[idx], "meta": corpus_metas[idx]})
 
 
 
 
154
  return results
155
 
156
- # ---------- OCR and math ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  def ocr_from_image(img: Image.Image):
 
 
158
  try:
159
  img = img.convert("RGB")
160
  except Exception:
@@ -168,131 +367,203 @@ def ocr_from_image(img: Image.Image):
168
  text = ""
169
  return text.strip()
170
 
 
171
  def is_likely_math(text: str) -> bool:
172
- if not text:
173
- return False
174
  math_chars = set("0123456789+-*/=^()%")
175
  if any(ch in text for ch in math_chars):
176
  return True
177
- kws = ["গণিত", "সমীকৰণ", "বীজগণিত", "math", "solve", "equation"]
178
  return any(k in text for k in kws)
179
 
 
180
  def solve_math_expression(expr: str):
181
  try:
182
  expr = expr.replace("^", "**")
183
  if "=" in expr:
184
  left, right = expr.split("=", 1)
185
- eq = sp.Eq(sp.sympify(left), sp.sympify(right))
 
 
186
  sol = sp.solve(eq)
187
- return "ধাপ-ধাপে সমাধান (সংক্ষেপ):\n" + str(sol)
 
 
 
 
 
 
188
  else:
189
- simp = sp.simplify(sp.sympify(expr))
190
- return f"সরলীকৰণ: {simp}"
 
 
 
 
 
191
  except Exception:
192
- return "গণিতীয় অভিব্যক্তি বুজা যায় নাই — দয়া কৰি সঠিকভাৱে লিখক।"
193
-
194
- # ---------- Answering (extractive) ----------
195
- def answer_with_retrieval(query, chat_history):
196
- results = retrieve_tfidf(query, top_k=TOP_K)
197
- if not results:
198
- return "পাঠ্যপুথি সম্বন্ধীয় তথ্য নহল; দয়া কৰি অধিক স্পষ্টকৈ সোধক।"
199
- # Combine top chunks as extractive answer (shorten if too long)
200
- answer_parts = []
201
- for r in results:
202
- txt = r["text"].strip()
203
- if len(txt) > 800:
204
- txt = txt[:800].rsplit("\n", 1)[0] + "…"
205
- answer_parts.append(f"[Source: {r['meta'].get('source','textbook')}] \n{txt}")
206
- return "\n\n".join(answer_parts)
207
-
208
- # ---------- Chat logic ----------
209
  def login_user(username, user_state):
210
  username = (username or "").strip()
211
  if not username:
212
- return user_state, "⚠️ অনুগ্ৰহ কৰি লগিনৰ বাবে এটা নাম লিখক।"
 
213
  user_id = get_or_create_user(username)
214
  user_state = {"username": username, "user_id": user_id}
215
- total, math_count = 0, 0
216
- try:
217
- total, math_count = (lambda uid: (lambda c,m: (c,m))( * (lambda cur: (cur.execute("SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?", (uid,)), cur.fetchone())[1] ) )(uid) )(user_id)
218
- except Exception:
219
- total, math_count = get_or_create_user(username) and (0,0)
220
- stats = f"👤 {username}\n📊 মোট প্ৰশ্ন: {total}\n🧮 গণিত: {math_count}"
 
221
  return user_state, stats
222
 
223
- def chat_logic(username, text_input, image_input, audio_input, chat_history, user_state):
224
- if chat_history is None:
225
- chat_history = []
226
 
 
 
 
 
 
 
 
 
227
  if not user_state or not user_state.get("user_id"):
228
- sys_msg = "⚠️ প্ৰথমে লগিন কৰক।"
229
  chat_history = chat_history + [[text_input or "", sys_msg]]
230
- return chat_history, user_state, ""
231
 
232
  user_id = user_state["user_id"]
 
233
  final_query_parts = []
234
 
 
 
235
  ocr_text = ""
236
- if image_input:
237
  try:
238
- if isinstance(image_input, str):
239
- img = Image.open(image_input)
 
 
240
  else:
241
- raw = image_input.read()
242
- img = Image.open(io.BytesIO(raw))
 
 
 
 
 
243
  ocr_text = ocr_from_image(img)
244
  if ocr_text:
245
  final_query_parts.append(ocr_text)
246
- except Exception:
247
- pass
248
 
249
  if text_input:
250
  final_query_parts.append(text_input)
251
 
252
  if not final_query_parts:
253
- sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক বা ছবি আপলোড কৰক।"
254
  chat_history = chat_history + [["", sys_msg]]
255
- return chat_history, user_state, ""
256
 
257
  full_query = "\n".join(final_query_parts)
258
 
259
- is_math_flag = is_likely_math(full_query)
260
- if is_math_flag:
 
 
 
 
 
 
 
 
261
  math_answer = solve_math_expression(full_query)
262
- # Use extractive retrieval to provide supporting text and then math result
263
- retrieval = answer_with_retrieval(full_query, chat_history)
264
- final_answer = f"{retrieval}\n\nগণিত সমাধান:\n{math_answer}"
 
 
 
 
265
  else:
266
- final_answer = answer_with_retrieval(full_query, chat_history)
267
 
268
- log_interaction(user_id, full_query, final_answer, is_math_flag)
269
- display_q = text_input or ocr_text or "(image)"
270
- chat_history = chat_history + [[display_q, final_answer]]
271
- return chat_history, user_state, ""
272
 
273
- # ---------- Gradio UI ----------
274
- with gr.Blocks(title=APP_NAME) as demo:
275
- gr.Markdown("# 🧭 Jajabor – Minimal TF-IDF Tutor (Free)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
  user_state = gr.State({})
278
 
279
  with gr.Row():
280
  with gr.Column(scale=1):
281
- username_inp = gr.Textbox(label="নাম / ইউজাৰ আইডি", placeholder="e.g. abu10")
282
- login_btn = gr.Button("Login")
283
- stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  with gr.Column(scale=3):
285
- chat = gr.Chatbot(label="জাজাবৰ", height=480)
286
- text_inp = gr.Textbox(label="আপোনাৰ প্ৰশ্ন লিখক", lines=2)
 
 
 
 
 
 
 
 
 
 
287
  with gr.Row():
288
- image_inp = gr.Image(label="📷 ছবি (Optional)", type="filepath")
289
- audio_inp = gr.Audio(label="🎙️ (Optional)", type="filepath")
290
- ask_btn = gr.Button("সোধক")
291
 
292
  login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])
293
 
294
- ask_btn.click(chat_logic, inputs=[username_inp, text_inp, image_inp, audio_inp, chat, user_state], outputs=[chat, user_state, None])
295
- text_inp.submit(chat_logic, inputs=[username_inp, text_inp, image_inp, audio_inp, chat, user_state], outputs=[chat, user_state, None])
 
 
 
 
 
 
 
 
 
296
 
297
- if __name__ == "__main__":
298
- demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
1
  # app.py
2
  """
3
+ Jajabor – SEBA Assamese Class 10 Tutor
4
+ Hugging Face Spaces ready Gradio app (single-file)
5
+
6
+ This file contains a working, lightweight adaptation of your Colab notebook
7
+ so it can run on Hugging Face Spaces (CPU-friendly demo).
8
+
9
+ IMPORTANT notes for deployment:
10
+ - Spaces has limited CPU/GPU. Large models (Qwen2.5, BAAI/bge-m3) won't run
11
+ locally in most Spaces. This app uses smaller models for a working demo.
12
+ - For production-quality behavior, switch embeddings/LLM calls to the
13
+ Hugging Face Inference API (use your HF token) or host on Colab/VM with GPU.
14
+
15
+ Create a `requirements.txt` with these entries (add to your repo):
16
+
17
+ gradio==4.44.0
18
+ pymupdf
19
+ sentence-transformers
20
+ faiss-cpu
21
+ transformers
22
+ accelerate
23
+ torch
24
+ pytesseract
25
+ pillow
26
+ sympy
27
+ huggingface_hub
28
+
29
+ Place your SEBA Class10 PDFs in the repository under `pdfs/class10/`.
30
+
31
+ Usage on Spaces:
32
+ - Upload the repo (app.py + requirements.txt + pdfs/class10/*).
33
+ - If you want higher-quality LLMs/embeddings, set a repo secret HF_TOKEN
34
+ and configure INFERENCE_MODELS below.
35
+
36
  """
37
 
38
  import os
39
  import io
40
  import sqlite3
41
  from datetime import datetime
42
+ import threading
43
 
44
+ import fitz # PyMuPDF
 
 
45
  import numpy as np
46
+ from PIL import Image
47
 
48
+ import gradio as gr
49
+ import faiss
 
50
  import pytesseract
51
+ from sentence_transformers import SentenceTransformer
52
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
53
  import sympy as sp
54
+ from huggingface_hub import InferenceApi
55
 
56
+ # ---------------------- Configuration ----------------------
57
+ APP_NAME = "Jajabor – SEBA Assamese Class 10 Tutor (Spaces demo)"
58
+
59
+ BASE_DIR = os.path.abspath(".")
60
  PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
61
  DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")
62
 
63
+ # Lightweight defaults for Spaces demo. Replace with heavier models via Inference API.
64
+ EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
65
+ LLM_MODEL_LOCAL = "sshleifer/tiny-gpt2" # very small demo model (optional local)
66
+
67
+ # If you set HF_TOKEN as a repo secret / environment variable, the app will
68
+ # use the Inference API models below for better results.
69
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
70
+ INFERENCE_EMBED_MODEL = "sentence-transformers/all-mpnet-base-v2" # example
71
+ INFERENCE_LLM_MODEL = "bigscience/bloomz-1b1" # example remote model
72
+
73
  CHUNK_SIZE = 600
74
  CHUNK_OVERLAP = 120
75
+ TOP_K = 5
76
+
77
+ # Global variables initialized later
78
+ embedding_model = None
79
+ index = None
80
+ corpus_chunks = []
81
+ corpus_metas = []
82
 
83
+ # If HF_TOKEN provided, create inference clients
84
# Optional remote clients: both stay None unless HF_TOKEN is set and the
# clients can be constructed. llm_answer_with_rag() checks for None.
inference_embed_client = None
inference_llm_client = None
if HF_TOKEN:
    try:
        inference_embed_client = InferenceApi(repo_id=INFERENCE_EMBED_MODEL, token=HF_TOKEN)
        inference_llm_client = InferenceApi(repo_id=INFERENCE_LLM_MODEL, token=HF_TOKEN)
    except Exception as exc:
        # Report the reason instead of failing silently; the app still starts
        # and falls back to the local model.
        print("Could not create Inference API clients:", exc)
        inference_embed_client = None
        inference_llm_client = None
93
+
94
+ # ---------------------- Database ----------------------
95
+
96
def init_db(db_path=DB_PATH):
    """Create the ``users`` and ``interactions`` tables if they do not exist.

    Args:
        db_path: Path of the SQLite database file. Its parent directory is
            created when the path has one.
    """
    parent_dir = os.path.dirname(db_path)
    # os.makedirs("") raises FileNotFoundError, so skip directory creation
    # for a bare file name such as "jajabor_users.db".
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE,
                created_at TEXT
            )
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS interactions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                user_id INTEGER,
                timestamp TEXT,
                query TEXT,
                answer TEXT,
                is_math INTEGER,
                FOREIGN KEY(user_id) REFERENCES users(id)
            )
            """
        )
        conn.commit()
    finally:
        # Close even when a statement fails so the handle is not leaked.
        conn.close()
124
 
125
+
126
def get_or_create_user(username: str):
    """Return the id of the user named *username*, inserting a row if needed.

    Args:
        username: Login name; surrounding whitespace is ignored.

    Returns:
        The integer ``users.id``, or ``None`` when *username* is empty,
        whitespace-only or ``None``.
    """
    username = (username or "").strip()
    if not username:
        return None

    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute("SELECT id FROM users WHERE username=?", (username,))
        row = cur.fetchone()
        if row:
            return row[0]
        cur.execute(
            "INSERT INTO users (username, created_at) VALUES (?, ?)",
            (username, datetime.utcnow().isoformat()),
        )
        conn.commit()
        return cur.lastrowid
    finally:
        # Runs on every exit path, including exceptions, so the connection
        # is never leaked.
        conn.close()
146
 
147
+
148
def log_interaction(user_id, query, answer, is_math: bool):
    """Append one question/answer pair to the ``interactions`` table.

    Args:
        user_id: Value stored in ``interactions.user_id``.
        query: The question text that was answered.
        answer: The answer text that was shown.
        is_math: Stored as 1 when truthy, otherwise 0.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.cursor().execute(
            """
            INSERT INTO interactions (user_id, timestamp, query, answer, is_math)
            VALUES (?, ?, ?, ?, ?)
            """,
            (user_id, datetime.utcnow().isoformat(), query, answer, 1 if is_math else 0),
        )
        conn.commit()
    finally:
        # Release the connection even if the insert raises.
        conn.close()
160
 
161
+
162
def get_user_stats(user_id):
    """Return ``(total, math_count)`` for the interactions logged by a user.

    Args:
        user_id: Value matched against ``interactions.user_id``.

    Returns:
        A tuple of the number of logged interactions and the sum of their
        ``is_math`` flags. Both are 0 when the user has no rows.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?",
            (user_id,),
        )
        row = cur.fetchone()
    finally:
        # Release the connection even if the query raises.
        conn.close()
    # SUM() yields NULL when no rows match, hence the "or 0".
    total = row[0] or 0
    math_count = row[1] or 0
    return total, math_count
171
+
172
  init_db()
173
 
174
+ # ---------------------- PDF loading + RAG ----------------------
175
+
176
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the plain text of every readable page, joined by newlines.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        The concatenated page texts, or ``""`` when the file cannot be
        opened or no page yields text.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as exc:
        # Best effort: one unreadable file must not stop the whole load.
        print("PDF read error:", pdf_path, exc)
        return ""

    pages = []
    try:
        for page in doc:
            try:
                txt = page.get_text("text")
            except Exception:
                # Skip a damaged page but keep the rest of the document.
                continue
            if txt:
                pages.append(txt)
    finally:
        # The document was previously never closed.
        doc.close()
    return "\n".join(pages)
187
 
188
+
189
def load_all_pdfs(pdf_dir: str):
    """Read every ``*.pdf`` directly inside *pdf_dir*, in sorted name order.

    Args:
        pdf_dir: Directory to scan (not recursive).

    Returns:
        ``(texts, metas)``: two parallel lists holding the extracted text of
        each PDF that produced any, and a ``{"source": filename}`` dict for
        it. Both are empty when *pdf_dir* is not a directory.
    """
    texts = []
    metas = []
    # isdir, not exists: a regular file at this path would otherwise reach
    # os.listdir() and raise NotADirectoryError.
    if not os.path.isdir(pdf_dir):
        print("PDF_DIR does not exist or is not a directory:", pdf_dir)
        return texts, metas

    for fname in sorted(os.listdir(pdf_dir)):
        if not fname.lower().endswith(".pdf"):
            continue
        path = os.path.join(pdf_dir, fname)
        print("Reading:", path)
        text = extract_text_from_pdf(path)
        if text:
            texts.append(text)
            metas.append({"source": fname})
    return texts, metas
204
 
205
+
206
def split_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Cut *text* into overlapping character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        The chunks in order. Whitespace-only chunks are dropped, and empty
        input gives an empty list.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []

    # The window must move forward on every pass. With overlap >= chunk_size
    # the previous "start = end - overlap" never advanced and looped forever.
    step = max(chunk_size - overlap, 1)

    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        piece = text[start:end]
        if piece.strip():
            chunks.append(piece)
        if end == total:
            break
        start += step
    return chunks
219
 
 
 
 
 
 
 
 
 
 
220
 
221
def build_embedding_index():
    """Load the embedding model, chunk the PDFs and fill the FAISS index.

    Writes the module globals ``embedding_model``, ``corpus_chunks``,
    ``corpus_metas`` and ``index``. ``index`` is set to ``None`` when no
    chunks were produced.
    """
    global embedding_model, index, corpus_chunks, corpus_metas

    print("Loading embedding model:", EMBEDDING_MODEL_NAME)
    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

    documents, sources = load_all_pdfs(PDF_DIR)
    corpus_chunks, corpus_metas = [], []
    for document, source in zip(documents, sources):
        pieces = split_text(document)
        corpus_chunks += pieces
        # One meta entry per chunk keeps the two lists index-aligned.
        corpus_metas += [source] * len(pieces)

    if len(corpus_chunks) == 0:
        print("No document chunks found - RAG will be empty.")
        index = None
        return

    print("Encoding", len(corpus_chunks), "chunks...")
    vectors = embedding_model.encode(
        corpus_chunks, batch_size=32, show_progress_bar=False
    ).astype("float32")
    dim = vectors.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(vectors)
    print("FAISS index ready with dim", dim)
246
+
247
+
248
+ # Build in a background thread so Spaces can start quickly
249
+ threading.Thread(target=build_embedding_index, daemon=True).start()
250
+
251
+
252
def rag_search(query: str, k: int = TOP_K):
    """Return up to *k* stored chunks retrieved for *query*.

    Args:
        query: Text to embed and search with.
        k: Number of neighbours requested from the index.

    Returns:
        A list of ``{"score", "text", "meta"}`` dicts, where ``score`` is the
        distance value returned by ``index.search``. Empty when the index or
        the embedding model is not set yet.
    """
    if embedding_model is None or index is None:
        return []

    query_vec = embedding_model.encode([query]).astype("float32")
    distances, positions = index.search(query_vec, k)
    # Positions equal to -1 are skipped.
    # NOTE(review): presumably FAISS padding for missing neighbours - confirm.
    return [
        {
            "score": float(distance),
            "text": corpus_chunks[position],
            "meta": corpus_metas[position],
        }
        for distance, position in zip(distances[0], positions[0])
        if position != -1
    ]
267
 
268
+ # ---------------------- LLM + RAG prompt building ----------------------
269
+
270
+ # Try to create a small local LLM pipeline for demo; if not present, fallback to Inference API
271
# Optional local text-generation pipeline. local_llm stays None when the
# model cannot be loaded; llm_answer_with_rag() checks for None.
local_llm = None
try:
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_LOCAL)
    model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_LOCAL)
    local_llm = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
    )
    print("Local tiny LLM loaded for demo.")
except Exception as exc:
    local_llm = None
    # Surface the cause (missing weights, no network, ...) rather than
    # hiding it behind the generic message below.
    print("Local LLM load failed:", exc)
    print("Local LLM not available; will use Inference API if HF_TOKEN is set.")
288
+
289
+ SYSTEM_PROMPT = """
290
+ You are "Jajabor", an expert SEBA Assamese tutor for Class 10.
291
+ Always prefer to answer in Assamese. If the student clearly asks for English, you may reply in English.
292
+
293
+ Rules:
294
+ - Use ONLY the given textbook context when requested.
295
+ - If you are not sure, say: "এই প্ৰশ্নটো পাঠ্যপুথিৰ অংশত স্পষ্টকৈ নাই, সেয়েহে মই নিশ্চিত নহয়।"
296
+ - বোঝাপৰা সহজ ভাষাত ব্যাখ্যা কৰা, উদাহৰণ দিয়ক।
297
+ - If it is a maths question, explain step-by-step clearly.
298
+ """
299
+
300
+
301
def build_rag_prompt(context_blocks, question, chat_history):
    """Assemble the LLM prompt from system text, history, question and context.

    Args:
        context_blocks: Dicts with a ``"text"`` entry and a ``"meta"`` dict
            (whose ``"source"`` defaults to ``"textbook"``).
        question: The student's question.
        chat_history: Iterable of ``(role, message)`` pairs.

    Returns:
        The complete prompt string.
    """
    context_sections = [
        f"\n[Context {number} – {block['meta'].get('source', 'textbook')}]\n{block['text']}\n"
        for number, block in enumerate(context_blocks, start=1)
    ]
    ctx = "".join(context_sections)

    hist = "".join(f"{role}: {msg}\n" for role, msg in chat_history)

    return (
        f"{SYSTEM_PROMPT}\n\nপূর্বৰ বাৰ্তাসমূহ:\n{hist}\nসদস্যৰ প্ৰশ্ন:\n{question}"
        f"\n\nসম্পৰ্কিত পাঠ্যপুথিৰ অংশ:\n{ctx}"
        "\n\nএতিয়া একেদম সহায়ক আৰু বুজিবলৈ সহজ উত্তৰ দিয়া।"
    )
313
+
314
+
315
def _extract_generated_text(resp):
    """Return the generated string from an Inference API response, or None."""
    # NOTE(review): text-generation endpoints commonly answer with
    # [{"generated_text": ...}] - confirm for the configured model.
    if isinstance(resp, list) and resp:
        resp = resp[0]
    if isinstance(resp, dict):
        text = resp.get("generated_text")
        # A dict without generated_text (e.g. {"error": ...}) is a failure,
        # not an answer to show the student.
        return text if isinstance(text, str) else None
    if isinstance(resp, str):
        return resp
    return None


def _strip_prompt_echo(text, prompt):
    """Drop a leading copy of *prompt* from *text* and trim whitespace."""
    if text.startswith(prompt):
        text = text[len(prompt):]
    return text.strip()


def llm_answer_with_rag(question: str, chat_history):
    """Answer *question* with retrieved textbook context and an LLM.

    The remote Inference API client is tried first when it is configured,
    then the local pipeline. A fixed apology message is returned when
    neither produces text.

    Args:
        question: The student's question.
        chat_history: ``(role, message)`` pairs passed to the prompt builder.

    Returns:
        The answer text.
    """
    retrieved = rag_search(question, TOP_K)
    prompt = build_rag_prompt(retrieved, question, chat_history)

    # Prefer the Inference API if available.
    if inference_llm_client is not None:
        try:
            resp = inference_llm_client(inputs=prompt, params={"max_new_tokens": 512})
            remote_text = _extract_generated_text(resp)
        except Exception as exc:
            print("Inference API call failed:", exc)
            remote_text = None
        if remote_text is not None:
            return _strip_prompt_echo(remote_text, prompt)

    # Fall back to the local model. A generation error must not crash the
    # chat request, so it is reported and the apology below is used.
    if local_llm is not None:
        try:
            out = local_llm(prompt, num_return_sequences=1)[0]["generated_text"]
        except Exception as exc:
            print("Local LLM generation failed:", exc)
        else:
            return _strip_prompt_echo(out, prompt)

    # Nothing available: return a safe fallback.
    return (
        "দুখঃখিত—এই Spaces ইনষ্টলেশ্যনটোৱে প্ৰতিস্থাপন কৰিব পৰা কোনো LLM নাপালে।"
        " যদি আপুনি HF_TOKEN হিচাপে এক্সেস টোকেন যোগ কৰে, মই অনলাইন Inference API ব্যৱহাৰ কৰি উত্তৰ দিম."
    )
351
+
352
+ # ---------------------- OCR + math helpers ----------------------
353
+
354
  def ocr_from_image(img: Image.Image):
355
+ if img is None:
356
+ return ""
357
  try:
358
  img = img.convert("RGB")
359
  except Exception:
 
367
  text = ""
368
  return text.strip()
369
 
370
+
371
def is_likely_math(text: str) -> bool:
    """Heuristically decide whether *text* looks like a maths question.

    Args:
        text: The combined query text; may be empty or ``None``.

    Returns:
        ``True`` when *text* contains a digit, one of ``+-*/=^()%`` or one
        of the Assamese keywords below; ``False`` otherwise, including for
        empty input.
    """
    # Guard restored: without it a None query raised TypeError in the
    # membership tests below.
    if not text:
        return False
    math_chars = set("0123456789+-*/=^()%")
    if any(ch in text for ch in math_chars):
        return True
    # NOTE(review): "উদাহৰণ" and "প্ৰশ্ন" are generic words, so non-maths
    # questions may match too - confirm this is intended.
    kws = ["গণিত", "সমীকৰণ", "উদাহৰণ", "প্ৰশ্ন", "বীজগণিত"]
    return any(k in text for k in kws)
377
 
378
+
379
def solve_math_expression(expr: str):
    """Solve an equation or simplify an expression with SymPy.

    Input containing ``=`` is split at the first ``=`` and solved as an
    equation; anything else is simplified. ``^`` is read as a power and a
    number written directly before a letter or ``(`` is read as a product,
    so ``2x + 3 = 7`` works.

    Args:
        expr: The raw expression text.

    Returns:
        An Assamese explanation string, or an Assamese hint message when
        the text cannot be parsed or solved.
    """
    import re

    try:
        expr = expr.replace("^", "**")
        # sympify() rejects "2x", yet the hint below tells the student to
        # write exactly that. Insert the implied "*" after a standalone
        # number that precedes a letter or "(". Floats such as "1e5" and
        # digits inside names such as "x2" are left alone.
        expr = re.sub(
            r"(?<![A-Za-z_\d.])(\d+(?:\.\d+)?)\s*(?![eE][+-]?\d)(?=[A-Za-z(])",
            r"\1*",
            expr,
        )
        # SECURITY NOTE(review): sympify() evaluates its input with eval();
        # expr is user-supplied text, so this should be sandboxed or
        # replaced with a restricted parser.
        if "=" in expr:
            left, right = expr.split("=", 1)
            left_s = sp.sympify(left)
            right_s = sp.sympify(right)
            eq = sp.Eq(left_s, right_s)
            sol = sp.solve(eq)
            steps = [
                "প্ৰথমে সমীকৰণ লওঁ:",
                f"{sp.pretty(eq)}",
                "Sympy ৰ সহায়ত সমাধান পোৱা যায়:",
                str(sol),
            ]
            explanation = "ধাপ-ধাপে সমাধান (সংক্ষেপে):\n" + "\n".join(f"- {s}" for s in steps)
            explanation += f"\n\nসেয়েহে সমাধান: {sol}"
        else:
            expr_s = sp.sympify(expr)
            simp = sp.simplify(expr_s)
            explanation = (
                "প্ৰদত্ত গণিতীয় অভিব্যক্তি:\n"
                f"{expr}\n\nসরলীকৰণ কৰাৰ পিছত পোৱা যায়:\n{simp}"
            )
        return explanation
    except Exception:
        # Deliberate best effort: any failure becomes a friendly hint.
        return (
            "মই সঠিকভাৱে গণিতীয় অভিব্যক্তি চিনাক্ত কৰিব নোৱাৰিলোঁ। "
            "দয়া কৰি সমীকৰণটো অলপ বেছি স্পষ্টকৈ লিখা: উদাহৰণ – 2x + 3 = 7"
        )
408
+
409
+ # ---------------------- Chat logic ----------------------
410
+
 
 
 
 
 
 
 
 
 
 
411
def login_user(username, user_state):
    """Log a user in by name and build the Markdown stats shown in the UI.

    Args:
        username: Name typed into the login box; may be ``None``.
        user_state: Current session state, returned unchanged when the name
            is empty.

    Returns:
        ``(user_state, stats_markdown)``. On success the state is a new dict
        with ``"username"`` and ``"user_id"``.
    """
    cleaned = username.strip() if username else ""
    if cleaned == "":
        return user_state, "⚠️ অনুগ্ৰহ কৰি প্ৰথমে লগিনৰ বাবে এটা নাম লিখক।"

    uid = get_or_create_user(cleaned)
    new_state = {"username": cleaned, "user_id": uid}
    total, math_count = get_user_stats(uid)

    stat_lines = [
        f"👤 ব্যৱহাৰকাৰী: **{cleaned}**\n\n",
        f"📊 মোট প্ৰশ্ন: **{total}**\n",
        f"🧮 গণিত প্ৰশ্ন: **{math_count}**",
    ]
    return new_state, "".join(stat_lines)
426
 
 
 
 
427
 
428
def _load_image(image_input):
    """Turn the image widget's value into a PIL image, or None on failure."""
    try:
        if isinstance(image_input, Image.Image):
            return image_input
        if isinstance(image_input, (str, os.PathLike)):
            # Value of gr.Image(type="filepath").
            return Image.open(image_input)
        if isinstance(image_input, (bytes, bytearray)):
            return Image.open(io.BytesIO(image_input))
        if isinstance(image_input, np.ndarray):
            # Value of gr.Image(type="numpy").
            return Image.fromarray(image_input)
        if hasattr(image_input, "name"):
            return Image.open(image_input.name)
        if hasattr(image_input, "read"):
            return Image.open(io.BytesIO(image_input.read()))
    except Exception as exc:
        print("Could not load uploaded image:", exc)
    return None


def chat_logic(
    username,
    text_input,
    image_input,
    audio_input,
    chat_history,
    user_state,
):
    """Handle one chat turn: OCR, maths detection, answer and logging.

    Args:
        username: Name from the login box (not read here).
        text_input: Typed question; may be empty or ``None``.
        image_input: Uploaded image as a PIL image, path, bytes, numpy array
            or file-like object; may be ``None``.
        audio_input: Ignored in this demo.
        chat_history: List of ``[user, bot]`` pairs; ``None`` means empty.
        user_state: Session dict; must hold a truthy ``"user_id"``.

    Returns:
        ``(chat_history, user_state, None)`` where the history is a new list
        with this turn appended.
    """
    # A None history used to crash on list concatenation.
    chat_history = list(chat_history) if chat_history else []

    if not user_state or not user_state.get("user_id"):
        sys_msg = "⚠️ প্ৰথমে ওপৰত আপোনাৰ নাম লিখি **Login / লগিন** টিপক।"
        return chat_history + [[text_input or "", sys_msg]], user_state, None

    user_id = user_state["user_id"]

    final_query_parts = []

    # audio_input is not handled in this demo.

    ocr_text = ""
    if image_input is not None:
        img = _load_image(image_input)
        if img is not None:
            ocr_text = ocr_from_image(img)
            if ocr_text:
                final_query_parts.append(ocr_text)

    if text_input:
        final_query_parts.append(text_input)

    if not final_query_parts:
        sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক, কিম্বা ছবি আপলোড কৰক।"
        return chat_history + [["", sys_msg]], user_state, None

    full_query = "\n".join(final_query_parts)

    # Flatten [user, bot] pairs into (role, message) tuples for the prompt.
    conv = []
    for user_msg, bot_msg in chat_history:
        if user_msg:
            conv.append(("Student", user_msg))
        if bot_msg:
            conv.append(("Tutor", bot_msg))

    is_math = is_likely_math(full_query)

    if is_math:
        math_answer = solve_math_expression(full_query)
        combined_question = (
            full_query
            + "\n\nগণিত প্ৰোগ্ৰামে এই ফলাফল দিছে:\n"
            + math_answer
            + "\n\nঅনুগ্ৰহ কৰি শ্রেণী ১০ ৰ শিক্ষাৰ্থীৰ বাবে সহজ ভাষাত ব্যাখ্যা কৰক।"
        )
        final_answer = llm_answer_with_rag(combined_question, conv)
    else:
        final_answer = llm_answer_with_rag(full_query, conv)

    log_interaction(user_id, full_query, final_answer, is_math)

    display_question = text_input or ocr_text or "(empty)"
    chat_history = chat_history + [[display_question, final_answer]]

    return chat_history, user_state, None
503
+
504
+ # ---------------------- Gradio UI ----------------------
505
+
506
# Gradio layout: login column on the left, chat column on the right.
with gr.Blocks(title=APP_NAME, theme="soft") as demo:
    gr.Markdown(
        """
        # 🧭 জাজাবৰ – SEBA অসমীয়া ক্লাছ ১০ AI Tutor

        - 📘 SEBA ক্লাছ ১০ পাঠ্যপুথিৰ ওপৰত ভিত্তি কৰি উত্তৰ
        - 🗣️ টেক্স্ট + ছবি (OCR) ইনপুট
        - 🧮 গণিত প্ৰশ্নৰ ধাপ-ধাপে সমাধান
        - 👤 ইউজাৰ লগিন + প্ৰগতি (progress) সংৰক্ষণ
        """
    )

    user_state = gr.State({})
    # Receives chat_logic's third return value, which is always None.
    unused_output = gr.State(None)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 👤 লগিন")
            username_inp = gr.Textbox(
                label="নাম / ইউজাৰ আইডি",
                placeholder="উদাহৰণ: abu10, student01 ...",
            )
            login_btn = gr.Button("✅ Login / লগিন")
            stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।", elem_classes="stats-box")

            gr.Markdown(
                """
                ### 💡 টিপছ
                - "ক্লাছ ১০ গণিত: উদাহৰণ ৩.১ প্ৰশ্ন ২" – এই ধৰণৰ প্ৰশ্ন ভাল
                - ফটো আপলোড কৰিলে টেক্স্টটো OCR কৰি পঢ়িব চেষ্টা কৰা হয়
                - সম্ভব হলে প্ৰশ্নটো অসমীয়াত সোধক 🙂
                """
            )

        with gr.Column(scale=3):
            chat = gr.Chatbot(label="জাজাবৰ সৈতে কথোপকথন", height=500)

            text_inp = gr.Textbox(
                label="আপোনাৰ প্ৰশ্ন লিখক",
                placeholder="উদাহৰণ: \"ক্লাছ ১০ অসমীয়া: অনুচ্ছেদ পাঠ ১ ৰ মূল বিষয় কি?\"",
                lines=2,
            )

            with gr.Row():
                # gradio 4.x (pinned in the requirements) has no type="file";
                # "pil" gives chat_logic a PIL image it can OCR directly.
                image_inp = gr.Image(label="📷 প্ৰশ্নৰ ছবি (Optional)", type="pil")
                audio_inp = gr.Audio(label="🎙️ কণ্ঠস্বৰ প্ৰশ্ন (Stub — not used now)", type="numpy")

            with gr.Row():
                ask_btn = gr.Button("🤖 জাজাবৰক সোধক")

    login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])

    def wrapped_chat(text, image, audio, history, user_state_inner, username_inner):
        """Reorder the UI inputs for chat_logic and backfill the username."""
        if user_state_inner and username_inner and not user_state_inner.get("username"):
            user_state_inner["username"] = username_inner
        return chat_logic(username_inner, text, image, audio, history, user_state_inner)

    chat_inputs = [text_inp, image_inp, audio_inp, chat, user_state, username_inp]
    chat_outputs = [chat, user_state, unused_output]

    ask_btn.click(wrapped_chat, inputs=chat_inputs, outputs=chat_outputs)
    # Submit from the text box triggers the same handler, as the previous
    # version of the app did.
    text_inp.submit(wrapped_chat, inputs=chat_inputs, outputs=chat_outputs)


# queue(concurrency_count=...) was removed in gradio 4;
# default_concurrency_limit is its replacement.
demo.queue(default_concurrency_limit=4).launch(server_name="0.0.0.0")