Sazid2 committed on
Commit
86a2d4b
·
verified ·
1 Parent(s): cf6f71f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -209
app.py CHANGED
@@ -1,225 +1,298 @@
1
- import streamlit as st
 
 
 
 
 
 
 
 
 
2
  import os
 
3
  import sqlite3
4
  from datetime import datetime
5
- import PyPDF2
6
- from sentence_transformers import SentenceTransformer
7
- import faiss
 
 
8
  import numpy as np
9
- from transformers import pipeline
 
 
 
10
  import pytesseract
11
- from PIL import Image
12
  import sympy as sp
13
- import io
14
 
15
# App configuration
st.set_page_config(
    page_title="Jajabor – SEBA Class 10 Tutor",
    page_icon="🧭",
    layout="wide",
)

# Initialize session state: seed each key only when it is missing so that
# values already stored in st.session_state are never overwritten.
for _key, _default in (("chat_history", []), ("username", ""), ("tutor", None)):
    if _key not in st.session_state:
        st.session_state[_key] = _default
29
-
30
class SimpleTutor:
    """Question answering over local PDFs with a math shortcut.

    On construction the tutor loads a sentence-embedding model and a
    text2text pipeline, reads every PDF under ``pdfs/class10``, splits the
    text into fixed-size chunks and indexes their embeddings in a FAISS
    ``IndexFlatL2``. Load failures are reported through ``st.error`` and
    leave the corresponding attribute as ``None``; every method copes with
    that.
    """

    def __init__(self):
        self.llm = None
        self.embedding_model = None
        self.index = None
        self.corpus_chunks = []
        self._load_models()
        self.load_pdfs()

    def _load_models(self):
        """Load the embedding model and the generation pipeline (best effort)."""
        try:
            self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
            self.llm = pipeline("text2text-generation", model="google/flan-t5-small", device=-1)
        except Exception as e:
            st.error(f"Model loading error: {e}")

    def load_pdfs(self):
        """Read all PDFs, rebuild ``corpus_chunks`` and the FAISS index."""
        pdf_dir = "pdfs/class10"
        if not os.path.exists(pdf_dir):
            return

        all_texts = []
        for fname in os.listdir(pdf_dir):
            if not fname.lower().endswith('.pdf'):
                continue
            path = os.path.join(pdf_dir, fname)
            try:
                reader = PyPDF2.PdfReader(path)
                # extract_text() may yield None; treat that as an empty page.
                text = "".join(page.extract_text() or "" for page in reader.pages)
                if text.strip():
                    all_texts.append(text)
            except Exception as e:
                st.error(f"Error reading {fname}: {e}")

        self.corpus_chunks = []
        for text in all_texts:
            self.corpus_chunks.extend(self._split_text(text))

        if self.corpus_chunks and self.embedding_model:
            try:
                embs = self.embedding_model.encode(self.corpus_chunks).astype("float32")
                self.index = faiss.IndexFlatL2(embs.shape[1])
                self.index.add(embs)
            except Exception as e:
                st.error(f"FAISS error: {e}")

    def _split_text(self, text, chunk_size=400):
        """Cut ``text`` into consecutive ``chunk_size``-character pieces, dropping blank ones."""
        if not text:
            return []
        pieces = (text[i:i + chunk_size] for i in range(0, len(text), chunk_size))
        return [piece for piece in pieces if piece.strip()]

    def answer_question(self, question):
        """Return an answer string for ``question``.

        Blank questions get a prompt to ask something. Math-looking questions
        go to ``_solve_math``. Everything else goes to the language model,
        with up to two retrieved chunks as context. A fixed fallback message
        is returned when no model is loaded.
        """
        if not question or not question.strip():
            return "অনুগ্ৰহ কৰি এটা প্ৰশ্ন সোধক।"

        if self._is_math_question(question):
            return self._solve_math(question)

        context = ""
        # Compare against None explicitly: the index is an opaque library
        # object and its truthiness is not something to rely on.
        if self.index is not None and self.corpus_chunks:
            relevant_chunks = self._find_relevant_chunks(question)
            if relevant_chunks:
                context = "\n".join(relevant_chunks[:2])

        if not self.llm:
            return "মই আপোনাৰ প্ৰশ্নটো বুজিলোঁ। অধ্যয়নৰ বাবে শুভেচ্ছা!"

        try:
            prompt = f"প্ৰশ্ন: {question}\n\nসংদৰ্ভ: {context}\n\nসহায়ক উত্তৰ:" if context else f"প্ৰশ্ন: {question}\n\nউত্তৰ:"
            response = self.llm(prompt, max_new_tokens=150, temperature=0.3)
            return response[0]['generated_text']
        except Exception as e:
            return f"উত্তৰ তৈয়াৰ কৰোঁতে সমস্যা: {e}"

    def _is_math_question(self, text):
        """Heuristically decide whether ``text`` should go to the math solver.

        True when the text contains an arithmetic operator (``+ * / = ^``),
        a ``-`` together with at least one ASCII digit, or one of the math
        keywords. Bare letters such as ``x``/``y`` are deliberately NOT
        indicators: they occur inside ordinary words.
        """
        if not text:
            return False
        lowered = text.lower()
        if any(op in lowered for op in "+*/=^"):
            return True
        if "-" in lowered and any(ch in "0123456789" for ch in lowered):
            return True
        return any(keyword in lowered for keyword in ('গণিত', 'সমীকৰণ'))

    def _solve_math(self, expr):
        """Solve an equation or simplify an expression; always returns a string."""
        try:
            expr = expr.strip().replace('^', '**')
            if '=' not in expr:
                simplified = sp.simplify(sp.sympify(expr))
                return f"প্ৰকাশ: {expr}\n\nসৰলীকৃত: {simplified}"

            parts = expr.split('=')
            if len(parts) != 2:
                # Reported through the handler below instead of silently
                # falling off the end of the function and returning None.
                raise ValueError("expected exactly one '=' in an equation")
            # SECURITY NOTE(review): sympify() evaluates its string argument;
            # the SymPy docs warn against feeding it unsanitized input.
            equation = sp.Eq(sp.sympify(parts[0].strip()), sp.sympify(parts[1].strip()))
            solutions = sp.solve(equation)
            # An empty solution list is still reported rather than dropped.
            return f"সমীকৰণ: {equation}\n\nসমাধান: {solutions}"
        except Exception as e:
            return f"গণিত সমাধানত সমস্যা: {e}"

    def _find_relevant_chunks(self, question, k=3):
        """Return up to ``k`` corpus chunks nearest to ``question`` (``[]`` on any failure)."""
        if not self.corpus_chunks:
            return []
        if self.index is None or not self.embedding_model:
            return []
        try:
            q_vec = self.embedding_model.encode([question]).astype("float32")
            # Never request more neighbours than there are chunks.
            _, hits = self.index.search(q_vec, min(k, len(self.corpus_chunks)))
            return [self.corpus_chunks[i] for i in hits[0] if 0 <= i < len(self.corpus_chunks)]
        except Exception:
            # Retrieval is best effort; the caller answers without context.
            return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
def extract_text_from_image(uploaded_file):
    """Run OCR on an uploaded image and return the stripped text.

    Any failure (unreadable image, OCR error) yields ``""`` so the caller can
    simply skip the OCR part of the question.
    """
    try:
        # The context manager releases the image's file handle after OCR.
        with Image.open(uploaded_file) as image:
            return pytesseract.image_to_string(image).strip()
    except Exception:
        return ""
150
-
151
- # Main app
152
def main():
    """Render the page: sidebar (login, image upload, tips), chat history, chat input.

    NOTE(review): nesting was reconstructed from a flattened diff; confirm
    that the upload widget and tips belong inside the sidebar.
    """
    st.title("🧭 জাজাবৰ – SEBA Class 10 AI Tutor")

    # Sidebar
    with st.sidebar:
        st.header("👤 লগিন")
        username = st.text_input("আপোনাৰ নাম", value=st.session_state.username)
        if username and username != st.session_state.username:
            st.session_state.username = username
            st.success(f"লগিন successful: {username}")

        st.header("📷 ছবিৰ পৰা পাঠ")
        uploaded_image = st.file_uploader("ছবি আপলোড কৰক", type=['png', 'jpg', 'jpeg'])

        st.header("💡 টিপছ")
        st.info("""
        নাম লিখি প্ৰশ্ন সোধক
        ছবি আপলোড কৰিলে OCR ৰ সহায়ত পাঠ পঢ়িব
        বিষয়সমূহ:
        অসমীয়া
        ইংৰাজী
        গণিত
        বিজ্ঞান
        সামাজিক বিজ্ঞান
        """)

    # Initialize tutor once per session; construction loads models and PDFs.
    if st.session_state.tutor is None:
        with st.spinner('জাজাবৰক সাজু কৰি থকা হৈছে...'):
            st.session_state.tutor = SimpleTutor()

    # Main chat area
    st.header("💬 জাজাবৰৰ সৈতে কথোপকথন")

    # Display chat history
    for question, answer in st.session_state.chat_history:
        with st.chat_message("user"):
            st.write(question)
        with st.chat_message("assistant"):
            st.write(answer)

    # Chat input
    if prompt := st.chat_input("আপোনাৰ প্ৰশ্ন ইয়াত লিখক..."):
        if not st.session_state.username:
            st.error("⚠️ প্ৰথমে আপোনাৰ নাম লিখক")
            st.stop()

        # Process OCR if image uploaded; OCR text is appended to the question
        # sent to the tutor but not to the text shown in the history.
        full_question = prompt
        if uploaded_image:
            ocr_text = extract_text_from_image(uploaded_image)
            if ocr_text:
                full_question += f"\n[ছবিৰ পাঠ: {ocr_text}]"

        # Get AI response
        with st.spinner('জাজাবৰে চিন্তা কৰি আছে...'):
            response = st.session_state.tutor.answer_question(full_question)

        # Record the exchange only once the answer exists, so a failure while
        # answering cannot leave a question with an empty answer in history.
        st.session_state.chat_history.append((prompt, response))

        # Rerun to update display
        st.rerun()

    # Clear chat button
    if st.button("🧹 কথোপকথন পৰিষ্কাৰ কৰক"):
        st.session_state.chat_history = []
        st.rerun()


if __name__ == "__main__":
    main()
 
1
+ # app.py
2
+ """
3
+ Jajabor – Minimal safe version (no FAISS, no torch, no transformers)
4
+ - Retrieval: TF-IDF (scikit-learn)
5
+ - PDF reading: PyPDF2
6
+ - OCR: pytesseract
7
+ - Math: sympy
8
+ - UI: Gradio
9
+ """
10
+
11
  import os
12
+ import io
13
  import sqlite3
14
  from datetime import datetime
15
+ import traceback
16
+
17
+ from PyPDF2 import PdfReader
18
+ from PIL import Image
19
+ import gradio as gr
20
  import numpy as np
21
+
22
+ from sklearn.feature_extraction.text import TfidfVectorizer
23
+ from sklearn.metrics.pairwise import linear_kernel
24
+
25
  import pytesseract
 
26
  import sympy as sp
 
27
 
28
+ # ---------- CONFIG ----------
29
APP_NAME = "Jajabor – Minimal (TF-IDF retrieval)"

# Data locations are resolved relative to this file, not the working directory.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
PDF_DIR = os.path.join(BASE_DIR, "pdfs", "class10")
DB_PATH = os.path.join(BASE_DIR, "jajabor_users.db")

# Chunking is done by character count; TOP_K is the number of chunks retrieved.
CHUNK_SIZE = 600
CHUNK_OVERLAP = 120
TOP_K = 3
37
+
38
+ # ---------- DB ----------
39
def init_db(path=DB_PATH):
    """Create the SQLite database at ``path`` with its two tables if missing.

    ``users`` holds one row per unique username; ``interactions`` holds one
    row per logged question/answer pair. Both statements use
    ``IF NOT EXISTS``, so calling this repeatedly is harmless.
    """
    parent = os.path.dirname(path)
    if parent:
        # A bare filename has no directory part, and makedirs("") would raise.
        os.makedirs(parent, exist_ok=True)
    conn = sqlite3.connect(path)
    try:
        cur = conn.cursor()
        cur.execute("""CREATE TABLE IF NOT EXISTS users (id INTEGER PRIMARY KEY AUTOINCREMENT, username TEXT UNIQUE, created_at TEXT)""")
        cur.execute("""CREATE TABLE IF NOT EXISTS interactions (id INTEGER PRIMARY KEY AUTOINCREMENT, user_id INTEGER, timestamp TEXT, query TEXT, answer TEXT, is_math INTEGER)""")
        conn.commit()
    finally:
        # Close even when a statement fails so the handle is not leaked.
        conn.close()
47
+
48
def get_or_create_user(username):
    """Return the id of ``username`` in the ``users`` table, inserting it if new.

    Returns ``None`` when ``username`` is ``None``, empty or only whitespace;
    the database is not touched in that case.
    """
    username = (username or "").strip()
    if not username:
        return None
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute("SELECT id FROM users WHERE username=?", (username,))
        row = cur.fetchone()
        if row:
            return row[0]
        cur.execute("INSERT INTO users (username, created_at) VALUES (?, ?)", (username, datetime.utcnow().isoformat()))
        conn.commit()
        return cur.lastrowid
    finally:
        # Runs on every exit path, including the early return and errors.
        conn.close()
64
+
65
def log_interaction(user_id, query, answer, is_math):
    """Append one question/answer row to the ``interactions`` table.

    ``is_math`` is stored as 1 or 0; the timestamp is the current UTC time in
    ISO-8601 form.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(
            "INSERT INTO interactions (user_id, timestamp, query, answer, is_math) VALUES (?, ?, ?, ?, ?)",
            (user_id, datetime.utcnow().isoformat(), query, answer, 1 if is_math else 0),
        )
        conn.commit()
    finally:
        # Close even when the insert fails so the handle is not leaked.
        conn.close()
72
+
73
# Create the tables at import time so later queries can assume they exist.
init_db()
74
+
75
+ # ---------- PDF reading ----------
76
def extract_text_from_pdf(pdf_path):
    """Return the text of every readable page of ``pdf_path``, newline-joined.

    Best effort: a page that fails to extract is reported and skipped, and a
    file that cannot be opened at all yields ``""``.
    """
    pages = []
    try:
        reader = PdfReader(pdf_path)
        for page_no, page in enumerate(reader.pages, start=1):
            try:
                # extract_text() may return None; store an empty string then.
                pages.append(page.extract_text() or "")
            except Exception as e:
                # Report which page was dropped instead of skipping silently.
                print(f"PDF page {page_no} skipped ({pdf_path}):", e)
    except Exception as e:
        print("PDF read error:", e)
    return "\n".join(pages)
89
+
90
def load_all_pdfs(pdf_dir):
    """Read every ``*.pdf`` file directly inside ``pdf_dir``.

    Returns ``(texts, metas)``: two parallel lists where ``metas[i]`` is
    ``{"source": <file name>}`` for ``texts[i]``. Files are processed in
    sorted name order. A missing directory yields two empty lists.
    """
    texts = []
    metas = []
    if not os.path.isdir(pdf_dir):
        print("PDF_DIR not found:", pdf_dir)
        return texts, metas
    for fname in sorted(os.listdir(pdf_dir)):
        path = os.path.join(pdf_dir, fname)
        # Only regular files: a sub-directory named "x.pdf" is not a PDF.
        if not fname.lower().endswith(".pdf") or not os.path.isfile(path):
            continue
        print("Reading:", path)
        texts.append(extract_text_from_pdf(path))
        metas.append({"source": fname})
    return texts, metas
104
+
105
def split_text_into_chunks(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """Split ``text`` into overlapping character windows.

    Each window is ``chunk_size`` characters long and starts
    ``chunk_size - overlap`` characters after the previous one (at least 1,
    so the loop always advances). Whitespace-only windows are dropped.
    Returns ``[]`` for empty text or a non-positive ``chunk_size``.
    """
    if not text or chunk_size <= 0:
        return []
    step = max(chunk_size - overlap, 1)
    chunks = []
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
        if start + chunk_size >= len(text):
            # This window already reaches the end of the text; any further
            # window would lie entirely inside it and only add a duplicate.
            break
    return chunks
117
+
118
+ # ---------- Build TF-IDF index ----------
119
print("Loading PDFs and building TF-IDF index...")
all_texts, all_metas = load_all_pdfs(PDF_DIR)

# Flatten all documents into chunks, repeating each document's metadata once
# per chunk so corpus_chunks[i] and corpus_metas[i] describe the same passage.
corpus_chunks = []
corpus_metas = []
for text, meta in zip(all_texts, all_metas):
    chs = split_text_into_chunks(text)
    corpus_chunks.extend(chs)
    corpus_metas.extend([meta] * len(chs))

# Both stay None when there is nothing to index or fitting fails;
# retrieve_tfidf() checks for that and returns no results.
vectorizer = None
tfidf_matrix = None
if not corpus_chunks:
    print("No PDF chunks found. Upload PDFs into pdfs/class10/")
else:
    try:
        vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)
        tfidf_matrix = vectorizer.fit_transform(corpus_chunks)
        print("TF-IDF ready. Chunks:", len(corpus_chunks))
    except Exception as e:
        print("Failed to build TF-IDF:", e)
        # Reset both so a half-built vectorizer is never used for queries.
        vectorizer = None
        tfidf_matrix = None
142
+
143
def retrieve_tfidf(query, top_k=TOP_K):
    """Return up to ``top_k`` corpus chunks most similar to ``query``.

    Each result is ``{"score": float, "text": str, "meta": dict}``, ordered
    by decreasing similarity. Chunks with a similarity of zero or less are
    left out, so the list may be shorter than ``top_k``. Returns ``[]`` when
    no index was built, the query is blank, or ``top_k`` is not positive.
    """
    if tfidf_matrix is None or vectorizer is None:
        return []
    if not query or not query.strip() or top_k <= 0:
        return []
    sims = linear_kernel(vectorizer.transform([query]), tfidf_matrix).flatten()
    best = sims.argsort()[::-1][:top_k]  # indices of the highest scores first
    return [
        {"score": float(sims[idx]), "text": corpus_chunks[idx], "meta": corpus_metas[idx]}
        for idx in best
        if sims[idx] > 0
    ]
155
+
156
+ # ---------- OCR and math ----------
157
def ocr_from_image(img: Image.Image):
    """Return the stripped OCR text of ``img``, or ``""`` when OCR fails.

    OCR is tried first with ``lang="asm+eng"`` and, if that raises, once more
    with pytesseract's default language.
    """
    try:
        img = img.convert("RGB")
    except Exception:
        pass  # conversion is optional; OCR is attempted on the image as given

    for ocr_kwargs in ({"lang": "asm+eng"}, {}):
        try:
            return pytesseract.image_to_string(img, **ocr_kwargs).strip()
        except Exception:
            continue
    return ""
170
+
171
def is_likely_math(text: str) -> bool:
    """Heuristically decide whether ``text`` contains a math problem.

    True when the text contains
    * an operator character (``+ - * / = ^ %``), or
    * an ASCII digit right next to a parenthesis, e.g. ``sqrt(16)``, or
    * one of the math keywords (matched case-insensitively).

    A bare number is deliberately not enough: questions such as
    "Class 10 science" mention digits without being math.
    """
    if not text:
        return False
    if any(op in text for op in "+-*/=^%"):
        return True
    digits = "0123456789"
    if any(
        (a == "(" and b in digits) or (a in digits and b == ")")
        for a, b in zip(text, text[1:])
    ):
        return True
    lowered = text.lower()
    kws = ["গণিত", "সমীকৰণ", "বীজগণিত", "math", "solve", "equation"]
    return any(k in lowered for k in kws)
179
+
180
def solve_math_expression(expr: str):
    """Solve an equation or simplify an expression and return a display string.

    ``^`` is treated as exponentiation. Text containing ``=`` is split at the
    first ``=`` and solved as an equation; anything else is simplified.
    Blank or unparseable input yields a fixed "could not understand" message.
    """
    failure = "গণিতীয় অভিব্যক্তি বুজা যায় নাই — দয়া কৰি সঠিকভাৱে লিখক।"
    if not expr or not expr.strip():
        return failure
    try:
        # SECURITY NOTE(review): sympify() evaluates its string argument and
        # the SymPy docs warn against using it on unsanitized input. ``expr``
        # comes from the chat box / OCR, so consider a restricted parser.
        expr = expr.replace("^", "**")
        if "=" in expr:
            left, right = expr.split("=", 1)
            eq = sp.Eq(sp.sympify(left), sp.sympify(right))
            return "ধাপ-ধাপে সমাধান (সংক্ষেপ):\n" + str(sp.solve(eq))
        return f"সরলীকৰণ: {sp.simplify(sp.sympify(expr))}"
    except Exception:
        return failure
193
+
194
+ # ---------- Answering (extractive) ----------
195
def answer_with_retrieval(query, chat_history):
    """Build an extractive answer from the top retrieved chunks.

    Each retrieved chunk is prefixed with its source file name. Chunks longer
    than 800 characters are cut at 800, then trimmed back to the last newline
    (if any) and marked with an ellipsis. ``chat_history`` is accepted but
    not used. Returns a fixed "nothing found" message when retrieval is empty.
    """
    results = retrieve_tfidf(query, top_k=TOP_K)
    if not results:
        return "পাঠ্যপুথি সম্বন্ধীয় তথ্য নহল; দয়া কৰি অধিক স্পষ্টকৈ সোধক।"

    answer_parts = []
    for r in results:
        txt = r["text"].strip()
        if len(txt) > 800:
            txt = txt[:800].rsplit("\n", 1)[0] + "…"
        answer_parts.append(f"[Source: {r['meta'].get('source','textbook')}] \n{txt}")
    return "\n\n".join(answer_parts)
207
+
208
+ # ---------- Chat logic ----------
209
def login_user(username, user_state):
    """Log ``username`` in and return ``(user_state, stats_text)``.

    A blank name leaves ``user_state`` untouched and returns a warning.
    Otherwise the user is looked up or created, ``user_state`` becomes
    ``{"username": ..., "user_id": ...}``, and the stats text reports how many
    interactions (and how many math ones) are logged for that user. If the
    stats query fails the counts fall back to 0 and login still succeeds.
    """
    username = (username or "").strip()
    if not username:
        return user_state, "⚠️ অনুগ্ৰহ কৰি লগিনৰ বাবে এটা নাম লিখক।"

    user_id = get_or_create_user(username)
    user_state = {"username": username, "user_id": user_id}

    total, math_count = 0, 0
    try:
        conn = sqlite3.connect(DB_PATH)
        try:
            row = conn.execute(
                "SELECT COUNT(*), SUM(is_math) FROM interactions WHERE user_id=?",
                (user_id,),
            ).fetchone()
        finally:
            conn.close()
        if row:
            total = row[0] or 0
            # SUM() is NULL (None) when the user has no interactions yet.
            math_count = row[1] or 0
    except sqlite3.Error as e:
        print("Stats query failed:", e)

    stats = f"👤 {username}\n📊 মোট প্ৰশ্ন: {total}\n🧮 গণিত: {math_count}"
    return user_state, stats
222
+
223
def chat_logic(username, text_input, image_input, audio_input, chat_history, user_state):
    """Handle one chat turn and return ``(chat_history, user_state, "")``.

    The question is the OCR text of ``image_input`` (if any) followed by the
    typed ``text_input``. Math-looking questions get the retrieval answer plus
    the solver output; all others get the retrieval answer alone. Each
    answered turn is logged for the logged-in user. The returned history is
    a new list; the list passed in is not mutated. ``username`` and
    ``audio_input`` are accepted but not used. The third return value is
    always ``""``.
    """
    if chat_history is None:
        chat_history = []

    if not user_state or not user_state.get("user_id"):
        sys_msg = "⚠️ প্ৰথমে লগিন কৰক।"
        return chat_history + [[text_input or "", sys_msg]], user_state, ""

    user_id = user_state["user_id"]
    # Whitespace-only input is treated the same as no input.
    question = (text_input or "").strip()
    final_query_parts = []

    ocr_text = ""
    if image_input:
        try:
            if isinstance(image_input, str):
                img = Image.open(image_input)  # a file path
            else:
                img = Image.open(io.BytesIO(image_input.read()))  # a file-like object
            ocr_text = ocr_from_image(img)
        except Exception:
            # OCR is optional; continue with the typed text but leave a trace.
            traceback.print_exc()
        if ocr_text:
            final_query_parts.append(ocr_text)

    if question:
        final_query_parts.append(question)

    if not final_query_parts:
        sys_msg = "⚠️ অনুগ্ৰহ কৰি প্ৰশ্ন লিখক বা ছবি আপলোড কৰক।"
        return chat_history + [["", sys_msg]], user_state, ""

    full_query = "\n".join(final_query_parts)

    is_math_flag = is_likely_math(full_query)
    if is_math_flag:
        math_answer = solve_math_expression(full_query)
        # Supporting textbook passages first, then the solver's result.
        retrieval = answer_with_retrieval(full_query, chat_history)
        final_answer = f"{retrieval}\n\nগণিত সমাধান:\n{math_answer}"
    else:
        final_answer = answer_with_retrieval(full_query, chat_history)

    log_interaction(user_id, full_query, final_answer, is_math_flag)
    display_q = question or ocr_text or "(image)"
    return chat_history + [[display_q, final_answer]], user_state, ""
272
+
273
+ # ---------- Gradio UI ----------
274
with gr.Blocks(title=APP_NAME) as demo:
    gr.Markdown("# 🧭 Jajabor – Minimal TF-IDF Tutor (Free)")

    # Per-session login info; login_user() replaces it with
    # {"username": ..., "user_id": ...}.
    user_state = gr.State({})

    with gr.Row():
        with gr.Column(scale=1):
            username_inp = gr.Textbox(label="নাম / ইউজাৰ আইডি", placeholder="e.g. abu10")
            login_btn = gr.Button("Login")
            stats_md = gr.Markdown("এতিয়ালৈকে লগিন হোৱা নাই।")
        with gr.Column(scale=3):
            chat = gr.Chatbot(label="জাজাবৰ", height=480)
            text_inp = gr.Textbox(label="আপোনাৰ প্ৰশ্ন লিখক", lines=2)
            with gr.Row():
                image_inp = gr.Image(label="📷 ছবি (Optional)", type="filepath")
                audio_inp = gr.Audio(label="🎙️ (Optional)", type="filepath")
            ask_btn = gr.Button("সোধক")

    login_btn.click(login_user, inputs=[username_inp, user_state], outputs=[user_state, stats_md])

    # chat_logic() returns (history, state, ""). Every output must be a
    # component, so the trailing "" goes to the question box and clears it
    # after each turn (the original passed None there).
    chat_inputs = [username_inp, text_inp, image_inp, audio_inp, chat, user_state]
    chat_outputs = [chat, user_state, text_inp]
    ask_btn.click(chat_logic, inputs=chat_inputs, outputs=chat_outputs)
    text_inp.submit(chat_logic, inputs=chat_inputs, outputs=chat_outputs)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)