Shri committed on
Commit 5038bcb · 1 Parent(s): ac6fa79

feat: tokenization and semantic search endpoints

alembic/versions/dd61202db14f_add_knowledgebase_chunk.py ADDED
@@ -0,0 +1,33 @@
+ """add: knowledgebase,chunk
+
+ Revision ID: dd61202db14f
+ Revises: b33e3b5b7af9
+ Create Date: 2025-11-17 23:28:11.537932
+
+ """
+ from typing import Sequence, Union
+
+ from alembic import op
+ import sqlalchemy as sa
+ import sqlmodel.sql.sqltypes
+
+
+ # revision identifiers, used by Alembic.
+ revision: str = 'dd61202db14f'
+ down_revision: Union[str, Sequence[str], None] = 'b33e3b5b7af9'
+ branch_labels: Union[str, Sequence[str], None] = None
+ depends_on: Union[str, Sequence[str], None] = None
+
+
+ def upgrade() -> None:
+     """Upgrade schema."""
+     # ### commands auto generated by Alembic - please adjust! ###
+     pass
+     # ### end Alembic commands ###
+
+
+ def downgrade() -> None:
+     """Downgrade schema."""
+     # ### commands auto generated by Alembic - please adjust! ###
+     pass
+     # ### end Alembic commands ###
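The auto-generated migration above is still a stub: both upgrade() and downgrade() are pass, so no tables are created yet. Below is a minimal sketch of what the upgrade could look like if it is meant to mirror the models added in src/chatbot/models.py (knowledge_base, and knowledge_chunk with a 768-dimension pgvector column); the actual DDL is an assumption, not part of this commit:

```python
# Hypothetical fill-in for the empty migration, derived from src/chatbot/models.py.
import sqlalchemy as sa
import sqlmodel.sql.sqltypes
from alembic import op
from pgvector.sqlalchemy import Vector


def upgrade() -> None:
    """Upgrade schema."""
    # pgvector must be available in the target database for the Vector column.
    op.execute("CREATE EXTENSION IF NOT EXISTS vector")

    op.create_table(
        "knowledge_base",
        sa.Column("id", sa.Uuid(), primary_key=True),
        sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("description", sqlmodel.sql.sqltypes.AutoString(), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=False),
    )
    op.create_table(
        "knowledge_chunk",
        sa.Column("id", sa.Uuid(), primary_key=True),
        sa.Column("kb_id", sa.Uuid(), sa.ForeignKey("knowledge_base.id"), nullable=False),
        sa.Column("chunk_index", sa.Integer(), nullable=False),
        sa.Column("chunk_text", sqlmodel.sql.sqltypes.AutoString(), nullable=False),
        sa.Column("embedding", Vector(768), nullable=True),
    )


def downgrade() -> None:
    """Downgrade schema."""
    op.drop_table("knowledge_chunk")
    op.drop_table("knowledge_base")
```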
src/chatbot/embedding.py ADDED
@@ -0,0 +1,87 @@
+ # To run this file you need model.onnx_data in the assets/onnx folder; you can obtain it from https://huggingface.co/onnx-community/embeddinggemma-300m-ONNX/tree/main/onnx
+
+ import asyncio
+ import os
+ from typing import List
+
+ import numpy as np
+ import onnxruntime as ort
+ from transformers import AutoTokenizer
+
+ BASE_DIR = os.path.dirname(__file__)
+
+ TOKENIZER_DIR = os.path.abspath(os.path.join(BASE_DIR, "..", "assets", "tokenizer"))
+
+ MODEL_DIR = os.path.abspath(
+     os.path.join(BASE_DIR, "..", "assets", "onnx", "model.onnx")
+ )
+
+
+ class EmbeddingModel:
+     def __init__(self):
+         print(TOKENIZER_DIR)
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             TOKENIZER_DIR, local_files_only=True
+         )
+
+         sess_options = ort.SessionOptions()
+         providers = ["CPUExecutionProvider"]
+
+         self.session = ort.InferenceSession(
+             MODEL_DIR, sess_options, providers=providers
+         )
+
+         self.input_names = [inp.name for inp in self.session.get_inputs()]
+         self.output_names = [out.name for out in self.session.get_outputs()]
+
+     def _run_sync(
+         self, input_ids: np.ndarray, attention_mask: np.ndarray
+     ) -> List[float]:
+         inputs = {}
+
+         if "input_ids" in self.input_names:
+             inputs["input_ids"] = input_ids
+         else:
+             inputs[self.input_names[0]] = input_ids
+
+         if "attention_mask" in self.input_names:
+             inputs["attention_mask"] = attention_mask
+         elif len(self.input_names) > 1:
+             inputs[self.input_names[1]] = attention_mask
+
+         outputs = self.session.run(self.output_names, inputs)
+         emb = outputs[0]
+
+         if emb.ndim == 3:
+             emb_vector = emb.mean(axis=1)[0]
+         elif emb.ndim == 2:
+             emb_vector = emb[0]
+         else:
+             emb_vector = np.asarray(emb).flatten()
+
+         return emb_vector.astype(float).tolist()
+
+     async def embed_text(self, text: str, max_length: int = 512) -> List[float]:
+
+         encoded = self.tokenizer(
+             text,
+             return_tensors="np",
+             truncation=True,
+             padding="longest",
+             max_length=max_length,
+         )
+
+         input_ids = encoded["input_ids"].astype(np.int64)
+         attention_mask = encoded.get("attention_mask", np.ones_like(input_ids)).astype(
+             np.int64
+         )
+
+         loop = asyncio.get_event_loop()
+         vector = await loop.run_in_executor(
+             None, self._run_sync, input_ids, attention_mask
+         )
+
+         return vector
+
+
+ embedding_model = EmbeddingModel()
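A minimal usage sketch for the embedding module above, assuming the tokenizer and ONNX assets are present locally. embed_text is async because ONNX inference is pushed to a thread-pool executor:

```python
import asyncio

from src.chatbot.embedding import embedding_model


async def main() -> None:
    # Embed a single query string into a plain Python list of floats.
    vector = await embedding_model.embed_text("How many leave days do I get?")
    print(len(vector))  # expected to match the Vector(768) column in models.py
    print(vector[:5])   # first few components


if __name__ == "__main__":
    asyncio.run(main())
```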
src/chatbot/models.py CHANGED
@@ -1,2 +1,28 @@
  import uuid
- import sqlmodel
+ from datetime import datetime
+ from typing import List
+
+ from pgvector.sqlalchemy import Vector
+ from sqlalchemy import Column
+ from sqlmodel import Field, Relationship, SQLModel
+
+
+ class KnowledgeBase(SQLModel, table=True):
+     __tablename__ = "knowledge_base"
+     id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
+     name: str = Field(nullable=False)
+     description: str | None = None
+     created_at: datetime = Field(default_factory=datetime.now)
+     knowledge_chunk: List["KnowledgeChunk"] = Relationship(
+         back_populates="knowledge_base"
+     )
+
+
+ class KnowledgeChunk(SQLModel, table=True):
+     __tablename__ = "knowledge_chunk"
+     id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True)
+     kb_id: uuid.UUID = Field(foreign_key="knowledge_base.id", nullable=False)
+     chunk_index: int
+     chunk_text: str
+     embedding: List[float] = Field(sa_column=Column(Vector(768)))
+     knowledge_base: "KnowledgeBase" = Relationship(back_populates="knowledge_chunk")
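Not part of this commit, but for context: because embedding is declared as a pgvector Vector(768) column, pgvector's SQLAlchemy comparators are available on it, so the cosine-distance search done with raw SQL in router.py could also be expressed at the ORM level. A sketch, assuming pgvector's cosine_distance() comparator (which compiles to the <=> operator):

```python
# Hypothetical ORM-level nearest-neighbour query over KnowledgeChunk.embedding.
from sqlmodel import select

from src.chatbot.models import KnowledgeChunk


def nearest_chunks_stmt(query_vec: list[float], top_k: int = 3):
    # Smaller cosine distance means a closer match; LIMIT keeps the top_k rows.
    return (
        select(KnowledgeChunk)
        .order_by(KnowledgeChunk.embedding.cosine_distance(query_vec))
        .limit(top_k)
    )
```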
src/chatbot/router.py CHANGED
@@ -0,0 +1,111 @@
+ import os
+ import shutil
+ import tempfile
+ from typing import Optional
+
+ from fastapi import APIRouter, Depends, File, Form, HTTPException, UploadFile
+ from sqlalchemy import text
+ from sqlmodel.ext.asyncio.session import AsyncSession
+
+ from src.core.database import get_async_session
+
+ from .embedding import embedding_model
+ from .schemas import (
+     SemanticSearchRequest,
+     SemanticSearchResult,
+     TokenizeRequest,
+     TokenizeResponse,
+     UploadKBResponse,
+ )
+ from .service import process_pdf_and_store
+
+ router = APIRouter(prefix="/chatbot", tags=["chatbot"])
+
+
+ # Before hitting this endpoint, make sure model.onnx and model.onnx_data are available in the assets/onnx folder.
+ @router.post("/upload-pdf", response_model=UploadKBResponse)
+ async def upload_pdf(
+     file: UploadFile = File(...),
+     name: str = Form(...),
+     description: Optional[str] = Form(None),
+     session: AsyncSession = Depends(get_async_session),
+ ):
+     if not file.filename.endswith(".pdf"):
+         raise HTTPException(
+             status_code=400, detail="Only PDF files are supported for now."
+         )
+
+     tmp_dir = tempfile.mkdtemp()
+     tmp_path = os.path.join(tmp_dir, file.filename)
+     try:
+         with open(tmp_path, "wb") as out_f:
+             shutil.copyfileobj(file.file, out_f)
+
+         with open(tmp_path, "rb") as fobj:
+             result = await process_pdf_and_store(fobj, name, description, session)
+
+         return UploadKBResponse(
+             kb_id=result["kb_id"],
+             name=result["name"],
+             chunks_stored=result["chunks_stored"],
+         )
+     finally:
+         try:
+             os.remove(tmp_path)
+             os.rmdir(tmp_dir)
+         except Exception:
+             pass
+
+
+ @router.post("/tokenize", response_model=TokenizeResponse)
+ async def tokenize_text(payload: TokenizeRequest):
+     try:
+         encoded = embedding_model.tokenizer(
+             payload.text,
+             return_tensors="np",
+             truncation=True,
+             padding="longest",
+             max_length=512,
+         )
+
+         return TokenizeResponse(
+             input_ids=encoded["input_ids"][0].tolist(),
+             attention_mask=encoded["attention_mask"][0].tolist(),
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @router.post("/semantic-search", response_model=list[SemanticSearchResult])
+ async def semantic_search(
+     payload: SemanticSearchRequest, session: AsyncSession = Depends(get_async_session)
+ ):
+
+     if len(payload.embedding) == 0:
+         raise HTTPException(status_code=400, detail="Embedding cannot be empty.")
+
+     q_vector = payload.embedding
+     top_k = payload.top_k or 3
+
+     sql = text(
+         """
+         SELECT id, kb_id, chunk_text, embedding <=> :query_vec AS score
+         FROM knowledge_chunk
+         ORDER BY embedding <=> :query_vec
+         LIMIT :top_k
+         """
+     )
+
+     rows = await session.exec(sql, params={"query_vec": q_vector, "top_k": top_k})
+     rows = rows.fetchall()
+
+     return [
+         SemanticSearchResult(
+             chunk_id=str(r.id),
+             kb_id=str(r.kb_id),
+             text=r.chunk_text,
+             score=float(r.score),
+         )
+         for r in rows
+     ]
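A rough client-side sketch of exercising the three endpoints above, assuming the API runs at http://localhost:8000 (the URL, file name, and query vector are illustrative only). Note that /semantic-search expects the caller to supply a pre-computed query embedding whose dimensionality matches the stored vectors:

```python
import httpx

BASE_URL = "http://localhost:8000/chatbot"  # assumed local dev server

with httpx.Client(base_url=BASE_URL, timeout=120) as client:
    # 1. Upload a PDF: the server chunks it, embeds each chunk, and stores the vectors.
    with open("handbook.pdf", "rb") as f:  # illustrative file
        upload = client.post(
            "/upload-pdf",
            files={"file": ("handbook.pdf", f, "application/pdf")},
            data={"name": "Employee Handbook", "description": "HR policies"},
        )
    print(upload.json())  # {"kb_id": "...", "name": "...", "chunks_stored": N}

    # 2. Tokenize text with the same tokenizer the embedder uses.
    tokens = client.post("/tokenize", json={"text": "How many leave days do I get?"})
    print(tokens.json()["input_ids"][:10])

    # 3. Semantic search: the embedding must be produced by the caller
    #    (e.g. via embedding_model.embed_text) and match the column dimension (768).
    query_vec = [0.0] * 768  # placeholder vector, illustration only
    hits = client.post("/semantic-search", json={"embedding": query_vec, "top_k": 3})
    for hit in hits.json():
        print(hit["score"], hit["text"][:80])
```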
src/chatbot/schemas.py CHANGED
@@ -0,0 +1,36 @@
+ import uuid
+ from typing import List, Optional
+
+ from pydantic import BaseModel
+
+
+ class UploadKBResponse(BaseModel):
+     kb_id: uuid.UUID
+     name: str
+     chunks_stored: int
+
+
+ class UploadKBRequest(BaseModel):
+     name: str
+     description: Optional[str] = None
+
+
+ class TokenizeRequest(BaseModel):
+     text: str
+
+
+ class TokenizeResponse(BaseModel):
+     input_ids: List[int]
+     attention_mask: List[int]
+
+
+ class SemanticSearchRequest(BaseModel):
+     embedding: List[float]
+     top_k: Optional[int] = 3
+
+
+ class SemanticSearchResult(BaseModel):
+     chunk_id: str
+     kb_id: str
+     text: str
+     score: float
src/chatbot/service.py CHANGED
@@ -1,2 +1,45 @@
- from typing import List
- from uuid import UUID
+ import os
+
+ from sqlmodel.ext.asyncio.session import AsyncSession
+
+ from .embedding import embedding_model
+ from .models import KnowledgeBase, KnowledgeChunk
+ from .utils import (
+     chunk_sentences_with_overlap,
+     extract_text_from_pdf_fileobj,
+     split_into_sentences,
+ )
+
+ DEFAULT_MAX_WORDS = int(os.getenv("CHUNK_MAX_WORDS", "200"))
+ DEFAULT_OVERLAP = int(os.getenv("CHUNK_OVERLAP_WORDS", "40"))
+
+
+ async def process_pdf_and_store(
+     fileobj, kb_name: str, kb_description: str | None, session: AsyncSession
+ ):
+     raw_text = extract_text_from_pdf_fileobj(fileobj)
+
+     sentences = split_into_sentences(raw_text)
+
+     chunks = chunk_sentences_with_overlap(
+         sentences, max_words=DEFAULT_MAX_WORDS, overlap_words=DEFAULT_OVERLAP
+     )
+
+     kb = KnowledgeBase(name=kb_name, description=kb_description)
+     session.add(kb)
+     await session.commit()
+     await session.refresh(kb)
+
+     chunk_objs = []
+     for idx, chunk_text in enumerate(chunks):
+         emb = await embedding_model.embed_text(chunk_text)
+
+         chunk = KnowledgeChunk(
+             kb_id=kb.id, chunk_index=idx, chunk_text=chunk_text, embedding=emb
+         )
+         session.add(chunk)
+         chunk_objs.append(chunk)
+
+     await session.commit()
+
+     return {"kb_id": kb.id, "name": kb_name, "chunks_stored": len(chunk_objs)}
src/chatbot/utils.py CHANGED
@@ -0,0 +1,57 @@
+ import re
+ from typing import List
+ import PyPDF2
+
+
+ def clean_text(text: str) -> str:
+     text = re.sub(r'\s+', ' ', text)
+     text = re.sub(r'\s+([,.!?;:])', r'\1', text)
+     text = re.sub(r'[_\-]{2,}', ' ', text)
+     text = re.sub(r'\.{2,}', '.', text)
+     text = re.sub(r'\s{2,}', ' ', text)
+     return text.strip()
+
+
+ def extract_text_from_pdf_fileobj(fileobj) -> str:
+     reader = PyPDF2.PdfReader(fileobj)
+     all_text = []
+     for page in reader.pages:
+         page_text = page.extract_text()
+         if page_text:
+             all_text.append(page_text)
+     return clean_text(" ".join(all_text))
+
+
+ def split_into_sentences(text: str) -> List[str]:
+     sentence_endings = re.compile(r'(?<=[.!?])\s+')
+     sentences = sentence_endings.split(text)
+     return [s.strip() for s in sentences if s.strip()]
+
+
+ def chunk_sentences_with_overlap(sentences: List[str], max_words: int = 200, overlap_words: int = 40) -> List[str]:
+     chunks = []
+     current = []
+     current_len = 0
+
+     for sentence in sentences:
+         words = sentence.split()
+         wc = len(words)
+
+         if current_len + wc > max_words and current:
+             chunks.append(" ".join(current))
+
+             if overlap_words > 0:
+                 last_words = " ".join(" ".join(current).split()[-overlap_words:])
+                 current = [last_words] if last_words else []
+                 current_len = len(last_words.split())
+             else:
+                 current = []
+                 current_len = 0
+
+         current.append(sentence)
+         current_len += wc
+
+     if current:
+         chunks.append(" ".join(current))
+
+     return chunks
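A small worked example of the chunking helpers above, with deliberately tiny limits so the overlap is visible (the real defaults are 200 words per chunk with a 40-word overlap):

```python
from src.chatbot.utils import chunk_sentences_with_overlap, split_into_sentences

text = (
    "Employees accrue one leave day per month. "
    "Unused days carry over to the next year. "
    "Carry-over is capped at ten days. "
    "Requests must be approved by a manager."
)

sentences = split_into_sentences(text)  # 4 sentences of 6-8 words each

# Force a new chunk after ~15 words, carrying the last 5 words forward.
chunks = chunk_sentences_with_overlap(sentences, max_words=15, overlap_words=5)

for i, chunk in enumerate(chunks):
    print(i, chunk)
# Every chunk after the first starts with the last 5 words of the previous one,
# so context at chunk boundaries is preserved for embedding.
```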
src/main.py CHANGED
@@ -1,12 +1,12 @@
- from src.assets.router import router as assets
- from src.profile.router import router as profile
  from fastapi import FastAPI

+ from src.assets.router import router as assets
  from src.auth.router import router as auth_router
+ from src.chatbot.router import router as chatbot
  from src.core.database import init_db
  from src.home.router import router as home_router
- from src.auth.router import router as auth_router
  from src.leave.router import router as leave
+ from src.profile.router import router as profile

  app = FastAPI(title="Yuvabe App API")

@@ -22,6 +22,8 @@ app.include_router(assets)

  app.include_router(leave)

+ app.include_router(chatbot)
+

  @app.get("/")
  def root():