Spaces:
Sleeping
Sleeping
| import os | |
| from uuid import UUID | |
| from sqlmodel.ext.asyncio.session import AsyncSession | |
| from sqlmodel import select | |
| from .embedding import embedding_model | |
| from .models import KnowledgeBase, KnowledgeChunk | |
| from .utils import ( | |
| chunk_sentences_with_overlap, | |
| extract_text_from_pdf_fileobj, | |
| split_into_sentences, | |
| ) | |
# Chunking parameters passed to chunk_sentences_with_overlap. Each one can be
# overridden through an environment variable; the string fallback is parsed
# with int() exactly like a user-supplied value would be.
DEFAULT_MAX_WORDS = int(os.environ.get("CHUNK_MAX_WORDS", "200"))
DEFAULT_OVERLAP = int(os.environ.get("CHUNK_OVERLAP_WORDS", "40"))
async def process_pdf_and_store(
    fileobj, kb_name: str, kb_description: str | None, session: AsyncSession
):
    """Ingest a PDF as a new knowledge base with embedded text chunks.

    The PDF text is extracted, split into sentences and grouped by
    ``chunk_sentences_with_overlap`` (called with
    ``max_words=DEFAULT_MAX_WORDS`` and ``overlap_words=DEFAULT_OVERLAP``).
    Every chunk is embedded with ``embedding_model.embed_text`` and stored as
    a ``KnowledgeChunk`` belonging to a freshly created ``KnowledgeBase``.

    The knowledge base and all of its chunks are written in a single
    transaction: either everything is committed or nothing is.

    Args:
        fileobj: PDF file object handed to ``extract_text_from_pdf_fileobj``.
        kb_name: Name of the knowledge base to create.
        kb_description: Optional description of the knowledge base.
        session: Async database session used for all writes. It is committed
            on success and rolled back on failure.

    Returns:
        A dict with the keys ``"kb_id"``, ``"name"`` and ``"chunks_stored"``.

    Raises:
        Exception: Any error raised while writing to the database is
            re-raised after the session has been rolled back. Errors from
            text extraction or embedding propagate before anything is
            written.
    """
    raw_text = extract_text_from_pdf_fileobj(fileobj)
    sentences = split_into_sentences(raw_text)
    # Materialise the chunks: they are iterated twice below (embedding, rows).
    chunks = list(
        chunk_sentences_with_overlap(
            sentences, max_words=DEFAULT_MAX_WORDS, overlap_words=DEFAULT_OVERLAP
        )
    )

    # Embed before touching the database, so a failing embedding call cannot
    # leave behind a knowledge base without (or with only some of) its chunks.
    # Calls stay sequential, in chunk order, as before.
    embeddings = [await embedding_model.embed_text(chunk_text) for chunk_text in chunks]

    kb = KnowledgeBase(name=kb_name, description=kb_description)
    try:
        session.add(kb)
        # flush() emits the INSERT so kb.id is populated, while keeping the
        # KB row and its chunks inside the same transaction.
        await session.flush()
        # Read the id now: after commit() the instance may be expired, and
        # attribute access would then need implicit I/O, which an async
        # session cannot perform.
        kb_id = kb.id
        session.add_all(
            [
                KnowledgeChunk(
                    kb_id=kb_id, chunk_index=idx, chunk_text=chunk_text, embedding=emb
                )
                for idx, (chunk_text, emb) in enumerate(zip(chunks, embeddings))
            ]
        )
        await session.commit()
    except Exception:
        await session.rollback()
        raise

    return {"kb_id": kb_id, "name": kb_name, "chunks_stored": len(embeddings)}
async def store_manual_text(kb_id: UUID, text: str, session: AsyncSession):
    """Embed ``text`` and append it as a new chunk of knowledge base ``kb_id``.

    The new chunk receives the index following the highest ``chunk_index``
    currently stored for ``kb_id`` (``0`` when the knowledge base has no
    chunks yet).

    This function does not itself check that ``kb_id`` refers to an existing
    knowledge base.

    Args:
        kb_id: Identifier of the knowledge base the chunk belongs to.
        text: Text to embed and store verbatim as the chunk text.
        session: Async database session; committed on success and rolled
            back if the commit fails.

    Returns:
        A dict with the keys ``"kb_id"``, ``"chunk_index"``, ``"status"``
        (always ``"stored"``) and ``"text"``.

    Raises:
        Exception: Any error raised by the commit is re-raised after the
            session has been rolled back.
    """
    # Imported locally so this function does not rely on any module-level
    # import beyond the ones it already used.
    from sqlmodel import func

    embedding = await embedding_model.embed_text(text)

    # Let the database compute the highest index instead of loading every
    # chunk row (embeddings included) just to count them. max + 1 also stays
    # collision-free if chunks were deleted, unlike a plain row count.
    # NOTE(review): two concurrent calls can still read the same maximum;
    # confirm whether the schema enforces uniqueness of (kb_id, chunk_index).
    result = await session.execute(
        select(func.max(KnowledgeChunk.chunk_index)).where(
            KnowledgeChunk.kb_id == kb_id
        )
    )
    last_index = result.scalar_one()  # None when no chunk exists for kb_id
    next_index = 0 if last_index is None else last_index + 1

    session.add(
        KnowledgeChunk(
            kb_id=kb_id,
            chunk_index=next_index,
            chunk_text=text,
            embedding=embedding,
        )
    )
    try:
        await session.commit()
    except Exception:
        await session.rollback()
        raise

    return {
        "kb_id": kb_id,
        "chunk_index": next_index,
        "status": "stored",
        "text": text,
    }