Spaces:
Sleeping
Sleeping
File size: 1,989 Bytes
5038bcb 998ba81 5038bcb 998ba81 5038bcb 998ba81 fd50b36 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import os
from uuid import UUID
from sqlmodel.ext.asyncio.session import AsyncSession
from sqlmodel import select
from .embedding import embedding_model
from .models import KnowledgeBase, KnowledgeChunk
from .utils import (
chunk_sentences_with_overlap,
extract_text_from_pdf_fileobj,
split_into_sentences,
)
# Chunking parameters, overridable through environment variables.
#   CHUNK_MAX_WORDS     -> forwarded as ``max_words`` to the chunker (default 200)
#   CHUNK_OVERLAP_WORDS -> forwarded as ``overlap_words`` to the chunker (default 40)
# Both are parsed with int() at import time, so a non-numeric value raises
# ValueError as soon as the module is loaded.
DEFAULT_MAX_WORDS = int(os.environ.get("CHUNK_MAX_WORDS", "200"))
DEFAULT_OVERLAP = int(os.environ.get("CHUNK_OVERLAP_WORDS", "40"))
async def process_pdf_and_store(
    fileobj, kb_name: str, kb_description: str | None, session: AsyncSession
):
    """Ingest a PDF into a new knowledge base.

    The PDF text is extracted, split into sentences and grouped into
    overlapping chunks. A ``KnowledgeBase`` row is created and committed,
    then every chunk is embedded and stored as a ``KnowledgeChunk`` linked
    to it; all chunk rows are committed together at the end.

    Args:
        fileobj: Object handed to ``extract_text_from_pdf_fileobj``
            (presumably a binary file-like object -- confirm against that
            helper).
        kb_name: Name for the new knowledge base.
        kb_description: Optional description for the new knowledge base.
        session: Async database session used for all writes.

    Returns:
        A dict with keys ``"kb_id"`` (id of the new knowledge base),
        ``"name"`` (``kb_name`` as given) and ``"chunks_stored"`` (number
        of chunk rows added).
    """
    raw_text = extract_text_from_pdf_fileobj(fileobj)
    sentences = split_into_sentences(raw_text)
    chunks = chunk_sentences_with_overlap(
        sentences, max_words=DEFAULT_MAX_WORDS, overlap_words=DEFAULT_OVERLAP
    )

    kb = KnowledgeBase(name=kb_name, description=kb_description)
    session.add(kb)
    await session.commit()
    await session.refresh(kb)

    # Snapshot the primary key while the instance is freshly loaded. A later
    # commit expires ORM attributes when the session uses the default
    # expire_on_commit=True, and reading an expired attribute needs implicit
    # I/O, which an AsyncSession cannot perform outside an awaited call.
    kb_id = kb.id

    chunks_stored = 0
    for idx, chunk_text in enumerate(chunks):
        emb = await embedding_model.embed_text(chunk_text)
        session.add(
            KnowledgeChunk(
                kb_id=kb_id, chunk_index=idx, chunk_text=chunk_text, embedding=emb
            )
        )
        chunks_stored += 1

    # One commit persists every chunk row added above.
    await session.commit()

    return {"kb_id": kb_id, "name": kb_name, "chunks_stored": chunks_stored}
async def store_manual_text(kb_id: UUID, text: str, session: AsyncSession):
    """Embed a piece of text and append it as a new chunk of a knowledge base.

    The new chunk's index is one past the highest ``chunk_index`` already
    stored for ``kb_id`` (0 when there are none), so it never collides with
    an existing chunk even if the stored indices are not contiguous.

    Args:
        kb_id: Id of the knowledge base the chunk is attached to. Its
            existence is not checked here.
        text: Text to embed and store verbatim as the chunk text.
        session: Async database session used for the lookup and the write.

    Returns:
        A dict with keys ``"kb_id"``, ``"chunk_index"`` (index assigned to
        the new chunk), ``"status"`` (always ``"stored"``) and ``"text"``.
    """
    embedding = await embedding_model.embed_text(text)

    # Fetch only the index column: loading whole KnowledgeChunk rows would
    # pull every stored embedding into memory just to pick the next index.
    result = await session.execute(
        select(KnowledgeChunk.chunk_index).where(KnowledgeChunk.kb_id == kb_id)
    )
    used_indices = [i for i in result.scalars().all() if i is not None]
    next_index = max(used_indices, default=-1) + 1

    new_chunk = KnowledgeChunk(
        kb_id=kb_id,
        chunk_index=next_index,
        chunk_text=text,
        embedding=embedding,
    )
    session.add(new_chunk)
    await session.commit()

    return {
        "kb_id": kb_id,
        "chunk_index": next_index,
        "status": "stored",
        "text": text,
    }
|