File size: 1,989 Bytes
5038bcb
998ba81
5038bcb
998ba81
5038bcb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
998ba81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd50b36
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import os
from uuid import UUID

from sqlmodel import func, select
from sqlmodel.ext.asyncio.session import AsyncSession

from .embedding import embedding_model
from .models import KnowledgeBase, KnowledgeChunk
from .utils import (
    chunk_sentences_with_overlap,
    extract_text_from_pdf_fileobj,
    split_into_sentences,
)

# Chunking parameters, read once from the environment when the module is
# imported. They are handed to chunk_sentences_with_overlap as its
# max_words / overlap_words arguments.
DEFAULT_MAX_WORDS = int(os.environ.get("CHUNK_MAX_WORDS", "200"))
DEFAULT_OVERLAP = int(os.environ.get("CHUNK_OVERLAP_WORDS", "40"))


async def process_pdf_and_store(
    fileobj, kb_name: str, kb_description: str | None, session: AsyncSession
):
    """Extract text from a PDF, chunk it, embed each chunk and store the result.

    The extracted text is split into sentences, grouped into overlapping
    chunks (sized by DEFAULT_MAX_WORDS / DEFAULT_OVERLAP) and every chunk is
    embedded with ``embedding_model.embed_text``. A new ``KnowledgeBase`` row
    and one ``KnowledgeChunk`` row per chunk are then written in one commit.

    Args:
        fileobj: PDF file object passed to ``extract_text_from_pdf_fileobj``.
        kb_name: Name of the knowledge base to create.
        kb_description: Optional description of the knowledge base.
        session: Async database session used for all writes.

    Returns:
        A dict with the keys ``kb_id``, ``name`` and ``chunks_stored``.
    """
    raw_text = extract_text_from_pdf_fileobj(fileobj)
    sentences = split_into_sentences(raw_text)
    chunks = chunk_sentences_with_overlap(
        sentences, max_words=DEFAULT_MAX_WORDS, overlap_words=DEFAULT_OVERLAP
    )

    # Embed before touching the database: if an embedding call fails, no
    # KnowledgeBase row has been written yet, so nothing is left orphaned.
    embedded = [
        (chunk_text, await embedding_model.embed_text(chunk_text))
        for chunk_text in chunks
    ]

    kb = KnowledgeBase(name=kb_name, description=kb_description)
    session.add(kb)
    # flush() emits the INSERT so kb.id is populated while keeping the
    # transaction open; the KB and its chunks are committed together below.
    await session.flush()
    # Capture the id now. After commit the instance may be expired, and
    # reading an expired attribute would require implicit I/O, which an
    # AsyncSession cannot perform from a plain attribute access.
    kb_id = kb.id

    session.add_all(
        KnowledgeChunk(
            kb_id=kb_id, chunk_index=idx, chunk_text=chunk_text, embedding=emb
        )
        for idx, (chunk_text, emb) in enumerate(embedded)
    )
    await session.commit()

    return {"kb_id": kb_id, "name": kb_name, "chunks_stored": len(embedded)}

async def store_manual_text(kb_id: UUID, text: str, session: AsyncSession):
    """Embed a piece of text and append it as a new chunk of a knowledge base.

    Args:
        kb_id: Identifier of the knowledge base the chunk is attached to.
        text: Text to embed and store verbatim as the chunk text.
        session: Async database session used for the lookup and the insert.

    Returns:
        A dict with the keys ``kb_id``, ``chunk_index``, ``status`` (always
        ``"stored"``) and ``text``.
    """
    embedding = await embedding_model.embed_text(text)

    # Ask the database for the highest existing index instead of loading
    # every chunk row (embeddings included) only to count them. Using
    # max + 1 also avoids reusing an index when the existing indices are
    # not contiguous.
    result = await session.execute(
        select(func.max(KnowledgeChunk.chunk_index)).where(
            KnowledgeChunk.kb_id == kb_id
        )
    )
    highest_index = result.scalar()
    # MAX() over zero rows yields NULL, i.e. this is the first chunk.
    next_index = 0 if highest_index is None else highest_index + 1

    new_chunk = KnowledgeChunk(
        kb_id=kb_id,
        chunk_index=next_index,
        chunk_text=text,
        embedding=embedding,
    )

    session.add(new_chunk)
    await session.commit()

    return {
        "kb_id": kb_id,
        "chunk_index": next_index,
        "status": "stored",
        "text": text,
    }