File size: 4,173 Bytes
5545a5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# app.py — Hugging Face Space (Gradio) using a prebuilt Chroma index
# Embeddings: jinaai/jina-embeddings-v3 (HF), trust_remote_code=True, normalize_embeddings=True

import os
import gradio as gr

# Turn off Chroma's telemetry chatter; this runs ahead of the chromadb imports that follow.
os.environ.update({"CHROMA_TELEMETRY_DISABLED": "1"})

from chromadb.config import Settings
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# -------- Settings (each may be overridden through the Space "Variables") --------
# Directory that holds the committed Chroma index.
PERSIST_DIR = os.environ.get("PERSIST_DIR", "./chroma_langchain")
# Identifier of the embedding model.
EMB_MODEL = os.environ.get("EMB_MODEL", "jinaai/jina-embeddings-v3")
# Default number of hits; the raw environment value is parsed as an integer.
TOPK_DEF = int(os.environ.get("TOPK", "5"))

# Query-time encoder. It has to be the same model that was used when the index
# was built, otherwise query vectors and stored vectors are not comparable.
EMBEDDINGS = HuggingFaceEmbeddings(
    encode_kwargs=dict(normalize_embeddings=True),
    model_kwargs=dict(trust_remote_code=True),
    model_name=EMB_MODEL,
)

def load_vector_store():
    """
    Open the persisted Chroma collection and attach the query-time embedding function.

    Ordinary exceptions are not propagated: every failure is turned into a
    human-readable message so the UI can display it instead of crashing.

    Returns:
        tuple: ``(vs, None)`` when the store was opened and holds at least one
        document, otherwise ``(None, error_message)``. The message lists the
        collections found in the index directory (when they can be listed)
        followed by the underlying error.
    """
    try:
        # A persistent Chroma client initialises a brand-new, empty store when
        # the path does not exist. That would hide a missing index behind a
        # permanent "No results.", so refuse to continue without the directory.
        if not os.path.isdir(PERSIST_DIR):
            raise FileNotFoundError(f"index directory '{PERSIST_DIR}' does not exist")
        vs = Chroma(
            persist_directory=PERSIST_DIR,
            embedding_function=EMBEDDINGS,
            client_settings=Settings(anonymized_telemetry=False),
        )
        # Counting forces the collection open. An empty collection typically
        # means the expected collection was not found and a fresh one was
        # created, so report it instead of serving an empty index.
        if vs._collection.count() == 0:
            raise ValueError("the collection was opened but contains no documents")
        return vs, None
    except Exception as e:
        # Helpful diagnostics: list the collections that actually exist.
        existing = []
        # Only inspect a directory that is really there; otherwise the
        # diagnostic client itself would create it.
        if os.path.isdir(PERSIST_DIR):
            try:
                import chromadb
                client = chromadb.PersistentClient(
                    path=PERSIST_DIR, settings=Settings(anonymized_telemetry=False)
                )
                # Depending on the chromadb release, list_collections() yields
                # Collection objects or plain name strings; accept both.
                existing = [getattr(c, "name", c) for c in client.list_collections()]
            except Exception:
                # Diagnostics are best effort; the primary error is reported
                # below regardless.
                existing = []
        msg = (
            f"Failed to load Chroma store at '{PERSIST_DIR}'. "
            f"Existing collections: {existing or '—'}. "
            "Check that the index folder is present in the Space and the collection name matches."
        )
        return None, f"{msg}\n\nDetails: {e}"

# Open the index once at import time. VS is the store (or None on failure) and
# LOAD_ERR the failure message (or None on success), as returned by load_vector_store().
VS, LOAD_ERR = load_vector_store()

def search(query: str, k: int = TOPK_DEF):
    """
    Run a semantic search against the loaded index and render the hits as Markdown.

    Args:
        query: Free-text query; surrounding whitespace is ignored.
        k: Number of hits to request (converted with ``int``).

    Returns:
        str: Markdown with the ranked hits, or a short message when the index
        failed to load, the query is empty, the search raised, or nothing matched.
    """
    if LOAD_ERR:
        return f"⚠️ {LOAD_ERR}"
    q = (query or "").strip()
    if not q:
        return "Please enter a query."
    try:
        results = VS.similarity_search_with_score(q, k=int(k))
    except Exception as e:
        return f"Search failed: {e}"
    if not results:
        return "No results."

    lines = [f"### Top {len(results)} results"]
    for i, (doc, score) in enumerate(results, 1):
        text = doc.page_content
        # Cap long documents at 800 characters and mark the cut with an ellipsis.
        snippet = (text[:800] + "…") if len(text) > 800 else text
        # Prefix every line so multi-paragraph text stays inside the blockquote.
        quoted = "> " + snippet.replace("\n", "\n> ")
        # similarity_search_with_score reports a distance, not a similarity:
        # smaller values mean a closer match, so label it accordingly.
        lines.append(f"**[{i}]**  \nDistance: `{score:.4f}` (lower = more similar)\n\n{quoted}")
    lines.append("\n> **Disclaimer:** Models can produce incorrect or misleading statements. Verify with sources.")
    return "\n\n".join(lines)

# Build the UI: an introductory Markdown panel, a query box with a Top-K slider,
# and a Markdown output area that is filled by search() when the button is clicked.
with gr.Blocks(title="Semantische Suchmaschine für BGH Leitsatzentscheidungen v0.1") as demo:
    # Introductory text shown above the controls (user-facing, German).
    gr.Markdown(
        """

        ## Semantische Suchmaschine für BGH Leitsatzentscheidungen v0.1

        **Datensatz: 21.603 Leitsatzentscheidungen des BGH (ab dem Jahr 2000) extrahiert aus https://zenodo.org/records/15153244**



        **Modell:** jinaai/jina-embeddings-v3



        **Wie es funktioniert:** Ermöglicht die semantische Suche im Datensatz und gibt die Entscheidungen geordnet nach Ähnlichkeitswerten zurück.



        **Versuche beispielsweise:**

        - `Kann KI Erfinder sein?` → erwartetes Aktenzeichen **X ZB 5/22**



        *Disclaimer:* Models may produce incorrect or misleading statements. Verify with sources.

        """
    )
    # Query text and number of hits share one row.
    with gr.Row():
        q = gr.Textbox(label="Query", placeholder="Kann KI Erfinder sein?")
        k = gr.Slider(1, 20, value=TOPK_DEF, step=1, label="Top-K")
    # search() returns Markdown, so the result area is a Markdown component.
    out = gr.Markdown()
    gr.Button("Search").click(fn=search, inputs=[q, k], outputs=[out])

# Start the web server only when this file is executed as a script, so that
# importing the module (e.g. to reuse search()) does not launch the app.
if __name__ == "__main__":
    demo.launch()