# app.py — Hugging Face Space (Gradio) using a prebuilt Chroma index
# Embeddings: jinaai/jina-embeddings-v3 (HF), trust_remote_code=True, normalize_embeddings=True
import os
import gradio as gr
# Silence Chroma telemetry noise (Chroma reads the ANONYMIZED_TELEMETRY env var)
os.environ["ANONYMIZED_TELEMETRY"] = "False"
from chromadb.config import Settings
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings  # deprecated shim; langchain_huggingface is the successor package
# -------- Config (can be overridden via Space "Variables") --------
PERSIST_DIR = os.getenv("PERSIST_DIR", "./chroma_langchain") # path to your committed Chroma index
EMB_MODEL = os.getenv("EMB_MODEL", "jinaai/jina-embeddings-v3")
TOPK_DEF = int(os.getenv("TOPK", "5"))
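# Example (hypothetical values) — in the Space settings under "Variables":
#   PERSIST_DIR=./chroma_langchain   EMB_MODEL=jinaai/jina-embeddings-v3   TOPK=5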
# Embedding function for query text — must match the model used to build the index
EMBEDDINGS = HuggingFaceEmbeddings(
    model_name=EMB_MODEL,
    model_kwargs={"trust_remote_code": True},
    encode_kwargs={"normalize_embeddings": True},
)
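# For reference: the committed index is assumed to have been built offline with
# the *same* embedder settings, roughly like this hypothetical sketch (the
# builder script is not part of this file):
#
#   from langchain_chroma import Chroma
#   Chroma.from_texts(texts, embedding=EMBEDDINGS, metadatas=metadatas,
#                     persist_directory=PERSIST_DIR)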
def load_vector_store():
    """
    Load the persisted Chroma collection with the embedding function for query-time encoding.
    Returns (vs, error_message_or_None).
    """
    try:
        vs = Chroma(
            persist_directory=PERSIST_DIR,
            embedding_function=EMBEDDINGS,
            client_settings=Settings(anonymized_telemetry=False),
        )
        # Sanity check (forces the collection to open; uses a private attribute)
        _ = vs._collection.count()
        return vs, None
    except Exception as e:
        # Helpful diagnostics: list the collections that actually exist on disk
        try:
            import chromadb
            client = chromadb.PersistentClient(
                path=PERSIST_DIR, settings=Settings(anonymized_telemetry=False)
            )
            existing = [c.name for c in client.list_collections()]
        except Exception:
            existing = []
        msg = (
            f"Failed to load Chroma store at '{PERSIST_DIR}'. "
            f"Existing collections: {existing or '—'}. "
            "Check that the index folder is present in the Space and the collection name matches."
        )
        return None, f"{msg}\n\nDetails: {e}"
VS, LOAD_ERR = load_vector_store()
def search(query: str, k: int = TOPK_DEF):
    if LOAD_ERR:
        return f"⚠️ {LOAD_ERR}"
    q = (query or "").strip()
    if not q:
        return "Please enter a query."
    try:
        results = VS.similarity_search_with_score(q, k=int(k))
    except Exception as e:
        return f"Search failed: {e}"
    if not results:
        return "No results."
    lines = [f"### Top {len(results)} results"]
    for i, (doc, score) in enumerate(results, 1):
        meta = doc.metadata or {}
        src = meta.get("source") or meta.get("file_path") or "(no source)"
        snippet = (doc.page_content[:800] + "…") if len(doc.page_content) > 800 else doc.page_content
        # Chroma returns a distance here (lower = more similar), not a similarity
        lines.append(f"**[{i}]** {src} \nDistance: `{score:.4f}`\n\n> {snippet}")
    lines.append("\n> **Disclaimer:** Models can produce incorrect or misleading statements. Verify with sources.")
    return "\n\n".join(lines)
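# Optional startup smoke test (an addition, not part of the original Space):
# set SMOKE_TEST=1 in the Space variables to log a sample query at boot.
if os.getenv("SMOKE_TEST"):
    print(search("Kann KI Erfinder sein?", k=3))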
with gr.Blocks(title="Semantic Search Engine for BGH Headnote Decisions v0.1") as demo:
    gr.Markdown(
        """
        ## Semantic Search Engine for BGH Headnote Decisions (Leitsatzentscheidungen) v0.1

        **Dataset:** 21,603 BGH headnote decisions (from the year 2000 onward), extracted from https://zenodo.org/records/15153244

        **Model:** jinaai/jina-embeddings-v3

        **How it works:** Runs a semantic search over the dataset and returns the decisions ranked by similarity score.

        **Try, for example:**
        - `Kann KI Erfinder sein?` ("Can AI be an inventor?") → expected docket number **X ZB 5/22**

        *Disclaimer:* Models may produce incorrect or misleading statements. Verify with sources.
        """
    )
    with gr.Row():
        q = gr.Textbox(label="Query", placeholder="Kann KI Erfinder sein?")
        k = gr.Slider(1, 20, value=TOPK_DEF, step=1, label="Top-K")
    out = gr.Markdown()
    gr.Button("Search").click(fn=search, inputs=[q, k], outputs=[out])
demo.launch()