Spaces:

SandhyaMadhunagula
/

GNN-Knowledge-Graph

Sleeping

App Files Files Community

SandhyaMadhunagula committed 30 days ago

Commit

be89e03

verified ·

1 Parent(s): 0c0e6f2

Upload 5 files

Browse files

Files changed (5) hide show

Dockerfile +28 -0
app.py +174 -0
main.py +0 -0
requirements.txt +95 -0
view_db.py +12 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,28 @@

+# Use an official Python runtime as a parent image.
+# Several pins in requirements.txt (e.g. scipy 1.17, networkx 3.6, scikit-learn 1.8)
+# require Python >= 3.11, while older pins such as pandas 1.5.3 / pyarrow 11.0.0
+# predate Python 3.12, so 3.11 is the interpreter line they have in common.
+FROM python:3.11-slim
+# Set the working directory in the container
+WORKDIR /app
+# Install system dependencies (compiler toolchain for packages without wheels);
+# --no-install-recommends keeps the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Copy the requirements file first so the dependency layer is cached
+COPY requirements.txt .
+# Install the packages specified in requirements.txt. The "+cpu" torch builds
+# pinned there are published on the PyTorch wheel index rather than on PyPI,
+# so that index is added as an extra source.
+RUN pip install --no-cache-dir \
+    --extra-index-url https://download.pytorch.org/whl/cpu \
+    -r requirements.txt
+# Hugging Face Docker Spaces run the container as UID 1000; create that user
+# so the app can write its database, graph HTML and model cache.
+RUN useradd --create-home --uid 1000 user
+# Copy the rest of the application code, owned by the runtime user
+COPY --chown=user:user . .
+# Create the directory for the rendered graph and make /app itself writable
+# (app.py writes graph_database.pkl and static/graph.html at runtime).
+RUN mkdir -p static && chown -R user:user /app
+USER user
+ENV HOME=/home/user
+# Flask apps on Hugging Face Spaces must run on port 7860
+EXPOSE 7860
+# Run app.py when the container launches
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,174 @@

+from flask import Flask, render_template, request, redirect, url_for, session
+import networkx as nx
+from pyvis.network import Network
+import os, re, pickle
+from dotenv import load_dotenv
+from PyPDF2 import PdfReader
+from docx import Document
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
+import csv
+from flask import Response
+import io
+app = Flask(__name__)
+app.secret_key = "secret_key_for_session"
+model_name = "Babelscape/rebel-large"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+load_dotenv() # This loads the variables from .env
+HF_TOKEN = os.getenv("HF_TOKEN")
+rebel_tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+#rebel_tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
+rebel_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=HF_TOKEN, low_cpu_mem_usage=True).to(device)
+DB_FILE = "graph_database.pkl"
+def save_db(graph):
+    with open(DB_FILE, "wb") as f:
+        pickle.dump(graph, f)
+def load_db():
+    if os.path.exists(DB_FILE):
+        try:
+            with open(DB_FILE, "rb") as f:
+                return pickle.load(f)
+        except: return nx.DiGraph()
+    return nx.DiGraph()
+G = load_db()
+def extract_triples(text):
+    inputs = rebel_tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)
+    gen_kwargs = {"max_length": 128, "length_penalty": 0, "num_beams": 1, "num_return_sequences": 1}
+    generated_tokens = rebel_model.generate(**inputs, **gen_kwargs)
+    decoded = rebel_tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)[0]
+    triples = []
+    current_subject, current_relation, current_object = "", "", ""
+    current_state = ""
+    # ADD THESE TWO LINES TO FIX THE "FIRST WORD" PROBLEM
+    clean_decoded = decoded.replace("<s>", "").replace("</s>", "")
+    clean_decoded = clean_decoded.replace("<triplet>", " <triplet> ").replace("<subj>", " <subj> ").replace("<obj>", " <obj> ")
+    # CHANGE THIS LOOP TO USE clean_decoded
+    for token in clean_decoded.split():
+        if token == "<triplet>":
+            current_state = "s"
+            if current_subject and current_relation and current_object:
+                triples.append((current_subject.strip(), current_relation.strip(), current_object.strip()))
+            current_subject, current_relation, current_object = "", "", ""
+        elif token == "<subj>": current_state = "o"
+        elif token == "<obj>": current_state = "r"
+        else:
+            if current_state == "s": current_subject += " " + token
+            elif current_state == "o": current_object += " " + token
+            elif current_state == "r": current_relation += " " + token
+    if current_subject and current_relation and current_object:
+        triples.append((current_subject.strip(), current_relation.strip(), current_object.strip()))
+    return triples
+def visualize_graph():
+    net = Network(height="600px", width="100%", directed=True, bgcolor="#ffffff", font_color="black", cdn_resources='remote')
+    net.force_atlas_2based(gravity=-50, central_gravity=0.01, spring_length=150, damping=0.4)
+    # CRITICAL FIX: Loop through nodes and edges to draw them
+    for node in G.nodes():
+        net.add_node(node, label=node, color="#00d2ff", size=25, shadow={'enabled': True, 'color': 'rgba(0,210,255,0.6)', 'size': 10})
+    for source, target, data in G.edges(data=True):
+        net.add_edge(source, target, label=data.get("label", ""), color="#a29bfe")
+    if not os.path.exists("static"): os.makedirs("static")
+    net.save_graph("static/graph.html")
+@app.route("/", methods=["GET", "POST"])
+def index():
+    global G
+    answer = None
+    user_query = ""
+    text = session.get('user_text', "")
+    if request.method == "POST":
+        # 1. HANDLE FILE UPLOAD OR TEXT BOX
+        if "file" in request.files and request.files["file"].filename != "":
+            file = request.files["file"]
+            ext = file.filename.split('.')[-1].lower()
+            if ext == "pdf":
+                reader = PdfReader(file)
+                text = " ".join([page.extract_text() for page in reader.pages])
+            elif ext == "docx":
+                text = " ".join([p.text for p in Document(file).paragraphs])
+            elif ext == "txt":
+                text = file.read().decode("utf-8")
+        elif "text" in request.form and request.form["text"].strip():
+            text = request.form["text"]
+        # 2. PROCESS DATA (Only if we have new text)
+        if text and "query" not in request.form:
+            session['user_text'] = text
+            sentences = [s.strip() for s in re.split(r'[\n.!?]', text) if len(s.strip()) > 10]
+            print(f"--- 🚀 AI is extracting from {len(sentences)} sentences ---")
+            for i, sent in enumerate(sentences):
+                print(f"📄 Processing {i+1}/{len(sentences)}...")
+                for s, r, o in extract_triples(sent):
+                    G.add_edge(s.title().strip(), o.title().strip(), label=r.strip())
+            save_db(G)
+            visualize_graph()
+        # 3. HANDLE SEARCH QUERY
+        if "query" in request.form:
+            user_query = request.form["query"].strip()
+            keywords = [w.lower() for w in user_query.split() if len(w) > 3]
+            results = []
+            for node in G.nodes():
+                if any(k in node.lower() for k in keywords):
+                    for n in G.successors(node):
+                        results.append(f"<b>{node}</b> {G[node][n]['label']} <b>{n}</b>")
+                    for p in G.predecessors(node):
+                        results.append(f"<b>{p}</b> {G[p][node]['label']} <b>{node}</b>")
+            answer = " • " + "<br> • ".join(list(set(results))[:8]) if results else f"Nothing found for '{user_query}'."
+    db_triples = [{"s": s, "r": d['label'], "o": t} for s, t, d in G.edges(data=True)]
+    return render_template("index.html", answer=answer, graph=os.path.exists("static/graph.html"), user_query=user_query, user_text=text, db_triples=db_triples)
+@app.route("/export_csv")
+def export_csv():
+    # 1. Create a string buffer to hold CSV data
+    output = io.StringIO()
+    writer = csv.writer(output)
+    # 2. Write the Header
+    writer.writerow(['Subject', 'Relationship', 'Object'])
+    # 3. Write the Data from the Graph G
+    for s, t, d in G.edges(data=True):
+        writer.writerow([s, d.get('label', ''), t])
+    # 4. Prepare the response for download
+    output.seek(0)
+    return Response(
+        output,
+        mimetype="text/csv",
+        headers={"Content-disposition": "attachment; filename=knowledge_graph.csv"}
+    )
+@app.route("/clear")
+def clear_db():
+    global G
+    G = nx.DiGraph()
+    session.clear()
+    if os.path.exists(DB_FILE): os.remove(DB_FILE)
+    if os.path.exists("static/graph.html"): os.remove("static/graph.html")
+    return redirect(url_for('index'))
+if __name__ == "__main__":
+    # Bind to all interfaces (0.0.0.0) so the server is reachable from outside
+    # the container, on port 7860.
+    app.run(host="0.0.0.0", port=7860)

main.py ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,95 @@

+accelerate==1.13.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.3
+aiosignal==1.4.0
+annotated-doc==0.0.4
+anyio==4.12.1
+asttokens==3.0.1
+attrs==25.4.0
+blinker==1.9.0
+certifi==2026.1.4
+charset-normalizer==3.4.4
+click==8.3.1
+colorama==0.4.6
+contourpy==1.3.3
+cycler==0.12.1
+datasets==2.14.5
+decorator==5.2.1
+dill==0.3.7
+executing==2.2.1
+filelock==3.20.3
+Flask==3.1.3
+fonttools==4.61.1
+frozenlist==1.8.0
+fsspec==2023.6.0
+h11==0.16.0
+hf-xet==1.3.2
+httpcore==1.0.9
+httpx==0.28.1
+huggingface_hub==1.6.0
+idna==3.11
+ipython==9.10.0
+ipython_pygments_lexers==1.1.1
+itsdangerous==2.2.0
+jedi==0.19.2
+Jinja2==3.1.6
+joblib==1.5.3
+jsonpickle==4.1.1
+kiwisolver==1.4.9
+lxml==6.0.2
+markdown-it-py==4.0.0
+MarkupSafe==3.0.3
+matplotlib==3.10.8
+matplotlib-inline==0.2.1
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.7.1
+multiprocess==0.70.15
+networkx==3.6.1
+numpy==1.26.4
+packaging==26.0
+pandas==1.5.3
+parso==0.8.6
+pillow==12.0.0
+prompt_toolkit==3.0.52
+propcache==0.4.1
+psutil==7.2.2
+pure_eval==0.2.3
+pyarrow==11.0.0
+Pygments==2.19.2
+pyparsing==3.3.2
+PyPDF2==3.0.1
+python-dateutil==2.9.0.post0
+python-docx==1.2.0
+pytz==2025.2
+pyvis==0.3.2
+PyYAML==6.0.3
+regex==2026.1.15
+requests==2.32.5
+rich==14.3.3
+safetensors==0.7.0
+scikit-learn==1.8.0
+scipy==1.17.0
+sentencepiece==0.2.1
+seqeval==1.2.2
+shellingham==1.5.4
+six==1.17.0
+stack-data==0.6.3
+sympy==1.14.0
+threadpoolctl==3.6.0
+tokenizers==0.22.2
+torch==2.10.0+cpu
+torchaudio==2.1.2+cpu
+torchvision==0.16.2+cpu
+tqdm==4.67.3
+traitlets==5.14.3
+transformers==5.3.0
+typer==0.24.1
+typer-slim==0.21.1
+typing_extensions==4.15.0
+tzdata==2025.3
+urllib3==2.6.3
+wcwidth==0.6.0
+Werkzeug==3.1.6
+xxhash==3.6.0
+yarl==1.22.0

view_db.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import pickle
+import pandas as pd
+with open("graph_database.pkl", "rb") as f:
+    G = pickle.load(f)
+# Convert edges to a list of dictionaries
+edge_list = [{"Subject": s, "Relation": d['label'], "Object": t} for s, t, d in G.edges(data=True)]
+# Display as a table
+df = pd.DataFrame(edge_list)
+print(df)