Spaces:
Sleeping
Sleeping
Gary
committed on
Commit
·
1b9a516
1
Parent(s):
c384c23
Fetch data from pinecone
Browse files
- app.py +1 -4
- indexer.py +17 -6
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -31,11 +31,8 @@ class CustomRAG:
|
|
| 31 |
|
| 32 |
|
| 33 |
def answer_question(query):
|
| 34 |
-
docs = load_raw_dataset()
|
| 35 |
llm = get_llm("google/flan-t5-base")
|
| 36 |
-
vector_database = create_vector_database(
|
| 37 |
-
docs, "sentence-transformers/all-MiniLM-L6-v2"
|
| 38 |
-
)
|
| 39 |
prompt_template = get_prompt_template()
|
| 40 |
rag = CustomRAG(
|
| 41 |
vector_database,
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
def answer_question(query):
|
|
|
|
| 34 |
llm = get_llm("google/flan-t5-base")
|
| 35 |
+
vector_database = create_vector_database("sentence-transformers/all-MiniLM-L6-v2")
|
|
|
|
|
|
|
| 36 |
prompt_template = get_prompt_template()
|
| 37 |
rag = CustomRAG(
|
| 38 |
vector_database,
|
indexer.py
CHANGED
|
@@ -1,12 +1,20 @@
|
|
|
|
|
| 1 |
from datasets import load_dataset
|
| 2 |
import pandas as pd
|
| 3 |
from langchain.schema import Document
|
| 4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 5 |
-
from
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
| 7 |
from langchain.llms import HuggingFacePipeline
|
| 8 |
from langchain.prompts import PromptTemplate
|
|
|
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def load_raw_dataset():
|
| 12 |
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
|
|
@@ -15,8 +23,6 @@ def load_raw_dataset():
|
|
| 15 |
|
| 16 |
df["combined"] = df["input"] + " " + df["output"]
|
| 17 |
|
| 18 |
-
df = df.sample(n=min(5000, len(df)), random_state=42)
|
| 19 |
-
|
| 20 |
docs = [
|
| 21 |
Document(
|
| 22 |
page_content=row["combined"],
|
|
@@ -28,9 +34,14 @@ def load_raw_dataset():
|
|
| 28 |
return docs
|
| 29 |
|
| 30 |
|
| 31 |
-
def create_vector_database(
|
|
|
|
|
|
|
|
|
|
| 32 |
embedding_model = HuggingFaceEmbeddings(model_name=model_name)
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
return vectorstore
|
| 35 |
|
| 36 |
|
|
|
|
| 1 |
+
from pinecone import Pinecone
|
| 2 |
from datasets import load_dataset
|
| 3 |
import pandas as pd
|
| 4 |
from langchain.schema import Document
|
| 5 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 6 |
+
from transformers import (
|
| 7 |
+
AutoTokenizer,
|
| 8 |
+
pipeline,
|
| 9 |
+
AutoModelForSeq2SeqLM,
|
| 10 |
+
)
|
| 11 |
from langchain.llms import HuggingFacePipeline
|
| 12 |
from langchain.prompts import PromptTemplate
|
| 13 |
+
import os
|
| 14 |
|
| 15 |
+
api_key = os.environ["PINECONE_API_KEY"]
|
| 16 |
+
|
| 17 |
+
from langchain_pinecone import PineconeVectorStore
|
| 18 |
|
| 19 |
def load_raw_dataset():
|
| 20 |
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
|
|
|
|
| 23 |
|
| 24 |
df["combined"] = df["input"] + " " + df["output"]
|
| 25 |
|
|
|
|
|
|
|
| 26 |
docs = [
|
| 27 |
Document(
|
| 28 |
page_content=row["combined"],
|
|
|
|
| 34 |
return docs
|
| 35 |
|
| 36 |
|
| 37 |
+
def create_vector_database(model_name):
|
| 38 |
+
PINECONE_INDEX_NAME = "medical-rag-index"
|
| 39 |
+
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
|
| 40 |
+
|
| 41 |
embedding_model = HuggingFaceEmbeddings(model_name=model_name)
|
| 42 |
+
|
| 43 |
+
index = pc.Index(PINECONE_INDEX_NAME)
|
| 44 |
+
vectorstore = PineconeVectorStore(index=index, embedding=embedding_model)
|
| 45 |
return vectorstore
|
| 46 |
|
| 47 |
|
requirements.txt
CHANGED
|
@@ -7,4 +7,6 @@ faiss-cpu
|
|
| 7 |
huggingface-hub
|
| 8 |
praw
|
| 9 |
langchain-community
|
| 10 |
-
accelerate
|
|
|
|
|
|
|
|
|
| 7 |
huggingface-hub
|
| 8 |
praw
|
| 9 |
langchain-community
|
| 10 |
+
accelerate
|
| 11 |
+
langchain-pinecone
|
| 12 |
+
pinecone
|