Teapack1 committed on
Commit
9f3c9bf
·
1 Parent(s): ebdb126

FAISS vector db for HF spaces compatibility

Browse files
fast_app.py CHANGED
@@ -7,20 +7,15 @@ from fastapi.templating import Jinja2Templates
7
  from fastapi.staticfiles import StaticFiles
8
  from fastapi.encoders import jsonable_encoder
9
 
10
- from langchain.vectorstores import Chroma
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
 
13
  from langchain.chains import RetrievalQA
14
- from langchain.document_loaders import (
15
- TextLoader,
16
- PyPDFLoader,
17
- DirectoryLoader,
18
- UnstructuredFileLoader,
19
- )
20
- from langchain.document_loaders.csv_loader import CSVLoader
21
  from langchain.llms import OpenAI
22
  from langchain import PromptTemplate
23
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
 
24
 
25
  from ingest import Ingest
26
 
@@ -31,7 +26,7 @@ from ingest import Ingest
31
  # if huggingface_token is None:
32
  # raise ValueError("Hugging Face token is not set in environment variables.")
33
 
34
- openai_api_key = os.getenv("OPENAI_API_KEY")
35
  if openai_api_key is None:
36
  raise ValueError("OAI token is not set in environment variables.")
37
 
@@ -39,8 +34,8 @@ if openai_api_key is None:
39
  app = FastAPI()
40
  templates = Jinja2Templates(directory="templates")
41
  app.mount("/static", StaticFiles(directory="static"), name="static")
42
- english_embedding_model="text-embedding-3-large"
43
- czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en"
44
 
45
  czech_store = "stores/czech_512"
46
  english_store = "stores/english_512"
@@ -55,6 +50,7 @@ ingestor = Ingest(
55
  english_embedding_model=english_embedding_model,
56
  )
57
 
 
58
  def prompt_en():
59
  prompt_template_en = """You are electrical engineer and you answer users ###Question.
60
 
@@ -75,6 +71,7 @@ def prompt_en():
75
  print("\n Prompt ready... \n\n")
76
  return prompt_en
77
 
 
78
  def prompt_cz():
79
  prompt_template_cz = """Jste elektroinženýr a odpovídáte uživatelům na ###Otázku.
80
 
@@ -144,7 +141,7 @@ async def get_response(query: str = Form(...), language: str = Form(...)):
144
  model=embedding_model,
145
  )
146
 
147
- vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
148
  retriever = vectordb.as_retriever(search_kwargs={"k": 3})
149
 
150
  chain_type_kwargs = {"prompt": prompt}
 
7
  from fastapi.staticfiles import StaticFiles
8
  from fastapi.encoders import jsonable_encoder
9
 
10
+ from langchain_community.vectorstores import FAISS
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
 
13
  from langchain.chains import RetrievalQA
14
+
 
 
 
 
 
 
15
  from langchain.llms import OpenAI
16
  from langchain import PromptTemplate
17
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
18
+ import chainlit as cl
19
 
20
  from ingest import Ingest
21
 
 
26
  # if huggingface_token is None:
27
  # raise ValueError("Hugging Face token is not set in environment variables.")
28
 
29
+ openai_api_key = "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"
30
  if openai_api_key is None:
31
  raise ValueError("OAI token is not set in environment variables.")
32
 
 
34
  app = FastAPI()
35
  templates = Jinja2Templates(directory="templates")
36
  app.mount("/static", StaticFiles(directory="static"), name="static")
37
+ english_embedding_model = "text-embedding-3-large"
38
+ czech_embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
39
 
40
  czech_store = "stores/czech_512"
41
  english_store = "stores/english_512"
 
50
  english_embedding_model=english_embedding_model,
51
  )
52
 
53
+
54
  def prompt_en():
55
  prompt_template_en = """You are electrical engineer and you answer users ###Question.
56
 
 
71
  print("\n Prompt ready... \n\n")
72
  return prompt_en
73
 
74
+
75
  def prompt_cz():
76
  prompt_template_cz = """Jste elektroinženýr a odpovídáte uživatelům na ###Otázku.
77
 
 
141
  model=embedding_model,
142
  )
143
 
144
+ vectordb = FAISS.load_local(persist_directory, embedding)
145
  retriever = vectordb.as_retriever(search_kwargs={"k": 3})
146
 
147
  chain_type_kwargs = {"prompt": prompt}
ingest.py CHANGED
@@ -1,4 +1,4 @@
1
- from langchain.vectorstores import Chroma
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
 
4
  from langchain.document_loaders import PyPDFLoader, DirectoryLoader
@@ -53,12 +53,11 @@ class Ingest:
53
  )
54
  texts = text_splitter.split_documents(documents)
55
 
56
- vectordb = Chroma.from_documents(
57
  documents=texts,
58
  embedding=embedding,
59
- persist_directory=self.english_store,
60
- collection_metadata={"hnsw:space": "cosine"},
61
  )
 
62
 
63
  print("\n English vector Store Created.......\n\n")
64
 
@@ -84,12 +83,11 @@ class Ingest:
84
  )
85
 
86
  texts = text_splitter.split_documents(documents)
87
- vectordb = Chroma.from_documents(
88
  documents=texts,
89
  embedding=embedding,
90
- persist_directory=self.czech_store,
91
- collection_metadata={"hnsw:space": "cosine"},
92
  )
 
93
 
94
  print("\n Czech vector Store Created.......\n\n")
95
 
 
1
+ from langchain_community.vectorstores import FAISS
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
 
4
  from langchain.document_loaders import PyPDFLoader, DirectoryLoader
 
53
  )
54
  texts = text_splitter.split_documents(documents)
55
 
56
+ vectordb = FAISS.from_documents(
57
  documents=texts,
58
  embedding=embedding,
 
 
59
  )
60
+ vectordb.save_local(self.english_store)
61
 
62
  print("\n English vector Store Created.......\n\n")
63
 
 
83
  )
84
 
85
  texts = text_splitter.split_documents(documents)
86
+ vectordb = FAISS.from_documents(
87
  documents=texts,
88
  embedding=embedding,
 
 
89
  )
90
+ vectordb.save_local(self.czech_store)
91
 
92
  print("\n Czech vector Store Created.......\n\n")
93
 
requirements.txt CHANGED
@@ -1,5 +1,9 @@
1
- langchain
 
 
2
  fastapi
 
 
3
  uvicorn
4
  python-multipart
5
  ctransformers
@@ -9,8 +13,8 @@ sentence_transformers
9
  chromadb
10
  pytesseract
11
  fitz
12
- libpff-python
13
  openai
14
  tiktoken
15
  frontend
16
- pysqlite3-binary
 
1
+
2
+ langchain-community==0.0.19
3
+ langchain==0.1.6
4
  fastapi
5
+ faiss-cpu
6
+ pypdf
7
  uvicorn
8
  python-multipart
9
  ctransformers
 
13
  chromadb
14
  pytesseract
15
  fitz
16
+ #libpff-python
17
  openai
18
  tiktoken
19
  frontend
20
+ chainlit
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/data_level0.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f8157971983f837eca48b97187f0e8a435eb21270cd49d831db21678670bc4a
3
- size 1164000
 
 
 
 
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/header.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a3499aedbeb5c8ea26813ed567be6748293334099aa733c4d8cf0c4ec0ee6e3
3
- size 100
 
 
 
 
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/length.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:612e017796cdd9eef6ba562cbe8c02e16b8c07f3fbac9f1254934f02e2261084
3
- size 4000
 
 
 
 
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/link_lists.bin DELETED
File without changes
stores/czech_512/chroma.sqlite3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2187862ccdfdb78565366853a939dc50038908171936c8584d69a09b55aa4e7c
3
- size 1929216
 
 
 
 
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/data_level0.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f812eacc9c05db367748cf1e0576bdcd28e0b3eaf09d5f3095a1b0e03f71cc8
3
- size 12428000
 
 
 
 
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/header.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9882e5d786d4ca5fba4a783054685cf6e05b1637aaf586e43ec0e933e30e961d
3
- size 100
 
 
 
 
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/index_metadata.pickle DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d49c7e9538b2cfc154773a96a1fcdbf4a4247c3b510bb68d2aa6f2b24e902fca
3
- size 55974
 
 
 
 
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/length.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd6e73e535a8843ce30d35a4ba88436bcb5687583474e276a3b1f8689c1477bd
3
- size 4000
 
 
 
 
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/link_lists.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe35f087195e70122f597edc9b62da9d3ce370b40307b5556ebbe4e185fb46d4
3
- size 8624
 
 
 
 
stores/english_512/chroma.sqlite3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:369ede691a1330113d353e1a425a7cd24ad9d76ee61ee542adab1f12a6887146
3
- size 26963968