Spaces:
Sleeping
Sleeping
standard commit
Browse files
- .gitattributes +2 -0
- README.md +4 -4
- fast_app.py +46 -18
- ingest.py +7 -2
- static/dummy.txt +1 -0
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/data_level0.bin +3 -0
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/header.bin +3 -0
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/length.bin +3 -0
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/link_lists.bin +0 -0
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/data_level0.bin +3 -0
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/header.bin +3 -0
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/index_metadata.pickle +3 -0
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/length.bin +3 -0
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/link_lists.bin +3 -0
- templates/index.html +3 -1
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
stores/english_512/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
sdk_version: 2.9.1
|
8 |
python_version: 3.10.4
|
|
|
1 |
---
|
2 |
+
title: RAG Retriever Eng Czech
|
3 |
+
emoji: 📚
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: gray
|
6 |
sdk: gradio
|
7 |
sdk_version: 2.9.1
|
8 |
python_version: 3.10.4
|
fast_app.py
CHANGED
@@ -39,35 +39,61 @@ if openai_api_key is None:
|
|
39 |
app = FastAPI()
|
40 |
templates = Jinja2Templates(directory="templates")
|
41 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
|
|
|
|
42 |
|
43 |
czech_store = "stores/czech_512"
|
44 |
-
english_store = "stores/
|
45 |
|
46 |
ingestor = Ingest(
|
47 |
openai_api_key=openai_api_key,
|
48 |
-
chunk=
|
49 |
-
overlap=
|
50 |
czech_store=czech_store,
|
51 |
english_store=english_store,
|
|
|
|
|
52 |
)
|
53 |
|
54 |
-
|
|
|
55 |
|
56 |
-
|
57 |
-
|
|
|
|
|
58 |
|
59 |
-
Context: {context}
|
60 |
-
Question: {question}
|
61 |
|
62 |
-
Only return the helpful answer below and nothing else.
|
63 |
-
Helpful answer:
|
64 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
69 |
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
|
73 |
@app.get("/", response_class=HTMLResponse)
|
@@ -96,8 +122,9 @@ async def ingest_data(folderPath: str = Form(...), language: str = Form(...)):
|
|
96 |
async def get_response(query: str = Form(...), language: str = Form(...)):
|
97 |
print(language)
|
98 |
if language == "czech":
|
|
|
99 |
print("\n Czech language selected....\n\n")
|
100 |
-
embedding_model =
|
101 |
persist_directory = czech_store
|
102 |
model_name = embedding_model
|
103 |
model_kwargs = {"device": "cpu"}
|
@@ -108,8 +135,9 @@ async def get_response(query: str = Form(...), language: str = Form(...)):
|
|
108 |
encode_kwargs=encode_kwargs,
|
109 |
)
|
110 |
else:
|
|
|
111 |
print("\n English language selected....\n\n")
|
112 |
-
embedding_model =
|
113 |
persist_directory = english_store
|
114 |
embedding = OpenAIEmbeddings(
|
115 |
openai_api_key=openai_api_key,
|
@@ -117,7 +145,7 @@ async def get_response(query: str = Form(...), language: str = Form(...)):
|
|
117 |
)
|
118 |
|
119 |
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
|
120 |
-
retriever = vectordb.as_retriever(search_kwargs={"k":
|
121 |
|
122 |
chain_type_kwargs = {"prompt": prompt}
|
123 |
qa_chain = RetrievalQA.from_chain_type(
|
|
|
39 |
app = FastAPI()
|
40 |
templates = Jinja2Templates(directory="templates")
|
41 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
42 |
+
english_embedding_model="text-embedding-3-large"
|
43 |
+
czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en"
|
44 |
|
45 |
czech_store = "stores/czech_512"
|
46 |
+
english_store = "stores/english_512"
|
47 |
|
48 |
ingestor = Ingest(
|
49 |
openai_api_key=openai_api_key,
|
50 |
+
chunk=512,
|
51 |
+
overlap=256,
|
52 |
czech_store=czech_store,
|
53 |
english_store=english_store,
|
54 |
+
czech_embedding_model=czech_embedding_model,
|
55 |
+
english_embedding_model=english_embedding_model,
|
56 |
)
|
57 |
|
58 |
+
def prompt_en():
|
59 |
+
prompt_template_en = """You are electrical engineer and you answer users ###Question.
|
60 |
|
61 |
+
#Your answer has to be helpful, relevant and closely related to the user's ###Question.
|
62 |
+
#Provide as much literal information and transcription from the #Context as possible.
|
63 |
+
#Only use your own words to connect, clarify or explain the information!
|
64 |
+
#If you don't know the answer, just say that you don't know, don't try to make up an answer.
|
65 |
|
66 |
+
###Context: {context}
|
67 |
+
###Question: {question}
|
68 |
|
69 |
+
Only return the helpful answer below and nothing else.
|
70 |
+
Helpful answer:
|
71 |
+
"""
|
72 |
+
prompt_en = PromptTemplate(
|
73 |
+
template=prompt_template_en, input_variables=["context", "question"]
|
74 |
+
)
|
75 |
+
print("\n Prompt ready... \n\n")
|
76 |
+
return prompt_en
|
77 |
|
78 |
+
def prompt_cz():
|
79 |
+
prompt_template_cz = """Jste elektroinženýr a odpovídáte uživatelům na ###Otázku.
|
80 |
+
|
81 |
+
#Vaše odpověď musí být užitečná, relevantní a úzce souviset s uživatelovou ###Otázkou.
|
82 |
+
#Poskytněte co nejvíce doslovných informací a přepisů z #Kontextu.
|
83 |
+
#Použijte vlastní slova pouze pro spojení, objasnění nebo vysvětlení informací!
|
84 |
+
#Pokud odpověď neznáte, prostě řekněte, že to nevíte, nepokoušejte se vymýšlet odpověď.
|
85 |
|
86 |
+
###Kontext: {context}
|
87 |
+
###Otázka: {question}
|
88 |
+
|
89 |
+
Níže vraťte pouze užitečnou odpověď a nic jiného.
|
90 |
+
Užitečná odpověď:
|
91 |
+
"""
|
92 |
+
prompt_cz = PromptTemplate(
|
93 |
+
template=prompt_template_cz, input_variables=["context", "question"]
|
94 |
+
)
|
95 |
+
print("\n Prompt ready... \n\n")
|
96 |
+
return prompt_cz
|
97 |
|
98 |
|
99 |
@app.get("/", response_class=HTMLResponse)
|
|
|
122 |
async def get_response(query: str = Form(...), language: str = Form(...)):
|
123 |
print(language)
|
124 |
if language == "czech":
|
125 |
+
prompt = prompt_cz()
|
126 |
print("\n Czech language selected....\n\n")
|
127 |
+
embedding_model = czech_embedding_model
|
128 |
persist_directory = czech_store
|
129 |
model_name = embedding_model
|
130 |
model_kwargs = {"device": "cpu"}
|
|
|
135 |
encode_kwargs=encode_kwargs,
|
136 |
)
|
137 |
else:
|
138 |
+
prompt = prompt_en()
|
139 |
print("\n English language selected....\n\n")
|
140 |
+
embedding_model = english_embedding_model # Default to English
|
141 |
persist_directory = english_store
|
142 |
embedding = OpenAIEmbeddings(
|
143 |
openai_api_key=openai_api_key,
|
|
|
145 |
)
|
146 |
|
147 |
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
|
148 |
+
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
|
149 |
|
150 |
chain_type_kwargs = {"prompt": prompt}
|
151 |
qa_chain = RetrievalQA.from_chain_type(
|
ingest.py
CHANGED
@@ -20,6 +20,8 @@ class Ingest:
|
|
20 |
english_store="stores/english_512",
|
21 |
data_czech="data/czech",
|
22 |
data_english="data/english",
|
|
|
|
|
23 |
):
|
24 |
self.openai_api_key = openai_api_key
|
25 |
self.chunk = chunk
|
@@ -28,17 +30,20 @@ class Ingest:
|
|
28 |
self.english_store = english_store
|
29 |
self.data_czech = data_czech
|
30 |
self.data_english = data_english
|
|
|
|
|
31 |
|
32 |
def ingest_english(self):
|
33 |
|
34 |
embedding = OpenAIEmbeddings(
|
35 |
openai_api_key=self.openai_api_key,
|
36 |
-
model=
|
37 |
)
|
38 |
|
39 |
loader = DirectoryLoader(
|
40 |
self.data_english,
|
41 |
show_progress=True,
|
|
|
42 |
)
|
43 |
|
44 |
documents = loader.load()
|
@@ -58,7 +63,7 @@ class Ingest:
|
|
58 |
print("\n English vector Store Created.......\n\n")
|
59 |
|
60 |
def ingest_czech(self):
|
61 |
-
embedding_model =
|
62 |
model_kwargs = {"device": "cpu"}
|
63 |
encode_kwargs = {"normalize_embeddings": False}
|
64 |
embedding = HuggingFaceEmbeddings(
|
|
|
20 |
english_store="stores/english_512",
|
21 |
data_czech="data/czech",
|
22 |
data_english="data/english",
|
23 |
+
english_embedding_model="text-embedding-3-large",
|
24 |
+
czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en",
|
25 |
):
|
26 |
self.openai_api_key = openai_api_key
|
27 |
self.chunk = chunk
|
|
|
30 |
self.english_store = english_store
|
31 |
self.data_czech = data_czech
|
32 |
self.data_english = data_english
|
33 |
+
self.english_embedding_model = english_embedding_model
|
34 |
+
self.czech_embedding_model = czech_embedding_model
|
35 |
|
36 |
def ingest_english(self):
|
37 |
|
38 |
embedding = OpenAIEmbeddings(
|
39 |
openai_api_key=self.openai_api_key,
|
40 |
+
model=self.english_embedding_model,
|
41 |
)
|
42 |
|
43 |
loader = DirectoryLoader(
|
44 |
self.data_english,
|
45 |
show_progress=True,
|
46 |
+
loader_cls=PyPDFLoader,
|
47 |
)
|
48 |
|
49 |
documents = loader.load()
|
|
|
63 |
print("\n English vector Store Created.......\n\n")
|
64 |
|
65 |
def ingest_czech(self):
|
66 |
+
embedding_model = self.czech_embedding_model
|
67 |
model_kwargs = {"device": "cpu"}
|
68 |
encode_kwargs = {"normalize_embeddings": False}
|
69 |
embedding = HuggingFaceEmbeddings(
|
static/dummy.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
dummy
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f8157971983f837eca48b97187f0e8a435eb21270cd49d831db21678670bc4a
|
3 |
+
size 1164000
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a3499aedbeb5c8ea26813ed567be6748293334099aa733c4d8cf0c4ec0ee6e3
|
3 |
+
size 100
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:612e017796cdd9eef6ba562cbe8c02e16b8c07f3fbac9f1254934f02e2261084
|
3 |
+
size 4000
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/link_lists.bin
ADDED
File without changes
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f812eacc9c05db367748cf1e0576bdcd28e0b3eaf09d5f3095a1b0e03f71cc8
|
3 |
+
size 12428000
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9882e5d786d4ca5fba4a783054685cf6e05b1637aaf586e43ec0e933e30e961d
|
3 |
+
size 100
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d49c7e9538b2cfc154773a96a1fcdbf4a4247c3b510bb68d2aa6f2b24e902fca
|
3 |
+
size 55974
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd6e73e535a8843ce30d35a4ba88436bcb5687583474e276a3b1f8689c1477bd
|
3 |
+
size 4000
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe35f087195e70122f597edc9b62da9d3ce370b40307b5556ebbe4e185fb46d4
|
3 |
+
size 8624
|
templates/index.html
CHANGED
@@ -192,9 +192,11 @@
|
|
192 |
<!-- Example Queries Section -->
|
193 |
<div id="exampleQueries" class="mb-3">
|
194 |
<h2 class="h5">Try Example Queries:</h2>
|
195 |
-
<button class="btn btn-sm btn-secondary example-query">What cable
|
196 |
<button class="btn btn-sm btn-secondary example-query">What is the minimal gauge of live wires ?</button>
|
197 |
<button class="btn btn-sm btn-secondary example-query">What flamability fequirements do plastic enclosure have to meet ?</button>
|
|
|
|
|
198 |
</div>
|
199 |
|
200 |
<div class="row">
|
|
|
192 |
<!-- Example Queries Section -->
|
193 |
<div id="exampleQueries" class="mb-3">
|
194 |
<h2 class="h5">Try Example Queries:</h2>
|
195 |
+
<button class="btn btn-sm btn-secondary example-query">What cable do I use to hang a 1.5kg heavy luminaire on?</button>
|
196 |
<button class="btn btn-sm btn-secondary example-query">What is the minimal gauge of live wires ?</button>
|
197 |
<button class="btn btn-sm btn-secondary example-query">What flamability fequirements do plastic enclosure have to meet ?</button>
|
198 |
+
<button class="btn btn-sm btn-secondary example-query">Jaké parametry musí splňovat kovový kryt živých částí ?</button>
|
199 |
+
|
200 |
</div>
|
201 |
|
202 |
<div class="row">
|