Commit d9f47da
Parent(s): ce243a5

Adding Pipeline class.

Files changed:
- app.py (+9, -8)
- mi_clase.py (+127, -1)
app.py CHANGED
@@ -1,5 +1,6 @@
 import streamlit as st
 from mi_clase import persona
+from mi_clase import pipeline
 st.title("Ask your scientific question!")
 expected_format = "What is color?\nA)Is a name.\nB)Is something horrible.\nC)I don't know.\nD)You should ask someone else.\nE)Ask in a physics book."
 txt = st.text_area(
@@ -19,15 +20,15 @@ try:
     q = lista[0]
 
     mi_dict = {
-        "
-        "
-        "
-        "
-        "
-        "
+        "prompt": q,
+        "A": a,
+        "B": b,
+        "C": c,
+        "D": d,
+        "E": e
     }
-
+    answer = pipeline.give_the_best_answer(mi_dict)
     st.write(mi_dict)
-    st.write(
+    st.write(answer)
 except:
     st.error("Your question doesn't have the required format. Please, correct it.")
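The hunk above uses lista and the option variables a-e, which are built in app.py lines 6-19, outside what this diff shows. A minimal sketch of what that parsing plausibly looks like, given the expected_format string; only the names txt, lista, q, and a-e come from the visible code, and the splitting logic itself is an assumption:

# Hypothetical reconstruction of app.py lines 6-19 (not shown in this diff):
# split the text area into the question line and the five option lines.
txt = ("What is color?\n"
       "A)Is a name.\n"
       "B)Is something horrible.\n"
       "C)I don't know.\n"
       "D)You should ask someone else.\n"
       "E)Ask in a physics book.")

lista = [line.strip() for line in txt.split("\n") if line.strip()]
q = lista[0]                                        # the question line
a, b, c, d, e = (line[2:] for line in lista[1:6])   # text after "A)" ... "E)"

Note that the bare except: catches everything raised inside the try block, so a failure in pipeline.give_the_best_answer (for example, a missing model file) shows the same "required format" error as a genuinely malformed question.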
mi_clase.py CHANGED
@@ -6,4 +6,130 @@ class Persona:
     def get_nomber(self):
         return self.nombre
 
-persona = Persona("josue",33)
+persona = Persona("josue",33)
10 |
+
|
11 |
+
|
12 |
+
# os.environ['CUDA_VISIBLE_DEVICES'] ='0'
|
13 |
+
import torch
|
14 |
+
from transformers import AutoModelForMultipleChoice
|
15 |
+
from transformers import AutoTokenizer
|
16 |
+
from nltk.corpus import stopwords
|
17 |
+
from nltk.tokenize import word_tokenize
|
18 |
+
from transformers import AutoTokenizer, AutoModel
|
19 |
+
|
20 |
+
QUERY_MODEL = "/kaggle/input/bge-small-faiss/"
|
21 |
+
GENERATOR_MODEL="/kaggle/input/training-model-2/model_v2"
|
22 |
+
DEVICE = "cpu" # cpu or cuda
|
23 |
+
|
24 |
+
class Pipeline:
|
25 |
+
|
26 |
+
#---- init class
|
27 |
+
|
28 |
+
def __init__(self):
|
29 |
+
self.model = AutoModelForMultipleChoice.from_pretrained(GENERATOR_MODEL)
|
30 |
+
self.tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL)
|
31 |
+
self.semModel = AutoModel.from_pretrained(QUERY_MODEL)
|
32 |
+
self.semTokenizer = AutoTokenizer.from_pretrained(QUERY_MODEL)
|
33 |
+
self.device = torch.device(DEVICE)
|
34 |
+
|
35 |
+
self.semModel.to(self.device)
|
36 |
+
self.model.to(self.device)
|
37 |
+
|
38 |
+
#---- utils functions
|
39 |
+
|
40 |
+
def convert_to_letter(self,a):
|
41 |
+
if a == 0:
|
42 |
+
return "A"
|
43 |
+
if a==1:
|
44 |
+
return "B"
|
45 |
+
if a==2:
|
46 |
+
return "C"
|
47 |
+
if a==3:
|
48 |
+
return "D"
|
49 |
+
if a==4:
|
50 |
+
return "E"
|
51 |
+
|
52 |
+
def filter_stopwords(self,example_sent):
|
53 |
+
stop_words = set(stopwords.words('english'))
|
54 |
+
word_tokens = word_tokenize(example_sent)
|
55 |
+
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
|
56 |
+
return " ".join(filtered_sentence)
|
57 |
+
|
58 |
+
def cls_pooling(self,model_output):
|
59 |
+
return model_output.pooler_output#last_hidden_state[:, 0]
|
60 |
+
|
61 |
+
def get_embeddings(self,text_list):
|
62 |
+
encoded_input = self.semTokenizer(
|
63 |
+
text_list, padding=True, truncation=True, return_tensors="pt",max_length =512
|
64 |
+
)
|
65 |
+
encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
|
66 |
+
model_output = self.semModel(**encoded_input)
|
67 |
+
return self.cls_pooling(model_output)
|
68 |
+
|
69 |
+
#---- retriever
|
70 |
+
|
71 |
+
def get_context_from_text(self,question):
|
72 |
+
question_embedding = self.get_embeddings([question]).cpu().detach().numpy()
|
73 |
+
scores, samples = datasetx.get_nearest_examples(
|
74 |
+
"embeddings", question_embedding, k=5
|
75 |
+
)
|
76 |
+
samples_df = pd.DataFrame.from_dict(samples)
|
77 |
+
samples_df["scores"] = scores
|
78 |
+
samples_df.sort_values("scores", ascending=False, inplace=True)
|
79 |
+
contexts = ""
|
80 |
+
# aux_row = ""
|
81 |
+
for _, row in samples_df.iterrows():
|
82 |
+
contexts = contexts + f"=={row.section}== {row.text} "
|
83 |
+
|
84 |
+
# if aux_row =={row.title}:
|
85 |
+
# contexts = contexts + f"=={row.section}== {row.text}"
|
86 |
+
# else:
|
87 |
+
# contexts = contexts + f"==={row.title}=== =={row.section}== {row.text}"
|
88 |
+
# aux_row = {row.title}
|
89 |
+
return contexts
|
90 |
+
|
91 |
+
#---- generator
|
92 |
+
|
93 |
+
# [CLS] context #### question? [SEP] answer [SEP]
|
94 |
+
def create_tokens(self,quetion_and_options,context):
|
95 |
+
question = quetion_and_options["prompt"]
|
96 |
+
candidate1 = "#### "+question + " [SEP] "+quetion_and_options["A"]+ " [SEP]"
|
97 |
+
candidate2 = "#### "+question + " [SEP] "+quetion_and_options["B"]+ " [SEP]"
|
98 |
+
candidate3 = "#### "+question + " [SEP] "+quetion_and_options["C"]+ " [SEP]"
|
99 |
+
candidate4 = "#### "+question + " [SEP] "+quetion_and_options["D"]+ " [SEP]"
|
100 |
+
candidate5 = "#### "+question + " [SEP] "+quetion_and_options["E"]+ " [SEP]"
|
101 |
+
prompt = "[CLS]"+ context
|
102 |
+
|
103 |
+
inputs = self.tokenizer([
|
104 |
+
[prompt, candidate1],
|
105 |
+
[prompt, candidate2],
|
106 |
+
[prompt, candidate3],
|
107 |
+
[prompt, candidate4],
|
108 |
+
[prompt, candidate5]
|
109 |
+
], return_tensors="pt", padding=True,truncation="only_first",max_length =512,add_special_tokens=False)
|
110 |
+
labels = torch.tensor(0).unsqueeze(0)
|
111 |
+
return (inputs,labels)
|
112 |
+
|
113 |
+
def infer_answer(self,mi_tupla):
|
114 |
+
(inputs,labels) = mi_tupla
|
115 |
+
|
116 |
+
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
117 |
+
labels = labels.to(self.device)
|
118 |
+
|
119 |
+
outputs = self.model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
|
120 |
+
logits = outputs.logits
|
121 |
+
_, topk_indices = torch.topk(logits, k=3, dim=1)
|
122 |
+
#predicted_class = logits.argmax().item()
|
123 |
+
return topk_indices
|
124 |
+
|
125 |
+
#---- retriever + generator
|
126 |
+
|
127 |
+
def give_the_best_answer(self,dict_with_all_the_info):
|
128 |
+
a = self.get_context_from_text(my_dict["prompt"])
|
129 |
+
b = self.create_tokens(my_dict,a)
|
130 |
+
c = self.infer_answer(b)
|
131 |
+
d = self.convert_to_letter(int(c[0][0]))
|
132 |
+
#print("\nThe answer is ",)
|
133 |
+
return d
|
134 |
+
|
135 |
+
pipeline = Pipeline()
|
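Two gaps need filling before this commit runs. First, get_context_from_text queries a datasetx object that mi_clase.py never defines, so the first question would raise a NameError. The bge-small-faiss input path suggests a FAISS-indexed datasets.Dataset; a sketch of the missing setup, with the passage file name purely hypothetical:

# Hypothetical setup for datasetx (undefined in the commit). Assumes a
# passages file with an 'embeddings' column produced by the same bge-small
# encoder that Pipeline.get_embeddings uses.
from datasets import load_dataset

datasetx = load_dataset("parquet", data_files="passages.parquet")["train"]
datasetx.add_faiss_index(column="embeddings")

Second, both model paths point at Kaggle datasets (/kaggle/input/...), which do not exist inside the Space; they would need to be replaced with files bundled in the repo or downloaded at startup. Assuming both are fixed, a round trip matching the mi_dict that app.py builds:

# Usage sketch; the returned letter depends entirely on the trained
# checkpoint and the indexed passages, so "E" is only illustrative.
from mi_clase import pipeline

mi_dict = {
    "prompt": "What is color?",
    "A": "Is a name.",
    "B": "Is something horrible.",
    "C": "I don't know.",
    "D": "You should ask someone else.",
    "E": "Ask in a physics book.",
}
print(pipeline.give_the_best_answer(mi_dict))  # e.g. "E"

Note also that pipeline = Pipeline() runs at module import, so app.py's from mi_clase import pipeline loads both models when the Space starts rather than on the first question.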