JosueElias committed
Commit d9f47da · 1 Parent(s): ce243a5

Adding Pipeline class.

Files changed (2)
  1. app.py +9 -8
  2. mi_clase.py +127 -1
app.py CHANGED
@@ -1,5 +1,6 @@
  import streamlit as st
  from mi_clase import persona
+ from mi_clase import pipeline
  st.title("Ask your scientific question!")
  expected_format = "What is color?\nA)Is a name.\nB)Is something horrible.\nC)I don't know.\nD)You should ask someone else.\nE)Ask in a physics book."
  txt = st.text_area(
@@ -19,15 +20,15 @@ try:
      q = lista[0]

      mi_dict = {
-         "question": q,
-         "option A": a,
-         "option B": b,
-         "option C": c,
-         "option D": d,
-         "option E": e
+         "prompt": q,
+         "A": a,
+         "B": b,
+         "C": c,
+         "D": d,
+         "E": e
      }
-
+     answer = pipeline.give_the_best_answer(mi_dict)
      st.write(mi_dict)
-     st.write(persona.nombre)
+     st.write(answer)
  except:
      st.error("Your question doesn't have the required format. Please, correct it.")
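The hunk above begins at q = lista[0], so the code that builds lista and the option variables a–e sits outside the diff. A minimal sketch of what that parsing could look like, assuming the text area follows expected_format with one line for the question and one per option (the names and slicing here are illustrative assumptions, not the author's code):

    # Hypothetical reconstruction, not part of the commit: split the text area
    # into non-empty lines; the first is the question, the next five the options.
    lista = [line.strip() for line in txt.split("\n") if line.strip()]
    q = lista[0]
    # Drop the "A)" ... "E)" prefixes from each option line.
    a, b, c, d, e = (line[2:] for line in lista[1:6])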
mi_clase.py CHANGED
@@ -6,4 +6,130 @@ class Persona:
      def get_nomber(self):
          return self.nombre

- persona = Persona("josue",33)
+ persona = Persona("josue",33)
+
+
+ # os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+ import torch
+ import pandas as pd
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ from transformers import AutoModelForMultipleChoice, AutoTokenizer, AutoModel
+
+ QUERY_MODEL = "/kaggle/input/bge-small-faiss/"
+ GENERATOR_MODEL = "/kaggle/input/training-model-2/model_v2"
+ DEVICE = "cpu"  # cpu or cuda
+
+ class Pipeline:
+
+     #---- init class
+
+     def __init__(self):
+         # Generator: multiple-choice model that scores each candidate answer.
+         self.model = AutoModelForMultipleChoice.from_pretrained(GENERATOR_MODEL)
+         self.tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL)
+         # Retriever: embedding model used for the FAISS nearest-neighbour search.
+         self.semModel = AutoModel.from_pretrained(QUERY_MODEL)
+         self.semTokenizer = AutoTokenizer.from_pretrained(QUERY_MODEL)
+         self.device = torch.device(DEVICE)
+
+         self.semModel.to(self.device)
+         self.model.to(self.device)
+
+     #---- utils functions
+
+     def convert_to_letter(self, a):
+         if a == 0:
+             return "A"
+         if a == 1:
+             return "B"
+         if a == 2:
+             return "C"
+         if a == 3:
+             return "D"
+         if a == 4:
+             return "E"
+
+     def filter_stopwords(self, example_sent):
+         stop_words = set(stopwords.words('english'))
+         word_tokens = word_tokenize(example_sent)
+         filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
+         return " ".join(filtered_sentence)
+
+     def cls_pooling(self, model_output):
+         return model_output.pooler_output  # alternative: model_output.last_hidden_state[:, 0]
+
+     def get_embeddings(self, text_list):
+         encoded_input = self.semTokenizer(
+             text_list, padding=True, truncation=True, return_tensors="pt", max_length=512
+         )
+         encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
+         model_output = self.semModel(**encoded_input)
+         return self.cls_pooling(model_output)
+
+     #---- retriever
+
+     def get_context_from_text(self, question):
+         # NOTE: `datasetx` (a datasets.Dataset with a FAISS index on its
+         # "embeddings" column) is not defined in this file; it must be
+         # provided elsewhere.
+         question_embedding = self.get_embeddings([question]).cpu().detach().numpy()
+         scores, samples = datasetx.get_nearest_examples(
+             "embeddings", question_embedding, k=5
+         )
+         samples_df = pd.DataFrame.from_dict(samples)
+         samples_df["scores"] = scores
+         samples_df.sort_values("scores", ascending=False, inplace=True)
+         contexts = ""
+         # aux_row = ""
+         for _, row in samples_df.iterrows():
+             contexts = contexts + f"=={row.section}== {row.text} "
+
+             # if aux_row == {row.title}:
+             #     contexts = contexts + f"=={row.section}== {row.text}"
+             # else:
+             #     contexts = contexts + f"==={row.title}=== =={row.section}== {row.text}"
+             #     aux_row = {row.title}
+         return contexts
+
+     #---- generator
+
+     # Input layout: [CLS] context #### question? [SEP] answer [SEP]
+     def create_tokens(self, question_and_options, context):
+         question = question_and_options["prompt"]
+         candidate1 = "#### " + question + " [SEP] " + question_and_options["A"] + " [SEP]"
+         candidate2 = "#### " + question + " [SEP] " + question_and_options["B"] + " [SEP]"
+         candidate3 = "#### " + question + " [SEP] " + question_and_options["C"] + " [SEP]"
+         candidate4 = "#### " + question + " [SEP] " + question_and_options["D"] + " [SEP]"
+         candidate5 = "#### " + question + " [SEP] " + question_and_options["E"] + " [SEP]"
+         prompt = "[CLS]" + context
+
+         inputs = self.tokenizer([
+             [prompt, candidate1],
+             [prompt, candidate2],
+             [prompt, candidate3],
+             [prompt, candidate4],
+             [prompt, candidate5]
+         ], return_tensors="pt", padding=True, truncation="only_first", max_length=512, add_special_tokens=False)
+         # Dummy label; only the logits are used downstream.
+         labels = torch.tensor(0).unsqueeze(0)
+         return (inputs, labels)
+
+     def infer_answer(self, mi_tupla):
+         (inputs, labels) = mi_tupla
+
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+         labels = labels.to(self.device)
+
+         # unsqueeze(0) adds the batch dimension the multiple-choice head expects.
+         outputs = self.model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
+         logits = outputs.logits
+         _, topk_indices = torch.topk(logits, k=3, dim=1)
+         # predicted_class = logits.argmax().item()
+         return topk_indices
+
+     #---- retriever + generator
+
+     def give_the_best_answer(self, dict_with_all_the_info):
+         a = self.get_context_from_text(dict_with_all_the_info["prompt"])
+         b = self.create_tokens(dict_with_all_the_info, a)
+         c = self.infer_answer(b)
+         d = self.convert_to_letter(int(c[0][0]))
+         return d
+
+ pipeline = Pipeline()
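
Note on the undefined retrieval index: get_context_from_text calls datasetx.get_nearest_examples, but the commit never creates datasetx. A minimal sketch of the setup this code seems to assume, using the datasets library's FAISS support (the on-disk path is an assumption, not from the commit):

    # Hypothetical setup for `datasetx`, not part of the commit: a datasets.Dataset
    # whose "embeddings" column is FAISS-indexed so get_nearest_examples works.
    from datasets import load_from_disk

    datasetx = load_from_disk("/kaggle/input/bge-small-faiss/dataset")  # assumed path
    datasetx.add_faiss_index(column="embeddings")

With that in place, app.py's pipeline.give_the_best_answer(mi_dict) retrieves the five nearest passages as context, scores each option with the multiple-choice model, and returns the top-ranked letter.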