pritamdeka committed on
Commit
1f561dd
·
1 Parent(s): 7a12c31

Create app.py

Files changed (1)
  1. app.py +242 -0
app.py ADDED
@@ -0,0 +1,242 @@
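+ """Gradio app: extracts keyphrases from a scientific article (TextRank over SBERT
+ sentence embeddings plus scispaCy NER), clusters them, builds a boolean PubMed
+ query from the clusters, and returns the titles of the retrieved PubMed articles."""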
+ import itertools
+ import json
+
+ import gradio as gr
+ import inflect
+ import networkx as nx
+ import nltk
+ import numpy as np
+ import pandas as pd
+ import requests
+ import spacy
+ from Bio import Entrez
+ from newspaper import fulltext
+ from nltk.corpus import stopwords
+ from nltk.tokenize import sent_tokenize
+ from sentence_transformers import SentenceTransformer, models
+ from sklearn.cluster import KMeans
+ from sklearn.metrics import silhouette_score
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ # Tokenizer data and stop words (no-ops if already downloaded)
+ nltk.download('punkt', quiet=True)
+ nltk.download('stopwords', quiet=True)
+ stop_words = stopwords.words('english')
+
+ # scispaCy pipeline used for biomedical NER; the exact model name here is an
+ # assumption -- any installed scispaCy model (e.g. en_core_sci_lg) can be used
+ nlp = spacy.load('en_core_sci_lg')
+ all_stopwords = nlp.Defaults.stop_words
+
+ # inflect engine used to drop plural entity mentions from the candidate list
+ p = inflect.engine()
+
+ # SapBERT sentence embedder (mean pooling) used later to cluster the keyphrases
+ word_embedding_model = models.Transformer('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')
+ pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
+                                pooling_mode_mean_tokens=True,
+                                pooling_mode_cls_token=False,
+                                pooling_mode_max_tokens=False)
+
+ embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+
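+ # PubMed access via Biopython's Entrez: search() returns the five most relevant
+ # PMIDs for a query, fetch_details() retrieves the full records for those PMIDs.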
+ def search(query):
+     Entrez.email = '[email protected]'
+     handle = Entrez.esearch(db='pubmed',
+                             sort='relevance',
+                             retmax='5',
+                             retmode='xml',
+                             term=query)
+     results = Entrez.read(handle)
+     return results
+
+ def fetch_details(id_list):
+     ids = ','.join(id_list)
+     Entrez.email = '[email protected]'
+
+     handle_1 = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
+     results_1 = Entrez.read(handle_1)
+     return results_1
+
+
+ def remove_stopwords(sen):
+     sen_new = " ".join([i for i in sen if i not in stop_words])
+     return sen_new
+
+
+
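+ # End-to-end pipeline: download the article, rank its sentences with TextRank over
+ # SBERT embeddings (boosted by "conclusion" indicator phrases), extract candidate
+ # entities from the top sentences with scispaCy, keep the candidates closest to the
+ # document embedding as keyphrases, cluster them, turn the clusters into a boolean
+ # PubMed query, and return the titles of the articles that query retrieves.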
+ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
+     element=[]
+     final_textrank_list=[]
+     document=[]
+     text_doc=[]
+     final_list=[]
+     score_list=[]
+     sum_list=[]
+     # accumulators used further below, initialised here so the function is self-contained
+     silhouette_score_list=[]
+     cluster_list_final=[]
+     comb=[]
+     comb_list=[]
+     model_1 = SentenceTransformer(model_1)
+     model_2 = SentenceTransformer(model_2)
+     url = article_link
+     if (url == False):
+         print("error")
+     html = requests.get(url).text
+     article = fulltext(html)
+     corpus=sent_tokenize(article)
+     # phrases that typically introduce a conclusion; sentences containing one get a score boost
+     indicator_list=['concluded','concludes','in a study','concluding','conclude','in sum','in a recent study','therefore','thus','so','hence',
+                     'as a result','accordingly','consequently','in short','proves that','shows that','suggests that','demonstrates that','found that','observed that',
+                     'indicated that','suggested that','demonstrated that']
+     count_dict={}
+     for l in corpus:
+         c=0
+         for l2 in indicator_list:
+             if l.find(l2)!=-1:  # the indicator phrase is a substring of the sentence
+                 c=1
+                 break
+         if c:
+             count_dict[l]=1
+         else:
+             count_dict[l]=0
+     for sent, score in count_dict.items():
+         score_list.append(score)
+     # keep letters only (regex=True keeps this working on newer pandas)
+     clean_sentences_new = pd.Series(corpus).str.replace("[^a-zA-Z]", " ", regex=True).tolist()
+     corpus_embeddings = model_1.encode(clean_sentences_new)
+     # pairwise cosine-similarity matrix between sentence embeddings
+     sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
+     for i in range(len(clean_sentences_new)):
+         len_embeddings=(len(corpus_embeddings[i]))
+         for j in range(len(clean_sentences_new)):
+             if i != j:
+                 if(len_embeddings == 1024):
+                     sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1,1024), corpus_embeddings[j].reshape(1,1024))[0,0]
+                 elif(len_embeddings == 768):
+                     sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1,768), corpus_embeddings[j].reshape(1,768))[0,0]
+     nx_graph = nx.from_numpy_array(sim_mat)
+     scores = nx.pagerank(nx_graph)
+     sentences=((scores[i],s) for i,s in enumerate(corpus))
+     for elem in sentences:
+         element.append(elem[0])
+     # combine the indicator score and the PageRank score for each sentence
+     for sc, lst in zip(score_list, element):
+         sum1=sc+lst
+         sum_list.append(sum1)
+     x=sorted(((sum_list[i],s) for i,s in enumerate(corpus)), reverse=True)
+     for elem in x:
+         final_textrank_list.append(elem[1])
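+
+     # keep the top ~10% of the ranked sentences (at least 5) and run scispaCy NER
+     # over them to collect candidate keyphrases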
+     a=int((10*len(final_textrank_list))/100.0)
+     if(a<5):
+         total=5
+     else:
+         total=int(a)
+     for i in range(total):
+         document.append(final_textrank_list[i])
+     doc=" ".join(document)
+     for i in document:
+         doc_1=nlp(i)
+         text_doc.append([X.text for X in doc_1.ents])
+     entity_list = [item for sublist in text_doc for item in sublist]
+     entity_list = [word for word in entity_list if not word in all_stopwords]
+     entity_list = [word_entity for word_entity in entity_list if(p.singular_noun(word_entity) == False)]
+     entity_list=list(dict.fromkeys(entity_list))
+     # keep the candidate entities closest to the document embedding as keyphrases
+     doc_embedding = model_2.encode([doc])
+     candidates=entity_list
+     candidate_embeddings = model_2.encode(candidates)
+     distances = cosine_similarity(doc_embedding, candidate_embeddings)
+     top_n = max_num_keywords
+     keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
+     keywords = '\n'.join(keyword_list)
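+
+     # cluster the keyphrases with the SapBERT embedder and KMeans, trying every
+     # cluster count from 1 to top_n-1 and keeping the clustering with the best
+     # silhouette score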
+     c_len=(len(keyword_list))
+     keyword_embeddings = embedder.encode(keyword_list)
+     data_embeddings = embedder.encode(keyword_list)
+
+     for num_clusters in range(1, top_n):
+         clustering_model = KMeans(n_clusters=num_clusters)
+         clustering_model.fit(keyword_embeddings)
+         cluster_assignment = clustering_model.labels_
+         clustered_sentences = [[] for i in range(num_clusters)]
+         for sentence_id, cluster_id in enumerate(cluster_assignment):
+             clustered_sentences[cluster_id].append(keyword_list[sentence_id])
+         cl_sent_len=(len(clustered_sentences))
+         list_cluster=list(clustered_sentences)
+         a=len(list_cluster)
+         cluster_list_final.append(list_cluster)
+         # silhouette is undefined when there is a single cluster or when every
+         # keyphrase is its own cluster, so those cases get fixed scores
+         if (c_len==cl_sent_len and c_len>=3) or cl_sent_len==1:
+             silhouette_avg = 0
+             silhouette_score_list.append(silhouette_avg)
+         elif c_len==cl_sent_len==2:
+             silhouette_avg = 1
+             silhouette_score_list.append(silhouette_avg)
+         else:
+             silhouette_avg = silhouette_score(keyword_embeddings, cluster_assignment)
+             silhouette_score_list.append(silhouette_avg)
+     res_dict = dict(zip(silhouette_score_list, cluster_list_final))
+     cluster_items=res_dict[max(res_dict)]
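+
+     # build a boolean PubMed query from the chosen clusters: OR within a cluster,
+     # AND across each pair of clusters, OR across all the pairwise combinations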
+     for i in cluster_items:
+         z=' OR '.join(i)
+         comb.append("("+z+")")
+     comb_list.append(comb)
+     combinations = []
+     for subset in itertools.combinations(comb, 2):
+         combinations.append(subset)
+     f1_list=[]
+     for s in combinations:
+         final = ' AND '.join(s)
+         f1_list.append("("+final+")")
+     f_1=' OR '.join(f1_list)
+     final_list.append(f_1)
+
+
+     # query PubMed with the generated boolean query and collect title, abstract,
+     # journal and year for each returned article
+     #if __name__ == '__main__':
+     #for qu in range(len(final_list)):
+     results=search(f_1)
+     id_list = results['IdList']
+     #if(id_list != []):
+     papers = fetch_details(id_list)
+     abstract_list=[]
+     year_list=[]
+     journal_list=[]
+     title_list=[]
+     for i, paper in enumerate(papers['PubmedArticle']):
+         x=(json.dumps(papers['PubmedArticle'][i], indent=2))
+         t_list=[]
+         # defaults so a record with missing fields cannot raise a NameError in the
+         # except branch below
+         value_2=[]
+         value_journal=[]
+         y = json.loads(x)
+         try:
+             value_1 = y['MedlineCitation']['Article']['Abstract']['AbstractText']
+             value = (y['MedlineCitation']['Article']['ArticleTitle'])
+             value_2 = (y['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
+             value_journal = (y['MedlineCitation']['Article']['Journal']['Title'])
+             t_list.append(value)
+             title_list.append(t_list)
+             year_list.append(value_2)
+             abstract_list.append(value_1)
+             journal_list.append(value_journal)
+         except KeyError:
+             value_1 = []
+             title_list.append(t_list)
+             abstract_list.append(value_1)
+             year_list.append(value_2)
+             journal_list.append(value_journal)
+     mydict={'Title': title_list, 'Abstract': abstract_list, 'Journal Title': journal_list, 'Year': year_list}
+     df_new=pd.DataFrame(mydict)
+     #print(df_new)
+     #else:
+     #    abstract_list=[]
+     #    title_list=[]
+     #    year_list=[]
+     #    journal_list=[]
+     #    a=["No result"]
+     #    b=["No results"]
+     #    abstract_list.append(a)
+     #    title_list.append(b)
+     #    mydict={'Title': title_list, 'Abstract': abstract_list, 'Journal Title': journal_list, 'Year': year_list}
+     #    df_new=pd.DataFrame(mydict)
+     #print(df_new)
+     return title_list
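+
+ # Gradio UI: the user supplies an article URL, one SBERT model for sentence ranking,
+ # one SBERT model for keyphrase selection, and the maximum number of keyphrases;
+ # the output box shows the titles returned by the generated PubMed query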
+ gr.Interface(keyphrase_generator,
+              inputs=[gr.inputs.Textbox(lines=1, placeholder="Provide article web link here", default="", label="Article web link"),
+                      gr.inputs.Dropdown(choices=['sentence-transformers/all-mpnet-base-v2',
+                                                  'sentence-transformers/all-mpnet-base-v1',
+                                                  'sentence-transformers/all-distilroberta-v1',
+                                                  'sentence-transformers/gtr-t5-large',
+                                                  'pritamdeka/S-Bluebert-snli-multinli-stsb',
+                                                  'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
+                                                  'sentence-transformers/stsb-mpnet-base-v2',
+                                                  'sentence-transformers/stsb-roberta-base-v2',
+                                                  'sentence-transformers/stsb-distilroberta-base-v2',
+                                                  'sentence-transformers/sentence-t5-large',
+                                                  'sentence-transformers/sentence-t5-base'],
+                                         type="value",
+                                         default='sentence-transformers/all-mpnet-base-v1',
+                                         label="Select any SBERT model for TextRank from the list below"),
+                      gr.inputs.Dropdown(choices=['sentence-transformers/paraphrase-mpnet-base-v2',
+                                                  'sentence-transformers/all-mpnet-base-v1',
+                                                  'sentence-transformers/paraphrase-distilroberta-base-v1',
+                                                  'sentence-transformers/paraphrase-xlm-r-multilingual-v1',
+                                                  'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
+                                                  'sentence-transformers/paraphrase-albert-small-v2',
+                                                  'sentence-transformers/paraphrase-albert-base-v2',
+                                                  'sentence-transformers/paraphrase-MiniLM-L12-v2',
+                                                  'sentence-transformers/paraphrase-MiniLM-L6-v2',
+                                                  'sentence-transformers/all-MiniLM-L12-v2',
+                                                  'sentence-transformers/all-distilroberta-v1',
+                                                  'sentence-transformers/paraphrase-TinyBERT-L6-v2',
+                                                  'sentence-transformers/paraphrase-MiniLM-L3-v2',
+                                                  'sentence-transformers/all-MiniLM-L6-v2'],
+                                         type="value",
+                                         default='sentence-transformers/all-mpnet-base-v1',
+                                         label="Select any SBERT model for keyphrases from the list below"),
+                      gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")],
+              outputs=gr.outputs.Textbox(type="auto", label="Stuff"),
+              theme="peach",
+              title="Scientific Article Keyphrase Generator",
+              description="Generates the keyphrases from an article that best describe the article.",
+              article="The work is based on part of the paper <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>provided here</a>."
+                      "\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
+                      "\t The list of SBERT models required in the dropdowns can be found in the <a href=https://www.sbert.net/docs/pretrained_models.html>SBERT Pre-trained models hub</a>."
+                      "\t Default model names are provided and can be changed to any of the listed pretrained models."
+                      "\t The number of output keyphrases can be changed; the default is 10, the minimum 5 and the maximum 30.").launch(share=True, debug=True)