pritamdeka committed on
Commit 16ebd70 · 1 Parent(s): 408061c

Update app.py

Files changed (1):
  1. app.py +55 -10
app.py CHANGED
@@ -55,16 +55,20 @@ all_stopwords = sp.Defaults.stop_words



-
-
-
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

+ examples = [
+     ["https://www.medicalnewstoday.com/articles/alzheimers-addressing-sleep-disturbance-may-alleviate-symptoms"],
+     ["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant"],
+     ["https://www.cancer.news/2022-02-04-doctors-testifying-covid-vaccines-causing-cancer-aids.html#",
+      "https://www.cancer.news/2021-12-22-mrna-vaccines-weaken-immune-system-cause-cancer.html"]
+ ]


- def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_3):
+ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_3, max_retrieved, model_4):
+
    word_embedding_model = models.Transformer(model_3)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
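For readers following the hunk above: the `models.Transformer` / `models.Pooling` pair at the top of `keyphrase_generator` is the standard sentence-transformers recipe for turning a plain Hugging Face checkpoint into a sentence encoder. A minimal self-contained sketch of that pattern (the SapBERT checkpoint name is taken from the app's dropdown; the example sentences are made up):

```python
from sentence_transformers import SentenceTransformer, models

# Wrap a plain transformer checkpoint in a mean-pooling sentence encoder,
# as the diff does with the user-selected model_3.
word_embedding_model = models.Transformer('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True)
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

embeddings = embedder.encode(["amyloid plaques", "sleep disturbance"])
```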
@@ -87,6 +91,7 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
    final_list=[]
    score_list=[]
    sum_list=[]
+     ########## Here we first extract the sentences using SBERT and TextRank ##########
    model_1 = SentenceTransformer(model_1)
    model_2 = SentenceTransformer(model_2)
    url = article_link
@@ -131,6 +136,10 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
    x=sorted(((sum_list[i],s) for i,s in enumerate(corpus)), reverse=True)
    for elem in x:
        final_textrank_list.append(elem[1])
+
+     ########## TextRank ends ##########
+
+     ########## From here we start the keyphrase extraction process ##########

    a=int((10*len(final_textrank_list))/100.0)
    if(a<5):
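The "TextRank ends" marker closes the sentence-ranking stage; the scoring that fills `sum_list` sits outside this hunk. As a rough sketch of the general pattern only (PageRank over a similarity graph is the textbook TextRank formulation, not necessarily the app's exact code, and the model and sentences here are placeholders):

```python
import networkx as nx
import numpy as np
from sentence_transformers import SentenceTransformer

# Rank sentences by PageRank over a cosine-similarity graph (TextRank-style).
model = SentenceTransformer('all-MiniLM-L6-v2')   # placeholder checkpoint
corpus = ["Poor sleep worsens memory.",
          "Mice were fed a control diet.",
          "Sleep quality predicts cognitive decline."]
emb = model.encode(corpus)
emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
sim = emb @ emb.T
np.fill_diagonal(sim, 0.0)
scores = nx.pagerank(nx.from_numpy_array(sim))
final_textrank_list = [corpus[i] for i, _ in
                       sorted(scores.items(), key=lambda kv: kv[1], reverse=True)]
```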
@@ -154,6 +163,11 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
    top_n = max_num_keywords
    keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    keywords = '\n'.join(keyword_list)
+
+     ########## Keyphrase extraction ends ##########
+
+
+     ########## From here we start the clustering and query generation ##########

    c_len=(len(keyword_list))
    keyword_embeddings = embedder.encode(keyword_list)
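For context on the `distances.argsort()[0][-top_n:]` line in this hunk: it is KeyBERT-style selection, keeping the candidate phrases whose embeddings lie closest to the document embedding. A hedged, standalone version (document, candidates, and model are placeholders):

```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

embedder = SentenceTransformer('all-MiniLM-L6-v2')   # placeholder checkpoint
doc = "Sleep disturbance may accelerate Alzheimer's progression."
candidates = ["sleep disturbance", "mouse model", "alzheimer's progression"]

doc_embedding = embedder.encode([doc])
candidate_embeddings = embedder.encode(candidates)
distances = cosine_similarity(doc_embedding, candidate_embeddings)
top_n = 2
# argsort is ascending, so the last top_n indices are the closest candidates.
keyword_list = [candidates[i] for i in distances.argsort()[0][-top_n:]]
```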
@@ -196,6 +210,9 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
    f_1=' OR '.join(f1_list)
    final_list.append(f_1)

+     ########## Query generation ends here ##########
+
+     ########## PubMed abstract extraction starts here ##########

    ncbi_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

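The query-generation stage that this hunk brackets is described in the app's footer: K-means over SapBERT keyphrase embeddings, with each cluster OR-joined into a PubMed boolean sub-query. A minimal sketch of that idea (cluster count and phrases are illustrative, not the app's exact logic):

```python
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')
keyword_list = ["sleep disturbance", "insomnia", "amyloid beta", "tau protein"]
keyword_embeddings = embedder.encode(keyword_list)

kmeans = KMeans(n_clusters=2, random_state=0, n_init=10).fit(keyword_embeddings)
clusters = {}
for phrase, label in zip(keyword_list, kmeans.labels_):
    clusters.setdefault(label, []).append(phrase)
# One OR-joined sub-query per cluster, e.g. "sleep disturbance OR insomnia".
final_list = [' OR '.join(phrases) for phrases in clusters.values()]
```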
 
@@ -215,7 +232,7 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
    search_id='&id='+all_search_ids
    ret_type='&rettype=text'
    ret_mode='&retmode=xml'
- ret_max='&retmax=10'
+ ret_max='&retmax=500'
    ret_sort='&sort=relevance'
    return_url=ncbi_url+fetch_url+search_id+ret_type+ret_mode+ret_max+ret_sort
    pubmed_abstract_request = requests.get(return_url)
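The retmax change is what enables the new re-ranking step: efetch now returns up to 500 records instead of 10, and the similarity filter added later trims them to the slider's 5-15. For reference, the assembled E-utilities request has roughly this shape (the PMIDs are placeholders, and the value of `fetch_url` is defined outside this hunk, so it is assumed here):

```python
import requests

ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
fetch_url = 'efetch.fcgi?db=pubmed'       # assumed; defined elsewhere in app.py
all_search_ids = '12345678,23456789'      # placeholder PMIDs
return_url = (ncbi_url + fetch_url + '&id=' + all_search_ids +
              '&rettype=text&retmode=xml&retmax=500&sort=relevance')
pubmed_abstract_request = requests.get(return_url)
```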
@@ -228,12 +245,33 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
    for b in article_abstract:
        article_abstract_name = b.text
        abstracts_list.append(article_abstract_name)
- mydict = {'Title': titles_list, 'Abstract':abstracts_list}
-

- df_new = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in mydict.items() ]))
+     ########## PubMed extraction ends here ##########
+
+     ########## Selecting the most relevant abstracts for the news article heading starts here ##########
+
+     first_article = Article(url, language='en')
+     first_article.download()
+     first_article.parse()
+     article_heading=(first_article.title)
+     article_heading=sent_tokenize(article_heading)
+     model_4 = SentenceTransformer(model_4)
+
+     my_dict = dict(zip(titles_list,abstracts_list))
+     title_embeddings = model_4.encode(titles_list)
+     heading_embedding = model_4.encode(article_heading)
+     similarities = cosine_similarity(heading_embedding, title_embeddings)
+     max_n = max_retrieved
+     sorted_titles = [titles_list[index] for index in similarities.argsort()[0][-max_n:]]
+     sorted_abstract_list=[]
+     for list_elem in sorted_titles:
+         sorted_abstract_list.append(my_dict[list_elem])
+     sorted_dict = {'Title': sorted_titles, 'Abstract': sorted_abstract_list}
+     df_new=pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in sorted_dict.items() ]))
+
    df_final = df_new.fillna(' ')

+     ########## Ends here ##########

    return df_final

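This is the substantive new feature of the commit: retrieved titles are embedded with the user-selected SBERT model (model_4) and re-ranked by cosine similarity to the news article's own headline, keeping only the top max_retrieved. Distilled into a standalone sketch (headline and titles are made up):

```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model_4 = SentenceTransformer('pritamdeka/S-Biomed-Roberta-snli-multinli-stsb')
article_heading = ["Addressing sleep disturbance may alleviate Alzheimer's symptoms"]
titles_list = ["Sleep and dementia risk",                 # placeholder titles
               "Crop yields under drought stress",
               "Insomnia treatment in Alzheimer's disease"]

similarities = cosine_similarity(model_4.encode(article_heading),
                                 model_4.encode(titles_list))
max_retrieved = 2
sorted_titles = [titles_list[i] for i in similarities.argsort()[0][-max_retrieved:]]
```

Note that the argsort slice returns titles in ascending similarity order, so the best match lands last; the DataFrame built in the diff inherits that order.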
 
@@ -276,10 +314,17 @@ igen_pubmed = gr.Interface(keyphrase_generator,
                           'cambridgeltl/SapBERT-from-PubMedBERT-fulltext-mean-token'],
                  type="value",
                  default='cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
- label="Select any SapBERT model for clustering from the list below")],
+ label="Select any SapBERT model for clustering from the list below"),
+ gr.inputs.Slider(minimum=5, maximum=15, step=1, default=10, label="PubMed Max Abstracts"),
+ gr.inputs.Dropdown(choices=['pritamdeka/S-Bluebert-snli-multinli-stsb',
+                             'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb'],
+                    type="value",
+                    default='pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
+                    label="Select any SBERT model for abstracts from the list below")],
    outputs=gr.outputs.Dataframe(type="auto", label="Retrieved Results from PubMed",max_cols=None, overflow_row_behaviour="paginate"),
    theme="dark-peach",
- title="PubMed Abstract Retriever", description="Retrieves relevant PubMed abstracts for an online article which can be used as further references.",
+ title="PubMed Abstract Retriever", description="Retrieves relevant PubMed abstracts for an online article which can be used as further references. Please note that it may take some time for the models to load.",
+ examples=examples,
    article= "This work is based on the paper <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>provided here</a>."
             "\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
             "\t The application then uses a UMLS based BERT model, <a href=https://arxiv.org/abs/2010.11784>SapBERT</a> to cluster the keyphrases using K-means clustering method and finally create a boolean query. After that the top 10 titles and abstracts are retrieved from PubMed database and displayed according to relevancy. The SapBERT models can be changed as per the list provided. "
 