Spaces:
Runtime error
Runtime error
pritamdeka
committed on
Commit
·
16ebd70
1
Parent(s):
408061c
Update app.py
Browse files
app.py
CHANGED
@@ -55,16 +55,20 @@ all_stopwords = sp.Defaults.stop_words
|
|
55 |
|
56 |
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
def remove_stopwords(sen):
|
62 |
sen_new = " ".join([i for i in sen if i not in stop_words])
|
63 |
return sen_new
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
|
67 |
-
def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_3):
|
|
|
68 |
word_embedding_model = models.Transformer(model_3)
|
69 |
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
|
70 |
pooling_mode_mean_tokens=True,
|
@@ -87,6 +91,7 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
|
|
87 |
final_list=[]
|
88 |
score_list=[]
|
89 |
sum_list=[]
|
|
|
90 |
model_1 = SentenceTransformer(model_1)
|
91 |
model_2 = SentenceTransformer(model_2)
|
92 |
url = article_link
|
@@ -131,6 +136,10 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
|
|
131 |
x=sorted(((sum_list[i],s) for i,s in enumerate(corpus)), reverse=True)
|
132 |
for elem in x:
|
133 |
final_textrank_list.append(elem[1])
|
|
|
|
|
|
|
|
|
134 |
|
135 |
a=int((10*len(final_textrank_list))/100.0)
|
136 |
if(a<5):
|
@@ -154,6 +163,11 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
|
|
154 |
top_n = max_num_keywords
|
155 |
keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
|
156 |
keywords = '\n'.join(keyword_list)
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
c_len=(len(keyword_list))
|
159 |
keyword_embeddings = embedder.encode(keyword_list)
|
@@ -196,6 +210,9 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
|
|
196 |
f_1=' OR '.join(f1_list)
|
197 |
final_list.append(f_1)
|
198 |
|
|
|
|
|
|
|
199 |
|
200 |
ncbi_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
|
201 |
|
@@ -215,7 +232,7 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
|
|
215 |
search_id='&id='+all_search_ids
|
216 |
ret_type='&rettype=text'
|
217 |
ret_mode='&retmode=xml'
|
218 |
-
ret_max='&retmax=
|
219 |
ret_sort='&sort=relevance'
|
220 |
return_url=ncbi_url+fetch_url+search_id+ret_type+ret_mode+ret_max+ret_sort
|
221 |
pubmed_abstract_request = requests.get(return_url)
|
@@ -228,12 +245,33 @@ def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_
|
|
228 |
for b in article_abstract:
|
229 |
article_abstract_name = b.text
|
230 |
abstracts_list.append(article_abstract_name)
|
231 |
-
mydict = {'Title': titles_list, 'Abstract':abstracts_list}
|
232 |
-
|
233 |
|
234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
df_final = df_new.fillna(' ')
|
236 |
|
|
|
237 |
|
238 |
return df_final
|
239 |
|
@@ -276,10 +314,17 @@ igen_pubmed = gr.Interface(keyphrase_generator,
|
|
276 |
'cambridgeltl/SapBERT-from-PubMedBERT-fulltext-mean-token'],
|
277 |
type="value",
|
278 |
default='cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
|
279 |
-
label="Select any SapBERT model for clustering from the list below")
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
outputs=gr.outputs.Dataframe(type="auto", label="Retrieved Results from PubMed",max_cols=None, overflow_row_behaviour="paginate"),
|
281 |
theme="dark-peach",
|
282 |
-
title="PubMed Abstract Retriever", description="Retrieves relevant PubMed abstracts for an online article which can be used as further references.",
|
|
|
283 |
article= "This work is based on the paper <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>provided here</a>."
|
284 |
"\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
|
285 |
"\t The application then uses a UMLS based BERT model, <a href=https://arxiv.org/abs/2010.11784>SapBERT</a> to cluster the keyphrases using K-means clustering method and finally create a boolean query. After that the top 10 titles and abstracts are retrieved from PubMed database and displayed according to relevancy. The SapBERT models can be changed as per the list provided. "
|
|
|
55 |
|
56 |
|
57 |
|
|
|
|
|
|
|
58 |
def remove_stopwords(sen):
|
59 |
sen_new = " ".join([i for i in sen if i not in stop_words])
|
60 |
return sen_new
|
61 |
|
62 |
+
examples = [
|
63 |
+
["https://www.medicalnewstoday.com/articles/alzheimers-addressing-sleep-disturbance-may-alleviate-symptoms"],
|
64 |
+
["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant"],
|
65 |
+
["https://www.cancer.news/2022-02-04-doctors-testifying-covid-vaccines-causing-cancer-aids.html#",
|
66 |
+
"https://www.cancer.news/2021-12-22-mrna-vaccines-weaken-immune-system-cause-cancer.html"]
|
67 |
+
]
|
68 |
|
69 |
|
70 |
+
def keyphrase_generator(article_link, model_1, model_2, max_num_keywords, model_3, max_retrieved, model_4):
|
71 |
+
|
72 |
word_embedding_model = models.Transformer(model_3)
|
73 |
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
|
74 |
pooling_mode_mean_tokens=True,
|
|
|
91 |
final_list=[]
|
92 |
score_list=[]
|
93 |
sum_list=[]
|
94 |
+
############################################## Here we first extract the sentences using SBERT and Textrank ###########################
|
95 |
model_1 = SentenceTransformer(model_1)
|
96 |
model_2 = SentenceTransformer(model_2)
|
97 |
url = article_link
|
|
|
136 |
x=sorted(((sum_list[i],s) for i,s in enumerate(corpus)), reverse=True)
|
137 |
for elem in x:
|
138 |
final_textrank_list.append(elem[1])
|
139 |
+
|
140 |
+
################################################################ Textrank ends ##################################################
|
141 |
+
|
142 |
+
######################################################## From here we start the keyphrase extraction process ################################################
|
143 |
|
144 |
a=int((10*len(final_textrank_list))/100.0)
|
145 |
if(a<5):
|
|
|
163 |
top_n = max_num_keywords
|
164 |
keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
|
165 |
keywords = '\n'.join(keyword_list)
|
166 |
+
|
167 |
+
############################################################## Keyphrase extraction ends #############################################
|
168 |
+
|
169 |
+
|
170 |
+
################################################################## From here we start the clustering and query generation ##################################
|
171 |
|
172 |
c_len=(len(keyword_list))
|
173 |
keyword_embeddings = embedder.encode(keyword_list)
|
|
|
210 |
f_1=' OR '.join(f1_list)
|
211 |
final_list.append(f_1)
|
212 |
|
213 |
+
######################################################## query generation ends here #######################################
|
214 |
+
|
215 |
+
####################################### PubMed abstract extraction starts here #########################################
|
216 |
|
217 |
ncbi_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
|
218 |
|
|
|
232 |
search_id='&id='+all_search_ids
|
233 |
ret_type='&rettype=text'
|
234 |
ret_mode='&retmode=xml'
|
235 |
+
ret_max='&retmax=500'
|
236 |
ret_sort='&sort=relevance'
|
237 |
return_url=ncbi_url+fetch_url+search_id+ret_type+ret_mode+ret_max+ret_sort
|
238 |
pubmed_abstract_request = requests.get(return_url)
|
|
|
245 |
for b in article_abstract:
|
246 |
article_abstract_name = b.text
|
247 |
abstracts_list.append(article_abstract_name)
|
|
|
|
|
248 |
|
249 |
+
################################## PubMed extraction ends here ########################################################
|
250 |
+
|
251 |
+
########################################## Most relevant abstracts as per news article heading starts here ##########################################
|
252 |
+
|
253 |
+
first_article = Article(url, language='en')
|
254 |
+
first_article.download()
|
255 |
+
first_article.parse()
|
256 |
+
article_heading=(first_article.title)
|
257 |
+
article_heading=sent_tokenize(article_heading)
|
258 |
+
model_4 = SentenceTransformer(model_4)
|
259 |
+
|
260 |
+
my_dict = dict(zip(titles_list,abstracts_list))
|
261 |
+
title_embeddings = model_4.encode(titles_list)
|
262 |
+
heading_embedding = model_4.encode(article_heading)
|
263 |
+
similarities = cosine_similarity(heading_embedding, title_embeddings)
|
264 |
+
max_n = max_retrieved
|
265 |
+
sorted_titles = [titles_list[index] for index in similarities.argsort()[0][-max_n:]]
|
266 |
+
sorted_abstract_list=[]
|
267 |
+
for list_elem in sorted_titles:
|
268 |
+
sorted_abstract_list.append(my_dict[list_elem])
|
269 |
+
sorted_dict = {'Title': sorted_titles, 'Abstract': sorted_abstract_list}
|
270 |
+
df_new=pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in sorted_dict.items() ]))
|
271 |
+
|
272 |
df_final = df_new.fillna(' ')
|
273 |
|
274 |
+
############################################# Ends here ####################################################
|
275 |
|
276 |
return df_final
|
277 |
|
|
|
314 |
'cambridgeltl/SapBERT-from-PubMedBERT-fulltext-mean-token'],
|
315 |
type="value",
|
316 |
default='cambridgeltl/SapBERT-from-PubMedBERT-fulltext',
|
317 |
+
label="Select any SapBERT model for clustering from the list below"),
|
318 |
+
gr.inputs.Slider(minimum=5, maximum=15, step=1, default=10, label="PubMed Max Abstracts"),
|
319 |
+
gr.inputs.Dropdown(choices=['pritamdeka/S-Bluebert-snli-multinli-stsb',
|
320 |
+
'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb'],
|
321 |
+
type="value",
|
322 |
+
default='pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
|
323 |
+
label="Select any SBERT model for abstracts from the list below")],
|
324 |
outputs=gr.outputs.Dataframe(type="auto", label="Retrieved Results from PubMed",max_cols=None, overflow_row_behaviour="paginate"),
|
325 |
theme="dark-peach",
|
326 |
+
title="PubMed Abstract Retriever", description="Retrieves relevant PubMed abstracts for an online article which can be used as further references. Please note that it may take sometime for the models to load.",
|
327 |
+
examples=examples,
|
328 |
article= "This work is based on the paper <a href=https://dl.acm.org/doi/10.1145/3487664.3487701>provided here</a>."
|
329 |
"\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
|
330 |
"\t The application then uses a UMLS based BERT model, <a href=https://arxiv.org/abs/2010.11784>SapBERT</a> to cluster the keyphrases using K-means clustering method and finally create a boolean query. After that the top 10 titles and abstracts are retrieved from PubMed database and displayed according to relevancy. The SapBERT models can be changed as per the list provided. "
|