import nltk
import re
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('brown')
from newspaper import Article
from newspaper import fulltext
import requests
import itertools
import xml.etree.ElementTree as ET  # used below to parse the NCBI E-utilities XML responses
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import *
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import scipy.spatial
import networkx as nx
from nltk.tokenize import sent_tokenize
import scispacy
import spacy
import en_core_sci_lg
import string
from nltk.stem.wordnet import WordNetLemmatizer
import gradio as gr
import inflect
from Bio import Entrez
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score
import json

p = inflect.engine()
nlp = en_core_sci_lg.load()
sp = en_core_sci_lg.load()
all_stopwords = sp.Defaults.stop_words

word_embedding_model = models.Transformer('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
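# The SapBERT embedder defined above is used further down to embed the candidate
# keyphrases before clustering them with k-means.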
def search(query):
    # Search PubMed for the query and return the (up to 5) most relevant record IDs
    Entrez.email = '[email protected]'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='5',
                            retmode='xml',
                            term=query)
    results = Entrez.read(handle)
    return results

def fetch_details(id_list):
    # Fetch the full PubMed records for a list of PubMed IDs
    ids = ','.join(id_list)
    Entrez.email = '[email protected]'
    handle_1 = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
    results_1 = Entrez.read(handle_1)
    return results_1

def remove_stopwords(sen):
    # Remove English stopwords from a tokenised sentence
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new
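# The Entrez helpers above are not called by the app itself (keyphrase_generator
# queries the E-utilities endpoints directly), but a minimal, hypothetical usage
# would look like this (the query string is only an example):
#     ids = search('statin therapy cardiovascular outcomes')['IdList']
#     papers = fetch_details(ids)
#     titles = [rec['MedlineCitation']['Article']['ArticleTitle']
#               for rec in papers['PubmedArticle']]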
def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
    element = []
    final_textrank_list = []
    document = []
    text_doc = []
    final_list = []
    score_list = []
    sum_list = []
    cluster_list_final = []       # clusterings tried for each cluster count
    silhouette_score_list = []    # silhouette score of each clustering
    comb = []                     # OR-joined keyphrase groups, one per cluster
    comb_list = []
    title_list = []               # PubMed article titles returned to the UI
    model_1 = SentenceTransformer(model_1)
    model_2 = SentenceTransformer(model_2)
    url = article_link
    if not url:
        print("error")
    html = requests.get(url).text
    article = fulltext(html)
    corpus = sent_tokenize(article)
    indicator_list = ['concluded', 'concludes', 'in a study', 'concluding', 'conclude', 'in sum', 'in a recent study',
                      'therefore', 'thus', 'so', 'hence', 'as a result', 'accordingly', 'consequently', 'in short',
                      'proves that', 'shows that', 'suggests that', 'demonstrates that', 'found that', 'observed that',
                      'indicated that', 'suggested that', 'demonstrated that']
    count_dict = {}
    for l in corpus:
        # mark a sentence with 1 if it contains any "conclusion" indicator phrase
        c = 0
        for l2 in indicator_list:
            if l.find(l2) != -1:  # the indicator phrase is a substring of the sentence
                c = 1
                break
        if c:
            count_dict[l] = 1
        else:
            count_dict[l] = 0
    for sent, score in count_dict.items():
        score_list.append(score)
    clean_sentences_new = pd.Series(corpus).str.replace("[^a-zA-Z]", " ", regex=True).tolist()
    corpus_embeddings = model_1.encode(clean_sentences_new)
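    # TextRank over sentence embeddings: build a pairwise cosine-similarity matrix,
    # interpret it as a weighted graph and run PageRank to score every sentence.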
    sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
    for i in range(len(clean_sentences_new)):
        len_embeddings = (len(corpus_embeddings[i]))
        for j in range(len(clean_sentences_new)):
            if i != j:
                if (len_embeddings == 1024):
                    sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1, 1024),
                                                      corpus_embeddings[j].reshape(1, 1024))[0, 0]
                elif (len_embeddings == 768):
                    sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1, 768),
                                                      corpus_embeddings[j].reshape(1, 768))[0, 0]
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    sentences = ((scores[i], s) for i, s in enumerate(corpus))
    for elem in sentences:
        element.append(elem[0])
    for sc, lst in zip(score_list, element):  # taking the scores from both the lists
        sum1 = sc + lst
        sum_list.append(sum1)
    x = sorted(((sum_list[i], s) for i, s in enumerate(corpus)), reverse=True)
    for elem in x:
        final_textrank_list.append(elem[1])
    a = int((10 * len(final_textrank_list)) / 100.0)
    if (a < 5):
        total = 5
    else:
        total = int(a)
    for i in range(total):
        document.append(final_textrank_list[i])
    doc = " ".join(document)
    for i in document:
        doc_1 = nlp(i)
        text_doc.append([X.text for X in doc_1.ents])
    entity_list = [item for sublist in text_doc for item in sublist]
    entity_list = [word for word in entity_list if not word in all_stopwords]
    entity_list = [word_entity for word_entity in entity_list if (p.singular_noun(word_entity) == False)]
    entity_list = list(dict.fromkeys(entity_list))
    doc_embedding = model_2.encode([doc])
    candidates = entity_list
    candidate_embeddings = model_2.encode(candidates)
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    top_n = max_num_keywords
    keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    keywords = '\n'.join(keyword_list)
    c_len = (len(keyword_list))
    keyword_embeddings = embedder.encode(keyword_list)
    data_embeddings = embedder.encode(keyword_list)
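    # Try k-means for each candidate number of clusters and keep the clustering with
    # the highest silhouette score; degenerate cases are given fixed scores below.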
    for num_clusters in range(1, min(top_n, c_len + 1)):  # never request more clusters than keyphrases
        clustering_model = KMeans(n_clusters=num_clusters)
        clustering_model.fit(keyword_embeddings)
        cluster_assignment = clustering_model.labels_
        clustered_sentences = [[] for i in range(num_clusters)]
        for sentence_id, cluster_id in enumerate(cluster_assignment):
            clustered_sentences[cluster_id].append(keyword_list[sentence_id])
        cl_sent_len = (len(clustered_sentences))
        list_cluster = list(clustered_sentences)
        a = len(list_cluster)
        cluster_list_final.append(list_cluster)
        if (c_len == cl_sent_len and c_len >= 3) or cl_sent_len == 1:
            silhouette_avg = 0
            silhouette_score_list.append(silhouette_avg)
        elif c_len == cl_sent_len == 2:
            silhouette_avg = 1
            silhouette_score_list.append(silhouette_avg)
        else:
            silhouette_avg = silhouette_score(keyword_embeddings, cluster_assignment)
            silhouette_score_list.append(silhouette_avg)
    res_dict = dict(zip(silhouette_score_list, cluster_list_final))
    cluster_items = res_dict[max(res_dict)]  # clustering with the best silhouette score
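    # Build the PubMed boolean query: keyphrases within a cluster are OR-ed together,
    # each pair of clusters is AND-ed, and all pairwise combinations are OR-ed.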
    for i in cluster_items:
        z = ' OR '.join(i)
        comb.append("(" + z + ")")
    comb_list.append(comb)
    combinations = []
    for subset in itertools.combinations(comb, 2):
        combinations.append(subset)
    f1_list = []
    for s in combinations:
        final = ' AND '.join(s)
        f1_list.append("(" + final + ")")
    f_1 = ' OR '.join(f1_list)
    final_list.append(f_1)
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    last_url = 'esearch.fcgi?db=pubmed' + '&term=' + f_1
    search_rettype = '&rettype=json'
    overall_url = ncbi_url + last_url + search_rettype + '&sort=relevance'
    r = requests.get(overall_url)
    root = ET.fromstring(r.text)
    levels = root.findall('.//Id')
    name_list = []
    for level in levels:
        name = level.text
        name_list.append(name)
    name_1 = ','.join(name_list)
    fetch_url = 'efetch.fcgi?db=pubmed'
    search_id = '&id=' + name_1
    ret_type = '&rettype=text'
    ret_mode = '&retmode=xml'
    ret_max = '&retmax=10'
    ret_sort = '&sort=relevance'
    return_url = ncbi_url + fetch_url + search_id + ret_type + ret_mode + ret_max + ret_sort
    r_1 = requests.get(return_url)
    root_1 = ET.fromstring(r_1.text)
    levels_1 = root_1.findall('.//ArticleTitle')
    for level in levels_1:
        name = level.text
        title_list.append(name)
    return title_list
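# Gradio UI: the article link, the two SBERT model choices and the keyword limit are
# passed straight to keyphrase_generator(); launch(share=True) prints a local URL and
# a temporary public link when the script is run with Python.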
gr.Interface(keyphrase_generator,
             inputs=[gr.inputs.Textbox(lines=1, placeholder="Provide article web link here", default="", label="Article web link"),
                     gr.inputs.Dropdown(choices=['sentence-transformers/all-mpnet-base-v2',
                                                 'sentence-transformers/all-mpnet-base-v1',
                                                 'sentence-transformers/all-distilroberta-v1',
                                                 'sentence-transformers/gtr-t5-large',
                                                 'pritamdeka/S-Bluebert-snli-multinli-stsb',
                                                 'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
                                                 'sentence-transformers/stsb-mpnet-base-v2',
                                                 'sentence-transformers/stsb-roberta-base-v2',
                                                 'sentence-transformers/stsb-distilroberta-base-v2',
                                                 'sentence-transformers/sentence-t5-large',
                                                 'sentence-transformers/sentence-t5-base'],
                                        type="value",
                                        default='sentence-transformers/all-mpnet-base-v1',
                                        label="Select any SBERT model for TextRank from the list below"),
                     gr.inputs.Dropdown(choices=['sentence-transformers/paraphrase-mpnet-base-v2',
                                                 'sentence-transformers/all-mpnet-base-v1',
                                                 'sentence-transformers/paraphrase-distilroberta-base-v1',
                                                 'sentence-transformers/paraphrase-xlm-r-multilingual-v1',
                                                 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
                                                 'sentence-transformers/paraphrase-albert-small-v2',
                                                 'sentence-transformers/paraphrase-albert-base-v2',
                                                 'sentence-transformers/paraphrase-MiniLM-L12-v2',
                                                 'sentence-transformers/paraphrase-MiniLM-L6-v2',
                                                 'sentence-transformers/all-MiniLM-L12-v2',
                                                 'sentence-transformers/all-distilroberta-v1',
                                                 'sentence-transformers/paraphrase-TinyBERT-L6-v2',
                                                 'sentence-transformers/paraphrase-MiniLM-L3-v2',
                                                 'sentence-transformers/all-MiniLM-L6-v2'],
                                        type="value",
                                        default='sentence-transformers/all-mpnet-base-v1',
                                        label="Select any SBERT model for keyphrases from the list below"),
                     gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")],
             outputs=gr.outputs.Textbox(type="auto", label="PubMed article titles retrieved with the extracted keyphrases"),
             theme="peach",
             title="Scientific Article Keyphrase Generator",
             description="Generates keyphrases from a scientific article that best describe its content.",
             article="The work is based on a part of the paper <a href='https://dl.acm.org/doi/10.1145/3487664.3487701'>provided here</a>."
                     "\t It uses the TextRank algorithm with SBERT to first find the top sentences and then extracts the keyphrases from those sentences using scispaCy and SBERT."
                     "\t The SBERT models offered in the dropdowns are listed in the <a href='https://www.sbert.net/docs/pretrained_models.html'>SBERT pretrained models hub</a>."
                     "\t Default model names are pre-selected and can be changed to any other model from the lists."
                     "\t The number of output keyphrases can be set between 5 and 30; the default is 10.").launch(share=True, debug=True)