"""Summarise text by k-means clustering of sentence embeddings.

Importing this module has side effects: it unpickles the embedding model
from ``model_path`` and downloads the NLTK ``punkt`` tokenizer data.
"""
import io
import os
import pickle

# Third-party imports are kept in their original relative order.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import nltk
from finbert_embedding.embedding import FinbertEmbedding
import pandas as pd
from nltk.cluster import KMeansClusterer
import numpy as np
from scipy.spatial import distance_matrix
from tensorflow.python.lib.io import file_io

model_path = "t5_10k_small_cpu.sav"

# SECURITY: pickle.load executes arbitrary code embedded in the file.
# Only load model files that come from a trusted source.
# NOTE(review): the object must expose ``sentence_vector(str)`` returning
# something with ``.numpy()``; presumably a FinbertEmbedding instance despite
# the "t5" file name -- confirm against whatever produced the file.
with open(model_path, "rb") as f:
    model = pickle.load(f)

nltk.download('punkt')


def _distance_from_centroid(row):
    """Return the Euclidean distance between a row's embedding and centroid."""
    return distance_matrix(
        [row['Embeddings']], [row['Centroid'].tolist()]
    )[0][0]


def make_abstractive_summary(word, num_clusters=10, iterations=8):
    """Build a summary by picking one representative sentence per cluster.

    Despite the name, the summary is extractive: the text is split into
    sentences, each sentence is embedded with the module-level ``model``,
    the embeddings are k-means clustered (cosine distance), and the sentence
    closest (Euclidean distance) to each cluster centroid is kept. The kept
    sentences are joined with single spaces in their original order.

    Args:
        word: The full input text as a ``str``.
        num_clusters: Number of clusters, i.e. the maximum number of
            sentences in the summary. Defaults to 10.
        iterations: Number of k-means repeats passed to the clusterer.
            Defaults to 8.

    Returns:
        The input text itself (``str``, line endings normalised to ``"\\n"``)
        when it has fewer sentences than ``num_clusters`` or when clustering
        raises ``ValueError``. Otherwise a 3-tuple of labelled strings:
        the summary, the character length of the input, and the character
        length of the summary.
    """
    # Normalise line endings so the tokenizer and the returned text only ever
    # see "\n". Working in memory avoids a shared fixed-name scratch file,
    # which would be race-prone under concurrent calls.
    text = word.replace("\r\n", "\n").replace("\r", "\n")

    tokens = nltk.sent_tokenize(text)
    sentences = [token.strip() for token in tokens]

    # Too few sentences to seed ``num_clusters`` clusters (this includes empty
    # input): return the text unchanged without paying for any embeddings.
    if len(sentences) < num_clusters:
        return text

    data = pd.DataFrame({'Sentences': sentences})

    # One embedding per sentence, converted from tensor to ndarray.
    embeddings = [model.sentence_vector(token).numpy() for token in tokens]
    data['Embeddings'] = embeddings
    X = np.array(embeddings)

    clusterer = KMeansClusterer(
        num_clusters,
        distance=nltk.cluster.util.cosine_distance,
        repeats=iterations,
        avoid_empty_clusters=True,
    )

    # Clustering can still reject the input; fall back to the plain text.
    try:
        assigned_clusters = clusterer.cluster(X, assign_clusters=True)
    except ValueError:
        return text

    centroids = clusterer.means()
    data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
    data['Centroid'] = data['Cluster'].apply(lambda c: centroids[c])
    data['Distance_From_Centroid'] = data.apply(
        _distance_from_centroid, axis=1
    )

    # Closest sentence of each cluster, restored to document order.
    representatives = (
        data.sort_values('Distance_From_Centroid', ascending=True)
        .groupby('Cluster')
        .head(1)
        .sort_index()
    )
    summary = " ".join(representatives['Sentences'].tolist())

    return (
        "FinBERT MODEL OUTPUT:--->" + summary,
        " Length of Input:---->" + str(len(word)),
        " Length of Output:----> " + str(len(summary)),
    )