#!/usr/bin/env python
# coding: utf-8
"""Correlate embedding-based protein similarities with reference similarities.

For each ontology aspect a reference protein-by-protein similarity matrix is
read from CSV, cosine / Manhattan / Euclidean similarities are computed
between the representation vectors of the same protein pairs, and Spearman
correlations between the reference and the computed values are written to a
CSV file.

The module-level settings below are placeholders; nothing in this file assigns
them real values, so they are presumably set by the caller before
``calculate_all_correlations()`` is invoked -- TODO confirm.
"""

import csv
import gzip
import itertools
import multiprocessing
import pickle
import random
from functools import partial
from multiprocessing import Manager, Pool

import numpy as np
import pandas as pd
from numpy.linalg import norm
from scipy.spatial.distance import cdist
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics.pairwise import cosine_similarity as cosine
from tqdm import tqdm, tqdm_notebook

# Lists shared between the parent process and the pool workers.
manager = Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

# Settings read as globals by the functions below.
# NOTE(review): the pool workers read these globals too, which presumably
# relies on a fork-style start method -- confirm for the target platform.
representation_dataframe = ""
protein_names = ""
representation_name = ""
similarity_tasks = ""
detailed_output = False


def _embedding_vector(embedding_frame, protein):
    """Return the 'Vector' value of the single row whose Entry is *protein*.

    ``Series.item()`` raises ``ValueError`` unless exactly one row matches.
    """
    return np.asarray(
        embedding_frame.query("Entry == @protein")["Vector"].item()
    )


def _normalised_similarity(vec_a, vec_b, metric, order):
    """Return ``1 - dist(a, b) / (|a| + |b|)`` for the given metric and norm.

    When both norms are zero the similarity is defined as 1.0; checking this
    first avoids the 0/0 division (NaN plus a RuntimeWarning).
    """
    norm_total = norm(vec_a, order) + norm(vec_b, order)
    if norm_total == 0:
        return 1.0
    distance = cdist(vec_a.reshape(1, -1), vec_b.reshape(1, -1), metric)
    return 1 - (distance / norm_total).item()


def parallelSimilarity(paramList):
    """Score one protein pair and append the result to ``similarity_list``.

    Args:
        paramList: indexable ``(i, j, aspect, real)``. ``i`` and ``j`` are
            positions in ``proteinListNew``; ``aspect`` is not used here;
            ``real`` is the reference similarity stored next to the computed
            values.

    Returns:
        The shared ``similarity_list`` proxy. A tuple
        ``(real, cosine, manhattan_sim, euclidean_sim)`` is appended only when
        ``j > i`` and both proteins are in ``protein_names``.
    """
    i = paramList[0]
    j = paramList[1]
    if not j > i:
        return similarity_list

    protein1 = proteinListNew[i]
    protein2 = proteinListNew[j]
    if not (protein1 in protein_names and protein2 in protein_names):
        return similarity_list

    prot1vec = _embedding_vector(representation_dataframe, protein1)
    prot2vec = _embedding_vector(representation_dataframe, protein2)

    # cosine() returns a (1, 1) array for two single-row inputs.
    cos = cosine(prot1vec.reshape(1, -1), prot2vec.reshape(1, -1)).item()
    manhattanSim = _normalised_similarity(prot1vec, prot2vec, "cityblock", 1)
    euclidianSim = _normalised_similarity(prot1vec, prot2vec, "euclidean", 2)

    real = paramList[3]
    # The reference and computed values are stored in one tuple so they stay
    # paired even though workers append in arbitrary order.
    similarity_list.append((real, cos, manhattanSim, euclidianSim))
    return similarity_list


def calculateCorrelationforOntology(aspect, matrix_type):
    """Compute Spearman correlations for one aspect and one matrix type.

    Args:
        aspect: ontology aspect label used in the input file names.
        matrix_type: one of "All", "500", "Sparse" or "200"; any other value
            raises ``KeyError``.

    Returns:
        Tuple ``(cosineCorr, manhattanCorr, euclidianCorr)`` of ``spearmanr``
        results, each comparing the reference similarities with one of the
        computed similarity measures.
    """
    print("\n\nSemantic similarity correlation calculation for aspect: " + aspect + " using matrix/dataset: " + matrix_type + " ...\n")

    # Clear the shared lists before each aspect.
    similarity_list[:] = []
    proteinListNew[:] = []

    similarityMatrixNameDict = {
        "All": "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix.csv",
        "500": "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv",
        # "Sparse" reads the same matrix as "500"; only the pair list differs.
        "Sparse": "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv",
        "200": "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv",
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(
        human_proteinSimilarityMatrix.columns, inplace=True
    )
    proteinList = human_proteinSimilarityMatrix.columns

    # proteinListNew is a Manager proxy; fill it with a single call instead
    # of one round trip per protein.
    proteinListNew.extend(list(proteinList))

    if matrix_type == "Sparse":
        # Pre-selected (i, j) coordinates; they are passed on unfiltered and
        # the worker itself skips pairs that do not satisfy j > i.
        index_pairs = np.load("../data/auxilary_input/SparsifiedSimilarityCoordinates_" + aspect + "_for_highest_500.npy")
        pair_count = len(index_pairs)
    else:
        # All pairs with j > i, generated lazily in the same order as the
        # filtered Cartesian product would give.
        protein_count = len(proteinList)
        index_pairs = itertools.combinations(range(protein_count), 2)
        pair_count = protein_count * (protein_count - 1) // 2

    # Prepare the parameters that the worker processes will consume.
    protParamListNew = []
    for tup in tqdm(index_pairs, total=pair_count):
        # The local column index holds the same names as proteinListNew.
        protein1 = proteinList[tup[0]]
        protein2 = proteinList[tup[1]]
        real = human_proteinSimilarityMatrix.loc[protein1, protein2]
        protParamListNew.append((tup[0], tup[1], aspect, real))

    total_task_num = len(protParamListNew)
    # The context manager terminates the workers if a task raises.
    with Pool() as pool:
        task_results = pool.imap_unordered(parallelSimilarity, protParamListNew)
        for _ in tqdm(task_results, total=total_task_num, position=0, leave=True):
            pass
        pool.close()
        pool.join()

    # Copy the shared list once; every worker appended to this same list.
    scored_pairs = similarity_list[:]

    real_distance_list = [value[0] for value in scored_pairs]
    cosine_distance_list = [value[1] for value in scored_pairs]
    manhattan_distance_list = [value[2] for value in scored_pairs]
    euclidian_distance_list = [value[3] for value in scored_pairs]
    distance_lists = [
        real_distance_list,
        cosine_distance_list,
        manhattan_distance_list,
        euclidian_distance_list,
    ]
    if detailed_output:
        report_detailed_distance_scores(
            representation_name, matrix_type, aspect, distance_lists
        )

    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
    return (cosineCorr, manhattanCorr, euclidianCorr)


def report_detailed_distance_scores(representation_name, similarity_matrix_type, aspect, distance_lists):
    """Pickle *distance_lists* to a file under ``../results``.

    The file name is built from the aspect, the matrix type and the
    representation name.
    """
    saveFileName = "../results/Semantic_sim_inference_detailed_distance_scores" + aspect + "_" + similarity_matrix_type + "_" + representation_name + ".pkl"
    with open(saveFileName, "wb") as f:
        pickle.dump(distance_lists, f)


def calculate_all_correlations():
    """Write one correlation CSV per entry of ``similarity_tasks``.

    Each file gets a header row followed by one row for each of the aspects
    "MF", "BP" and "CC", holding the Spearman statistic and p-value (rounded
    to 5 decimals) for the cosine, Manhattan and Euclidean similarities.
    """
    for similarity_matrix_type in similarity_tasks:
        saveFileName = "../results/Semantic_sim_inference_" + similarity_matrix_type + "_" + representation_name + ".csv"
        header = "Semantic Aspect,CosineSim_Correlation,CosineSim_Correlation p-value, ManhattanSim_Correlation,ManhattanSim_Correlation p-value, EuclidianSim_Correlation,EuclidianSim_Correlation p-value \n"
        with open(saveFileName, 'w') as f:
            f.write(header)

        for aspect in ["MF", "BP", "CC"]:
            corr = calculateCorrelationforOntology(aspect, similarity_matrix_type)
            fields = [aspect]
            for measure in corr:
                fields.append(str(round(measure[0], 5)))
                fields.append(str(round(measure[1], 5)))
            # Append after every aspect so finished rows survive a later failure.
            with open(saveFileName, 'a') as f:
                f.write(",".join(fields) + "\n")