Spaces:
Sleeping
Sleeping
File size: 6,829 Bytes
dd49f8a 82d01e1 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a de474d4 a7f8182 de474d4 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a 82d01e1 c44e698 a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 dd49f8a a7f8182 0d7fce8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
import tqdm
import multiprocessing
import pandas as pd
import numpy as np
import scipy.stats
import os
import sys
# Absolute directory that contains this script; used further down to locate
# the affinity data file relative to the script rather than the working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))
# Extend the module search path with '..' and '.'. These are relative entries,
# so they are resolved against the process's current working directory,
# not against script_dir.
sys.path.append('..')
sys.path.append('.')
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
# Path of the CSV holding the representation vectors. It is read by
# predict_affinities_and_report_results(), which passes it to load_representation().
# Left as None here -- presumably assigned by the caller before use (TODO confirm).
skempi_vectors_path = None
# Not referenced in the visible part of this file; presumably a label for the
# representation being evaluated, set by the caller (TODO confirm).
representation_name = None
def load_representation(multi_col_representation_vector_file_path):
    '''Load a multi-column representation CSV into a two-column DataFrame.

    The CSV must contain a ``PDB_ID`` column. The first column of the file is
    skipped and every remaining column is treated as one vector component.

    Parameters:
        multi_col_representation_vector_file_path: path (or buffer) accepted
            by ``pandas.read_csv``.

    Returns:
        DataFrame with one row per CSV row and two columns: ``PDB_ID`` and
        ``Vector``, where ``Vector`` is a plain Python list of floats.

    Raises:
        KeyError: if the CSV has no ``PDB_ID`` column.
        ValueError: if a vector component cannot be converted to float.
    '''
    print("\nLoading representation vectors...\n")
    representation_df = pd.read_csv(multi_col_representation_vector_file_path)

    # Convert all component columns in one pass instead of enlarging the
    # result frame row by row, which copies the frame on every insertion.
    components = representation_df.iloc[:, 1:].astype(float)
    vectors = components.to_numpy().tolist()

    return pd.DataFrame({
        'PDB_ID': representation_df['PDB_ID'].tolist(),
        # dtype=object keeps each list as a single cell value.
        'Vector': pd.Series(vectors, dtype='object'),
    })
def calc_train_error(X_train, y_train, model):
    '''Score an already fitted model on the data it was trained on.

    Returns a 3-tuple ``(mse, mae, corr)`` where ``corr`` is the result
    object of ``scipy.stats.pearsonr`` (correlation coefficient and p-value).
    '''
    fitted_values = model.predict(X_train)
    return (
        mean_squared_error(y_train, fitted_values),
        mean_absolute_error(y_train, fitted_values),
        scipy.stats.pearsonr(y_train, fitted_values),
    )
def calc_validation_error(X_test, y_test, model):
    '''Score an already fitted model on held-out data.

    Returns a 3-tuple ``(mse, mae, corr)`` where ``corr`` is the result
    object of ``scipy.stats.pearsonr`` (correlation coefficient and p-value).
    '''
    held_out_predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, held_out_predictions),
        mean_absolute_error(y_test, held_out_predictions),
        scipy.stats.pearsonr(y_test, held_out_predictions),
    )
def calc_metrics(X_train, y_train, X_test, y_test, model):
    '''Fit ``model`` on the training split and score it on the test split.

    Only out-of-sample metrics are produced; in-sample metrics are not
    computed here.

    Returns a 3-tuple ``(mse, mae, corr)`` as produced by
    ``calc_validation_error``.
    '''
    model.fit(X_train, y_train)
    held_out_scores = calc_validation_error(X_test, y_test, model)
    # Unpack explicitly so an unexpected number of metrics fails loudly here.
    mse, mae, corr = held_out_scores
    return mse, mae, corr
def report_results(
    validation_mse_error_list,
    validation_mae_error_list,
    validation_corr_list,
    validation_corr_pval_list,
):
    '''Aggregate per-fold validation metrics into a summary and a detail dict.

    Summary: mean and standard deviation of the MSE and MAE values, scaled
    by 100, plus the unscaled mean correlation and mean p-value; every
    entry is rounded to 4 decimals.

    Detail: each of the four per-fold lists, every element scaled by 100.

    Returns the tuple ``(result_summary, result_detail)``.
    '''
    def _scaled_stat(stat, values):
        # Apply the statistic first, then express it on the x100 scale.
        return round(stat(values) * 100, 4)

    def _plain_mean(values):
        return round(np.mean(values), 4)

    result_summary = {
        "val_mse_error": _scaled_stat(np.mean, validation_mse_error_list),
        "val_mse_std": _scaled_stat(np.std, validation_mse_error_list),
        "val_mae_error": _scaled_stat(np.mean, validation_mae_error_list),
        "val_mae_std": _scaled_stat(np.std, validation_mae_error_list),
        "validation_corr": _plain_mean(validation_corr_list),
        "validation_corr_pval": _plain_mean(validation_corr_pval_list),
    }

    per_fold_sources = (
        ("val_mse_errors", validation_mse_error_list),
        ("val_mae_errors", validation_mae_error_list),
        ("validation_corrs", validation_corr_list),
        ("validation_corr_pvals", validation_corr_pval_list),
    )
    result_detail = {
        key: list(np.multiply(fold_values, 100))
        for key, fold_values in per_fold_sources
    }

    return result_summary, result_detail
def predictAffinityWithModel(regressor_model, multiplied_vectors_df):
    '''Cross-validate a regressor that predicts affinity from pair vectors.

    Runs a shuffled 10-fold cross-validation (fixed seed 42). The affinity
    targets are min-max scaled over the whole filtered table before the
    folds are split, so all reported errors are on that scaled target.

    Parameters:
        regressor_model: estimator exposing ``fit`` and ``predict``. The same
            object is re-fitted on every fold; no fresh copy is created.
        multiplied_vectors_df: DataFrame with ``Protein1``, ``Protein2`` and
            ``Vector`` columns, one row per protein pair.

    Returns:
        ``(result_summary, result_detail)`` as produced by ``report_results``.
    '''
    K = 10
    kf = KFold(n_splits=K, shuffle=True, random_state=42)

    validation_mse_error_list = []
    validation_mae_error_list = []
    validation_corr_list = []
    validation_corr_pval_list = []

    data = np.array(np.asarray(multiplied_vectors_df["Vector"].tolist()), dtype=float)

    # NOTE(review): rows are selected by independent membership of Protein1
    # and Protein2, not by matching (Protein1, Protein2) pairs. The targets
    # line up with `data` only if multiplied_vectors_df was derived from
    # ppi_affinity_df in the same row order -- confirm for other callers.
    ppi_affinity_filtered_df = ppi_affinity_df[
        ppi_affinity_df['Protein1'].isin(multiplied_vectors_df['Protein1']) &
        ppi_affinity_df['Protein2'].isin(multiplied_vectors_df['Protein2'])
    ]
    target = np.array(ppi_affinity_filtered_df["Affinity"])

    # MinMaxScaler works on 2-D input, so scale as a column and flatten back.
    scaler = MinMaxScaler()
    scaler.fit(target.reshape(-1, 1))
    target = scaler.transform(target.reshape(-1, 1))[:, 0]

    for train_index, val_index in tqdm.tqdm(kf.split(data, target), total=K):
        # split data
        X_train, X_val = data[train_index], data[val_index]
        y_train, y_val = target[train_index], target[val_index]

        # fit on the training fold and score on the validation fold
        val_mse_error, val_mae_error, val_corr = calc_metrics(
            X_train, y_train, X_val, y_val, regressor_model
        )

        # val_corr is the pearsonr result: [0] is the coefficient, [1] the p-value
        validation_mse_error_list.append(val_mse_error)
        validation_mae_error_list.append(val_mae_error)
        validation_corr_list.append(val_corr[0])
        validation_corr_pval_list.append(val_corr[1])

    return report_results(
        validation_mse_error_list,
        validation_mae_error_list,
        validation_corr_list,
        validation_corr_pval_list,
    )
# Location of the affinity table, relative to the directory of this script.
ppi_affinity_file_path = "../data/auxilary_input/skempi_pipr/SKEMPI_all_dg_avg.txt"
ppi_affinity_file = os.path.join(script_dir, ppi_affinity_file_path)
# Loaded once at import time as a tab-separated file without a header row.
# predictAffinityWithModel() and calculate_vector_multiplications() read this
# frame as a module-level global.
ppi_affinity_df = pd.read_csv(ppi_affinity_file, sep="\t", header=None)
# No header in the file, so the three columns are named explicitly.
ppi_affinity_df.columns = ['Protein1', 'Protein2', 'Affinity']
def calculate_vector_multiplications(skempi_vectors_df):
    '''Build the element-wise product vector for every usable protein pair.

    Walks the module-level ``ppi_affinity_df`` in row order and, for each
    pair whose two proteins both have a representation vector, multiplies
    the two vectors element-wise.

    Parameters:
        skempi_vectors_df: DataFrame with ``PDB_ID`` and ``Vector`` columns.
            If an ID occurs more than once, its first vector is used.

    Returns:
        DataFrame with columns ``Protein1``, ``Protein2`` and ``Vector``
        (the product as a NumPy array), one row per matched pair, in the
        order the pairs appear in ``ppi_affinity_df``.
    '''
    print("Element-wise vector multiplications are being calculated")

    # Index the vectors by ID once so each pair costs two dict lookups
    # instead of two scans over the whole representation frame.
    vector_by_id = {}
    for pdb_id, vector in zip(skempi_vectors_df['PDB_ID'], skempi_vectors_df['Vector']):
        vector_by_id.setdefault(pdb_id, vector)  # keep the first occurrence

    # Collect rows in a list and build the frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied the
    # whole frame on every call in older versions.
    pair_records = []
    protein_pairs = zip(ppi_affinity_df['Protein1'], ppi_affinity_df['Protein2'])
    for protein1, protein2 in tqdm.tqdm(protein_pairs, total=len(ppi_affinity_df)):
        if protein1 in vector_by_id and protein2 in vector_by_id:
            pair_records.append({
                'Protein1': protein1,
                'Protein2': protein2,
                'Vector': np.multiply(vector_by_id[protein1], vector_by_id[protein2]),
            })

    return pd.DataFrame(pair_records, columns=['Protein1', 'Protein2', 'Vector'])
def predict_affinities_and_report_results():
    '''Run the whole affinity benchmark for the configured representation.

    Loads the vectors from the module-level ``skempi_vectors_path``, builds
    the pairwise product vectors, cross-validates a ``BayesianRidge``
    regressor on them and returns the metrics.

    Returns:
        dict with two keys: ``'summary'`` (aggregated metrics) and
        ``'detail'`` (per-fold metrics).

    Raises:
        ValueError: if ``skempi_vectors_path`` has not been set.
    '''
    # Fail with an actionable message rather than letting the CSV reader
    # reject a None path.
    if skempi_vectors_path is None:
        raise ValueError(
            "skempi_vectors_path must be set before calling "
            "predict_affinities_and_report_results()"
        )

    skempi_vectors_df = load_representation(skempi_vectors_path)
    multiplied_vectors_df = calculate_vector_multiplications(skempi_vectors_df)

    model = linear_model.BayesianRidge()
    result_summary, result_detail = predictAffinityWithModel(model, multiplied_vectors_df)

    # Results are handed back to the caller as a dictionary; nothing is written to disk here.
    return {
        'summary': result_summary,
        'detail': result_detail,
    }