mgyigit committed on
Commit
37a12fb
·
verified ·
1 Parent(s): 81b1545

Update src/bin/target_family_classifier.py

Browse files
Files changed (1) hide show
  1. src/bin/target_family_classifier.py +54 -134
src/bin/target_family_classifier.py CHANGED
@@ -4,27 +4,18 @@ Created on Mon Jun 8 09:32:26 2020
4
 
5
  @author: Muammer
6
  """
7
- import os
8
- script_dir = os.path.dirname(os.path.abspath(__file__))
9
-
10
  import numpy as np
11
- from sklearn.model_selection import cross_validate
12
- from sklearn.model_selection import cross_val_predict
13
- from sklearn.metrics import matthews_corrcoef
14
- from sklearn.metrics import classification_report
15
- from sklearn.multiclass import OneVsRestClassifier
16
- from sklearn import linear_model
17
- from sklearn.metrics import f1_score
18
- from sklearn.metrics import confusion_matrix
19
  from sklearn.model_selection import train_test_split
20
- import pandas as pd
21
- from numpy import save
22
- from sklearn.metrics import precision_recall_fscore_support
 
 
 
23
  from tqdm import tqdm
24
- from sklearn.metrics import accuracy_score
25
  import math
26
 
27
-
28
  representation_name = ""
29
  representation_path = ""
30
  dataset = "nc"
@@ -33,68 +24,40 @@ detailed_output = False
33
def convert_dataframe_to_multi_col(representation_dataframe):
    """Expand the list-valued ``Vector`` column into one column per component.

    Parameters
    ----------
    representation_dataframe : pandas.DataFrame
        Frame containing an ``Entry`` column and a ``Vector`` column whose
        cells are sequences of numbers.

    Returns
    -------
    pandas.DataFrame
        The ``Entry`` column followed by integer-named columns (``0``, ``1``,
        ...) holding the vector components, carrying the input's index.
    """
    entry = pd.DataFrame(representation_dataframe['Entry'])
    # Build the component frame on the input's own index.  With the default
    # 0..n-1 index the index-based merge below silently drops or mispairs rows
    # whenever the input has been filtered, sorted or otherwise re-indexed.
    vector = pd.DataFrame(
        list(representation_dataframe['Vector']),
        index=representation_dataframe.index,
    )
    multi_col_representation_vector = pd.merge(
        left=entry, right=vector, left_index=True, right_index=True
    )
    return multi_col_representation_vector
38
 
39
def class_based_scores(c_report, c_matrix):
    """Build a per-class score table and pooled one-vs-rest counts.

    Parameters
    ----------
    c_report : dict
        Report mapping convertible with ``pd.DataFrame`` (the caller passes the
        result of ``classification_report(..., output_dict=True)``).  It must
        contain the rows ``accuracy``, ``macro avg`` and ``weighted avg`` and
        the columns ``precision``, ``recall``, ``f1-score`` and ``support``.
    c_matrix : array-like of shape (n_classes, n_classes)
        Confusion matrix whose class order matches the per-class rows of
        ``c_report``.

    Returns
    -------
    tuple
        ``(c_report, total_tp, total_fp, total_fn, total_tn)`` where
        ``c_report`` is a DataFrame with one row per class and the columns
        ``f1-score``, ``support``, ``accuracy`` and ``mcc``, and the totals are
        the one-vs-rest counts summed over all classes.

    Raises
    ------
    ValueError
        If ``c_matrix`` is not square or its size differs from the number of
        per-class rows in the report.
    """
    c_report = pd.DataFrame(c_report).transpose()
    c_report = c_report.drop(['precision', 'recall'], axis=1)
    c_report = c_report.drop(labels=['accuracy', 'macro avg', 'weighted avg'], axis=0)

    raw_counts = np.asarray(c_matrix)
    num_classes = raw_counts.shape[0]
    if raw_counts.shape != (num_classes, num_classes) or num_classes != len(c_report.index):
        raise ValueError('confusion matrix does not match the per-class rows of the report')

    # All ratio arithmetic is done in float64: the MCC denominator multiplies
    # four counts together and overflows int64 once they reach ~5e4.
    counts = raw_counts.astype('float64')
    diagonal = counts.diagonal()
    row_rest = counts.sum(axis=1) - diagonal   # same row, other columns
    col_rest = counts.sum(axis=0) - diagonal   # same column, other rows

    # Diagonal over row total; a class with an empty row stays NaN (undefined).
    with np.errstate(divide='ignore', invalid='ignore'):
        accuracy = diagonal / counts.sum(axis=1)
    c_report['accuracy'] = pd.Series(accuracy, index=c_report.index)

    total = float(c_report['support'].sum())
    rest = total - diagonal - row_rest - col_rest
    numerator = diagonal * rest - row_rest * col_rest
    with np.errstate(invalid='ignore'):
        denominator = np.sqrt(
            (diagonal + row_rest) * (diagonal + col_rest)
            * (rest + row_rest) * (rest + col_rest)
        )
    # MCC is undefined when the denominator is zero; report 0.0 in that case
    # (the convention scikit-learn's matthews_corrcoef uses) instead of NaN.
    mcc = np.zeros(shape=(num_classes,), dtype='float64')
    np.divide(numerator, denominator, out=mcc, where=denominator > 0)
    c_report['mcc'] = pd.Series(mcc.astype('float32'), index=c_report.index)

    # Pooled counts keep the original naming: "fp" sums the off-diagonal row
    # entries and "fn" the off-diagonal column entries.
    total_tp = raw_counts.diagonal().sum()
    total_fp = (raw_counts.sum(axis=1) - raw_counts.diagonal()).sum()
    total_fn = (raw_counts.sum(axis=0) - raw_counts.diagonal()).sum()
    total_tn = int(rest.sum())
    return c_report, total_tp, total_fp, total_fn, total_tn
85
-
86
 
 
87
 
88
  def score_protein_rep(dataset):
89
- #def score_protein_rep(pkl_data_path):
90
-
91
- vecsize = 0
92
- #protein_list = pd.read_csv('../data/auxilary_input/entry_class.csv')
93
  protein_list = pd.read_csv(os.path.join(script_dir, '../data/preprocess/entry_class_nn.csv'))
94
  dataframe = pd.read_csv(representation_path)
95
- #dataframe = convert_dataframe_to_multi_col(dataframe)
96
- #dataframe = pd.read_pickle(pkl_data_path)
97
- vecsize = dataframe.shape[1]-1
98
  x = np.empty([0, vecsize])
99
  xemp = np.zeros((1, vecsize), dtype=float)
100
  y = []
@@ -104,125 +67,82 @@ def score_protein_rep(dataset):
104
  for index, row in tqdm(protein_list.iterrows(), total=len(protein_list)):
105
  pdrow = dataframe.loc[dataframe['Entry'] == row['Entry']]
106
  if len(pdrow) != 0:
107
- a = pdrow.loc[ : , pdrow.columns != 'Entry']
108
  a = np.array(a)
109
- a.shape = (1,vecsize)
110
  x = np.append(x, a, axis=0)
111
  y.append(row['Class'])
112
  else:
113
  ne.append(index)
114
- x = np.append(x, xemp, axis=0,)
115
  y.append(0.0)
116
- #print(index)
117
 
118
  x = x.astype(np.float64)
119
  y = np.array(y)
120
  y = y.astype(np.float64)
121
- #print(len(y))
122
- scoring = ['precision_weighted', 'recall_weighted', 'f1_weighted', 'accuracy']
123
  target_names = ['Enzyme', 'Membrane receptor', 'Transcription factor', 'Ion channel', 'Other']
124
  labels = [1.0, 11.0, 12.0, 1005.0, 2000.0]
125
-
126
  f1 = []
127
  accuracy = []
128
  mcc = []
129
- f1_perclass = []
130
- ac_perclass = []
131
- mcc_perclass = []
132
- sup_perclass = []
133
  report_list = []
134
- train_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/'+dataset+'_trainindex.csv'))
 
135
  test_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/testindex_family.csv'))
136
- train_index = train_index.dropna(axis=1)
137
  test_index = test_index.dropna(axis=1)
138
- #print(train_index)
139
- #for index in ne:
140
-
141
 
142
- conf = pd.DataFrame()
143
 
144
  print('Producing protein family predictions...\n')
145
- for i in tqdm(range(10)):
146
- clf = linear_model.SGDClassifier(class_weight="balanced", loss="log", penalty="elasticnet", max_iter=1000, tol=1e-3,random_state=i,n_jobs=-1)
147
- clf2 = OneVsRestClassifier(clf,n_jobs=-1)
148
- #print(test_index)
149
  train_indexx = train_index.iloc[i].astype(int)
150
  test_indexx = test_index.iloc[i].astype(int)
151
- #print(train_indexx)
152
- #train_indexx.drop(labels=ne)
153
- #print(type(train_indexx))
154
- for index in ne:
155
-
156
- train_indexx = train_indexx[train_indexx!=index]
157
- test_indexx = test_indexx[test_indexx!=index]
158
-
159
 
 
 
 
160
 
161
  train_X, test_X = x[train_indexx], x[test_indexx]
162
  train_y, test_y = y[train_indexx], y[test_indexx]
163
 
164
- clf2.fit(train_X, train_y)
165
-
166
- #print(train_X)
167
  y_pred = clf2.predict(test_X)
168
-
169
- #y_pred = cross_val_predict(clf2, x, y, cv=10, n_jobs=-1)
170
- #mcc.append(matthews_corrcoef(test_y, y_pred, sample_weight = test_y))
171
  f1_ = f1_score(test_y, y_pred, average='weighted')
172
  f1.append(f1_)
 
173
  ac = accuracy_score(test_y, y_pred)
174
  accuracy.append(ac)
 
175
  c_report = classification_report(test_y, y_pred, target_names=target_names, output_dict=True)
176
  c_matrix = confusion_matrix(test_y, y_pred, labels=labels)
 
177
 
178
- conf = conf.append(pd.DataFrame(c_matrix, columns=['Enzymes', 'Membrane receptor', 'Transcription factor', 'Ion channel', 'Other']), ignore_index=True)
179
- class_report, tp, fp, fn, tn = class_based_scores(c_report, c_matrix)
180
-
181
- #print(total_tp)
182
- mcc.append(((tp*tn)-(fp*fn))/math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
183
-
184
 
185
- f1_perclass.append(class_report['f1-score'])
186
- ac_perclass.append(class_report['accuracy'])
187
- mcc_perclass.append(class_report['mcc'])
188
- sup_perclass.append(class_report['support'])
189
  report_list.append(class_report)
190
-
191
- if detailed_output:
192
- conf.to_csv(os.path.join(script_dir, '../results/Drug_target_protein_family_classification_confusion_'+dataset+'_'+representation_name+'.csv'), index=None)
193
-
194
- f1_perclass = pd.concat(f1_perclass, axis=1)
195
- ac_perclass = pd.concat(ac_perclass, axis=1)
196
- mcc_perclass = pd.concat(mcc_perclass, axis=1)
197
- sup_perclass = pd.concat(sup_perclass, axis=1)
198
-
199
- report_list = pd.concat(report_list, axis=1)
200
- report_list.to_csv(os,path,join(script_dir, '../results/Drug_target_protein_family_classification_class_based_results_'+dataset+'_'+representation_name+'.csv'))
201
-
202
- report = pd.DataFrame()
203
- f1mean = np.mean(f1, axis=0)
204
- #print(f1mean)
205
- f1mean = f1mean.round(decimals=5)
206
- f1std = np.std(f1).round(decimals=5)
207
- acmean = np.mean(accuracy, axis=0).round(decimals=5)
208
- acstd = np.std(accuracy).round(decimals=5)
209
- mccmean = np.mean(mcc, axis=0).round(decimals=5)
210
- mccstd = np.std(mcc).round(decimals=5)
211
- labels = ['Average Score', 'Standard Deviation']
212
- report['Protein Family'] = labels
213
- report['F1_score'] = [f1mean, f1std]
214
- report['Accuracy'] = [acmean, acstd]
215
- report['MCC'] = [mccmean, mccstd]
216
-
217
- report.to_csv(os.path.join(script_dir, '../results/Drug_target_protein_family_classification_mean_results_'+dataset+'_'+representation_name+'.csv',index=False))
218
- #report.to_csv('scores_general.csv')
219
- #print(report)
220
- if detailed_output:
221
- save('../results/Drug_target_protein_family_classification_f1_'+dataset+'_'+representation_name+'.npy', f1)
222
- save('../results/Drug_target_protein_family_classification_accuracy_'+dataset+'_'+representation_name+'.npy', accuracy)
223
- save('../results/Drug_target_protein_family_classification_mcc_'+dataset+'_'+representation_name+'.npy', mcc)
224
- save('../results/Drug_target_protein_family_classification_class_based_f1_'+dataset+'_'+representation_name+'.npy', f1_perclass)
225
- save('../results/Drug_target_protein_family_classification_class_based_accuracy_'+dataset+'_'+representation_name+'.npy', ac_perclass)
226
- save('../results/Drug_target_protein_family_classification_class_based_mcc_'+dataset+'_'+representation_name+'.npy', mcc_perclass)
227
- save('../results/Drug_target_protein_family_classification_class_based_support_'+dataset+'_'+representation_name+'.npy', sup_perclass)
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  @author: Muammer
6
  """
7
+ import os
 
 
8
  import numpy as np
 
 
 
 
 
 
 
 
9
  from sklearn.model_selection import train_test_split
10
+ from sklearn import linear_model
11
+ from sklearn.metrics import (
12
+ f1_score, accuracy_score, confusion_matrix, classification_report, matthews_corrcoef
13
+ )
14
+ from sklearn.multiclass import OneVsRestClassifier
15
+ import pandas as pd
16
  from tqdm import tqdm
 
17
  import math
18
 
 
19
# Directory containing this script.  score_protein_rep() resolves its CSV
# inputs relative to this directory, so the name must exist at module level.
script_dir = os.path.dirname(os.path.abspath(__file__))

# Module-level settings; presumably overwritten by the caller before
# score_protein_rep() runs -- TODO confirm against the calling script.
representation_name = ""
# CSV file with the representation vectors (read via pd.read_csv).
representation_path = ""
# Prefix of the '<dataset>_trainindex.csv' file that holds the training folds.
dataset = "nc"
 
24
def convert_dataframe_to_multi_col(representation_dataframe):
    """Expand the list-valued ``Vector`` column into one column per component.

    Parameters
    ----------
    representation_dataframe : pandas.DataFrame
        Frame containing an ``Entry`` column and a ``Vector`` column whose
        cells are sequences of numbers.

    Returns
    -------
    pandas.DataFrame
        The ``Entry`` column followed by integer-named columns (``0``, ``1``,
        ...) holding the vector components, carrying the input's index.
    """
    entry = pd.DataFrame(representation_dataframe['Entry'])
    # Build the component frame on the input's own index.  With the default
    # 0..n-1 index the index-based merge below silently drops or mispairs rows
    # whenever the input has been filtered, sorted or otherwise re-indexed.
    vector = pd.DataFrame(
        list(representation_dataframe['Vector']),
        index=representation_dataframe.index,
    )
    multi_col_representation_vector = pd.merge(
        left=entry, right=vector, left_index=True, right_index=True
    )
    return multi_col_representation_vector
29
 
30
def class_based_scores(c_report, c_matrix):
    """Build a per-class score table from a report dict and a confusion matrix.

    Parameters
    ----------
    c_report : dict
        Report mapping convertible with ``pd.DataFrame`` (the caller passes the
        result of ``classification_report(..., output_dict=True)``).  It must
        contain the rows ``accuracy``, ``macro avg`` and ``weighted avg`` and
        the columns ``precision``, ``recall``, ``f1-score`` and ``support``.
    c_matrix : array-like of shape (n_classes, n_classes)
        Confusion matrix whose class order matches the per-class rows of
        ``c_report``.

    Returns
    -------
    pandas.DataFrame
        One row per class with the columns ``f1-score``, ``support``,
        ``accuracy`` (diagonal entry divided by its row total; NaN for an
        empty row) and ``mcc`` (one-vs-rest Matthews correlation, stored as
        float32; 0.0 where it is undefined).

    Raises
    ------
    ValueError
        If ``c_matrix`` is not square or its size differs from the number of
        per-class rows in the report.
    """
    c_report = pd.DataFrame(c_report).transpose()
    c_report = c_report.drop(['precision', 'recall'], axis=1)
    c_report = c_report.drop(labels=['accuracy', 'macro avg', 'weighted avg'], axis=0)

    # Work in float64 throughout: the MCC denominator multiplies four counts
    # together and overflows int64 once they reach ~5e4, which used to yield
    # MCC values above 1 or a "math domain error".
    counts = np.asarray(c_matrix).astype('float64')
    num_classes = counts.shape[0]
    if counts.shape != (num_classes, num_classes) or num_classes != len(c_report.index):
        raise ValueError('confusion matrix does not match the per-class rows of the report')

    diagonal = counts.diagonal()
    row_totals = counts.sum(axis=1)
    row_rest = row_totals - diagonal           # same row, other columns
    col_rest = counts.sum(axis=0) - diagonal   # same column, other rows

    # A class with an empty row has no defined accuracy; keep NaN but do not
    # emit a runtime warning for the 0/0.
    with np.errstate(divide='ignore', invalid='ignore'):
        accuracy = diagonal / row_totals
    c_report['accuracy'] = pd.Series(accuracy, index=c_report.index)

    total = float(c_report['support'].sum())
    rest = total - diagonal - row_rest - col_rest
    numerator = diagonal * rest - row_rest * col_rest
    with np.errstate(invalid='ignore'):
        denominator = np.sqrt(
            (diagonal + row_rest) * (diagonal + col_rest)
            * (rest + row_rest) * (rest + col_rest)
        )
    # MCC is undefined when the denominator is zero; report 0.0 in that case
    # (the convention scikit-learn's matthews_corrcoef uses) instead of NaN.
    mcc = np.zeros(shape=(num_classes,), dtype='float64')
    np.divide(numerator, denominator, out=mcc, where=denominator > 0)
    c_report['mcc'] = pd.Series(mcc.astype('float32'), index=c_report.index)

    return c_report
55
 
56
  def score_protein_rep(dataset):
 
 
 
 
57
  protein_list = pd.read_csv(os.path.join(script_dir, '../data/preprocess/entry_class_nn.csv'))
58
  dataframe = pd.read_csv(representation_path)
59
+ vecsize = dataframe.shape[1] - 1
60
+
 
61
  x = np.empty([0, vecsize])
62
  xemp = np.zeros((1, vecsize), dtype=float)
63
  y = []
 
67
  for index, row in tqdm(protein_list.iterrows(), total=len(protein_list)):
68
  pdrow = dataframe.loc[dataframe['Entry'] == row['Entry']]
69
  if len(pdrow) != 0:
70
+ a = pdrow.loc[:, pdrow.columns != 'Entry']
71
  a = np.array(a)
72
+ a.shape = (1, vecsize)
73
  x = np.append(x, a, axis=0)
74
  y.append(row['Class'])
75
  else:
76
  ne.append(index)
77
+ x = np.append(x, xemp, axis=0)
78
  y.append(0.0)
 
79
 
80
  x = x.astype(np.float64)
81
  y = np.array(y)
82
  y = y.astype(np.float64)
83
+
 
84
  target_names = ['Enzyme', 'Membrane receptor', 'Transcription factor', 'Ion channel', 'Other']
85
  labels = [1.0, 11.0, 12.0, 1005.0, 2000.0]
86
+
87
  f1 = []
88
  accuracy = []
89
  mcc = []
 
 
 
 
90
  report_list = []
91
+
92
+ train_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/' + dataset + '_trainindex.csv'))
93
  test_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/testindex_family.csv'))
94
+ train_index = train_index.dropna(axis=1)
95
  test_index = test_index.dropna(axis=1)
 
 
 
96
 
97
+ conf_matrices = []
98
 
99
  print('Producing protein family predictions...\n')
100
+ for i in tqdm(range(10)):
101
+ clf = linear_model.SGDClassifier(class_weight="balanced", loss="log", penalty="elasticnet", max_iter=1000, tol=1e-3, random_state=i, n_jobs=-1)
102
+ clf2 = OneVsRestClassifier(clf, n_jobs=-1)
103
+
104
  train_indexx = train_index.iloc[i].astype(int)
105
  test_indexx = test_index.iloc[i].astype(int)
 
 
 
 
 
 
 
 
106
 
107
+ for index in ne:
108
+ train_indexx = train_indexx[train_indexx != index]
109
+ test_indexx = test_indexx[test_indexx != index]
110
 
111
  train_X, test_X = x[train_indexx], x[test_indexx]
112
  train_y, test_y = y[train_indexx], y[test_indexx]
113
 
114
+ clf2.fit(train_X, train_y)
 
 
115
  y_pred = clf2.predict(test_X)
116
+
 
 
117
  f1_ = f1_score(test_y, y_pred, average='weighted')
118
  f1.append(f1_)
119
+
120
  ac = accuracy_score(test_y, y_pred)
121
  accuracy.append(ac)
122
+
123
  c_report = classification_report(test_y, y_pred, target_names=target_names, output_dict=True)
124
  c_matrix = confusion_matrix(test_y, y_pred, labels=labels)
125
+ conf_matrices.append(c_matrix)
126
 
127
+ class_report = class_based_scores(c_report, c_matrix)
128
+ mcc_score = matthews_corrcoef(test_y, y_pred)
129
+ mcc.append(mcc_score)
 
 
 
130
 
 
 
 
 
131
  report_list.append(class_report)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
+ f1_perclass = pd.concat([r['f1-score'] for r in report_list], axis=1)
134
+ ac_perclass = pd.concat([r['accuracy'] for r in report_list], axis=1)
135
+ mcc_perclass = pd.concat([r['mcc'] for r in report_list], axis=1)
136
+
137
+ results = {
138
+ "f1": f1,
139
+ "accuracy": accuracy,
140
+ "mcc": mcc,
141
+ "confusion_matrices": conf_matrices,
142
+ "class_reports": report_list,
143
+ "f1_per_class": f1_perclass,
144
+ "accuracy_per_class": ac_perclass,
145
+ "mcc_per_class": mcc_perclass
146
+ }
147
+
148
+ return results