File size: 5,340 Bytes
dd49f8a
 
 
 
 
 
37a12fb
acd43b4
 
dd49f8a
 
37a12fb
 
 
 
 
 
dd49f8a
 
 
 
 
 
 
 
 
 
 
37a12fb
dd49f8a
 
 
 
 
 
37a12fb
dd49f8a
 
 
 
 
 
 
 
 
 
 
 
 
 
37a12fb
 
dd49f8a
 
 
37a12fb
dd49f8a
 
7dcad68
dd49f8a
37a12fb
 
dd49f8a
 
 
 
 
 
 
 
 
37a12fb
dd49f8a
37a12fb
dd49f8a
 
 
 
37a12fb
dd49f8a
 
 
 
 
37a12fb
dd49f8a
 
37a12fb
dd49f8a
 
 
 
37a12fb
 
7dcad68
37a12fb
dd49f8a
 
37a12fb
dd49f8a
 
37a12fb
 
 
 
dd49f8a
 
 
37a12fb
 
 
dd49f8a
 
 
 
37a12fb
dd49f8a
37a12fb
dd49f8a
 
37a12fb
dd49f8a
 
37a12fb
dd49f8a
 
37a12fb
dd49f8a
37a12fb
 
 
dd49f8a
 
 
37a12fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
"""
Created on Mon Jun  8 09:32:26 2020

@author: Muammer
"""
import os
script_dir = os.path.dirname(os.path.abspath(__file__))

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import (
    f1_score, accuracy_score, confusion_matrix, classification_report, matthews_corrcoef
)
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
from tqdm import tqdm
import math

# Module-level settings. The code visible here never reassigns them, so
# presumably the calling code sets them before use -- TODO confirm.
representation_name = ""  # not read by any function visible in this file
representation_path = ""  # CSV path that score_protein_rep() loads with pd.read_csv
dataset = "nc"  # score_protein_rep() takes its own `dataset` argument, which shadows this name inside it
detailed_output = False  # not read by any function visible in this file

def convert_dataframe_to_multi_col(representation_dataframe):
    """Expand the 'Vector' column into one column per vector component.

    Parameters
    ----------
    representation_dataframe : pandas.DataFrame
        Frame with an 'Entry' column and a 'Vector' column whose cells are
        sequences (one sequence per row). Index labels are assumed unique.

    Returns
    -------
    pandas.DataFrame
        The 'Entry' column followed by integer-named columns 0, 1, ...
        holding the vector components, indexed like the input.
    """
    entry = pd.DataFrame(representation_dataframe['Entry'])
    # A frame built from a plain list would get a fresh 0..n-1 index; reuse
    # the input's index so the index-based merge below pairs every vector
    # with its own entry even when the input was filtered or re-indexed.
    vector = pd.DataFrame(
        list(representation_dataframe['Vector']),
        index=representation_dataframe.index,
    )
    multi_col_representation_vector = pd.merge(left=entry, right=vector, left_index=True, right_index=True)
    return multi_col_representation_vector

def class_based_scores(c_report, c_matrix):
    """Build a per-class score table (f1-score, support, accuracy, mcc).

    Parameters
    ----------
    c_report : dict
        Classification report in dict form: one entry per class mapping to
        'precision', 'recall', 'f1-score' and 'support', plus the summary
        entries 'accuracy', 'macro avg' and 'weighted avg' (which are dropped).
    c_matrix : numpy.ndarray
        Square confusion matrix whose class order matches the class entries
        of ``c_report``. Rows are assumed to be true classes and columns
        predicted classes (the scikit-learn layout) -- confirm for other
        producers.

    Returns
    -------
    pandas.DataFrame
        One row per class with columns 'f1-score', 'support', 'accuracy'
        (diagonal of the row-normalised confusion matrix) and 'mcc'
        (one-vs-rest Matthews correlation coefficient, stored as float32).
    """
    c_report = pd.DataFrame(c_report).transpose()
    c_report = c_report.drop(['precision', 'recall'], axis=1)
    c_report = c_report.drop(labels=['accuracy', 'macro avg', 'weighted avg'], axis=0)

    # Row-normalise so the diagonal is the fraction of each row's samples
    # that landed in the matching column.
    cm = c_matrix.astype('float') / c_matrix.sum(axis=1)[:, np.newaxis]
    c_report['accuracy'] = pd.Series(cm.diagonal(), index=c_report.index)

    total = c_report['support'].sum()
    num_classes = np.shape(c_matrix)[0]
    row_totals = c_matrix.sum(axis=1)
    col_totals = c_matrix.sum(axis=0)
    mcc = np.zeros(shape=(num_classes,), dtype='float32')

    for j in range(num_classes):
        # .item() yields plain Python numbers: the four-factor product below
        # easily exceeds the int64 range, and numpy scalars would wrap
        # silently instead of growing like Python ints do.
        tp = c_matrix[j, j].item()
        fn = row_totals[j].item() - tp  # rest of row j
        fp = col_totals[j].item() - tp  # rest of column j
        tn = int(total - tp - fp - fn)
        denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        if denominator == 0:
            # Degenerate class (e.g. never predicted): MCC is undefined and
            # conventionally reported as 0 rather than NaN.
            mcc[j] = 0.0
        else:
            mcc[j] = ((tp * tn) - (fp * fn)) / denominator

    c_report['mcc'] = pd.Series(mcc, index=c_report.index)

    return c_report

def _logistic_loss_name():
    """Return the SGDClassifier loss name for logistic regression on the installed scikit-learn."""
    import re

    import sklearn

    # The loss was spelled "log" up to scikit-learn 1.0; it was renamed to
    # "log_loss" in 1.1 and the old spelling stopped being accepted in 1.3.
    match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
    if match and (int(match.group(1)), int(match.group(2))) >= (1, 1):
        return "log_loss"
    return "log"


def _build_feature_matrix(protein_list, dataframe, vecsize):
    """Align representation vectors with the protein list, zero-filling proteins that have no vector."""
    features = dataframe.loc[:, dataframe.columns != 'Entry'].to_numpy()

    # Index the representation rows by entry once instead of scanning the
    # whole frame for every protein. Missing (NaN) entries are left out so
    # they can never match, just as an equality comparison would not.
    rows_by_entry = {}
    for position, entry in enumerate(dataframe['Entry']):
        if pd.notna(entry):
            rows_by_entry.setdefault(entry, []).append(position)

    # Pre-sized so rows are written in place; growing the array with
    # np.append copied the whole matrix on every iteration.
    x = np.zeros((len(protein_list), vecsize), dtype=np.float64)
    y = []
    ne = []

    records = zip(protein_list.index, protein_list['Entry'], protein_list['Class'])
    for position, (index, entry, label) in enumerate(tqdm(records, total=len(protein_list))):
        matches = rows_by_entry.get(entry, ())
        if len(matches) > 1:
            raise ValueError(
                "Entry %r occurs %d times in the representation file; expected at most one row"
                % (entry, len(matches))
            )
        if matches:
            x[position] = features[matches[0]]
            y.append(label)
        else:
            # No vector for this protein: keep the zero row as a placeholder
            # and remember its index so it is excluded from every fold.
            ne.append(index)
            y.append(0.0)

    return x, np.array(y).astype(np.float64), ne


def score_protein_rep(dataset):
    """Train and score protein-family classifiers over 10 predefined folds.

    Reads the protein list ``entry_class_nn.csv`` (columns 'Entry' and
    'Class'), the representation CSV at the module-level
    ``representation_path`` (an 'Entry' column plus one column per vector
    component) and the fold index files, where row ``i`` of each index file
    lists the sample positions of fold ``i``. For every fold a one-vs-rest
    SGD logistic classifier is fitted on the training positions and
    evaluated on the test positions. Proteins without a representation
    vector are removed from both.

    Parameters
    ----------
    dataset : str
        Prefix of the training index file ``<dataset>_trainindex.csv``.

    Returns
    -------
    dict
        'f1', 'accuracy', 'mcc': lists with one overall score per fold
        (weighted F1, accuracy, Matthews correlation coefficient);
        'confusion_matrices': list of per-fold confusion matrices;
        'class_reports': list of per-fold per-class tables from
        ``class_based_scores``;
        'f1_per_class', 'accuracy_per_class', 'mcc_per_class': DataFrames
        with one row per class and one column per fold.
    """
    protein_list = pd.read_csv(os.path.join(script_dir, '../data/preprocess/entry_class_nn.csv'))
    dataframe = pd.read_csv(representation_path)
    vecsize = dataframe.shape[1] - 1  # every column except 'Entry'

    print("\n\nPreprocessing data for drug-target protein family prediction...\n ")
    x, y, ne = _build_feature_matrix(protein_list, dataframe, vecsize)

    # Display names paired, in order, with the numeric class labels.
    target_names = ['Enzyme', 'Membrane receptor', 'Transcription factor', 'Ion channel', 'Other']
    labels = [1.0, 11.0, 12.0, 1005.0, 2000.0]

    f1 = []
    accuracy = []
    mcc = []
    report_list = []

    train_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/' + dataset + '_trainindex.csv'))
    test_index = pd.read_csv(os.path.join(script_dir, '../data/preprocess/indexes/testindex_family.csv'))
    # Drop every column containing a NaN in any row.
    train_index = train_index.dropna(axis=1)
    test_index = test_index.dropna(axis=1)

    conf_matrices = []
    loss_name = _logistic_loss_name()

    print('Producing protein family predictions...\n')
    for i in tqdm(range(10)):
        clf = linear_model.SGDClassifier(class_weight="balanced", loss=loss_name, penalty="elasticnet", max_iter=1000, tol=1e-3, random_state=i, n_jobs=-1)
        clf2 = OneVsRestClassifier(clf, n_jobs=-1)

        train_indexx = train_index.iloc[i].astype(int)
        test_indexx = test_index.iloc[i].astype(int)

        # Remove proteins that have no representation vector in one pass.
        train_indexx = train_indexx[~train_indexx.isin(ne)]
        test_indexx = test_indexx[~test_indexx.isin(ne)]

        train_X, test_X = x[train_indexx], x[test_indexx]
        train_y, test_y = y[train_indexx], y[test_indexx]

        clf2.fit(train_X, train_y)
        y_pred = clf2.predict(test_X)

        f1.append(f1_score(test_y, y_pred, average='weighted'))
        accuracy.append(accuracy_score(test_y, y_pred))

        # Pass the same explicit label list to both calls so the report rows
        # and the confusion matrix stay aligned with target_names even when
        # a fold happens to contain no sample of some class.
        c_report = classification_report(test_y, y_pred, labels=labels, target_names=target_names, output_dict=True)
        c_matrix = confusion_matrix(test_y, y_pred, labels=labels)
        conf_matrices.append(c_matrix)

        class_report = class_based_scores(c_report, c_matrix)
        mcc.append(matthews_corrcoef(test_y, y_pred))

        report_list.append(class_report)

    f1_perclass = pd.concat([r['f1-score'] for r in report_list], axis=1)
    ac_perclass = pd.concat([r['accuracy'] for r in report_list], axis=1)
    mcc_perclass = pd.concat([r['mcc'] for r in report_list], axis=1)

    results = {
        "f1": f1,
        "accuracy": accuracy,
        "mcc": mcc,
        "confusion_matrices": conf_matrices,
        "class_reports": report_list,
        "f1_per_class": f1_perclass,
        "accuracy_per_class": ac_perclass,
        "mcc_per_class": mcc_perclass
    }

    return results