File size: 3,468 Bytes
dd49f8a
 
 
4db2d24
 
 
 
dd49f8a
 
 
 
 
 
 
 
 
 
39ebd1e
a2e6203
50ca4fc
 
a2e6203
 
 
 
 
 
 
 
 
 
 
db965ce
50ca4fc
a2e6203
 
 
 
 
 
 
 
54b2fde
50ca4fc
a2e6203
 
 
 
 
 
50ca4fc
a2e6203
54b2fde
50ca4fc
a2e6203
 
 
 
 
75b4caf
50ca4fc
 
b86034a
a2e6203
b86034a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import yaml
import pandas as pd
import tqdm
from . import semantic_similarity_infer as ssi
from . import target_family_classifier as tfc
from . import function_predictor as fp
from . import binding_affinity_estimator as bae

def load_representation(multi_col_representation_vector_file_path):
    """Load a multi-column representation CSV into an Entry/Vector frame.

    The CSV is read with ``pandas.read_csv``. It must contain an ``Entry``
    column; every column after the first one is treated as one component
    of the representation vector and is converted to ``float``.

    Args:
        multi_col_representation_vector_file_path: Path (or any object
            accepted by ``pandas.read_csv``) of the representation file.

    Returns:
        A ``pandas.DataFrame`` with two columns: ``Entry`` (the values of
        the input ``Entry`` column) and ``Vector`` (one Python list of
        floats per input row).

    Raises:
        KeyError: If the file has no ``Entry`` column.
        ValueError: If a vector component cannot be converted to float.
    """
    source = pd.read_csv(multi_col_representation_vector_file_path)

    # Convert all vector columns in one pass and build the frame once.
    # Growing a DataFrame row by row with ``.loc`` re-allocates on every
    # insertion, which is quadratic for large representation files.
    vectors = source.iloc[:, 1:].to_numpy(dtype=float).tolist()

    return pd.DataFrame({
        'Entry': pd.Series(source['Entry'].tolist(), dtype='object'),
        'Vector': pd.Series(vectors, dtype='object'),
    })

def run_probe(benchmarks, representation_name, representation_file_human, representation_file_affinity, similarity_tasks=["Sparse","200","500"], function_prediction_aspect="All_Aspects", function_prediction_dataset="All_Data_Sets", family_prediction_dataset=["nc","uc50","uc30","mm15"], detailed_output=False):
    """Run the selected PROBE benchmarks and collect their results.

    Each benchmark module is configured by assigning module-level
    attributes before its entry point is called.

    Args:
        benchmarks: Collection of benchmark names to run. Recognised
            names are ``"similarity"``, ``"function"``, ``"family"`` and
            ``"affinity"``; ``"all"`` selects every benchmark.
        representation_name: Name of the representation, forwarded to
            every benchmark module.
        representation_file_human: Path of the representation file used
            by the similarity, function and family benchmarks.
        representation_file_affinity: Path of the representation file
            used by the affinity benchmark.
        similarity_tasks: Task names forwarded to the similarity
            benchmark.
        function_prediction_aspect: Aspect forwarded to the function
            prediction benchmark.
        function_prediction_dataset: Dataset selector forwarded to the
            function prediction benchmark.
        family_prediction_dataset: Dataset names; the family benchmark
            is scored once per name.
        detailed_output: Flag forwarded to the similarity, function and
            family benchmark modules.

    Returns:
        A dict with one key per benchmark that ran (``"similarity"``,
        ``"function"``, ``"family"``, ``"affinity"``). ``"family"`` maps
        to a dict keyed by dataset name. Empty when nothing was selected.
    """
    print("\n\nPROBE (Protein RepresentatiOn Benchmark) run is started...\n\n")
    result = {}

    # 'all' is a shorthand for every benchmark.
    run_all = "all" in benchmarks

    def is_selected(name):
        return run_all or name in benchmarks

    if any(item in ['similarity', 'function', 'family', 'all'] for item in benchmarks):
        print("\nRepresentation vectors are loading...\n")
        human_representation_dataframe = load_representation(representation_file_human)

    if is_selected("similarity"):
        print("\nSemantic similarity Inference Benchmark is running...\n")
        ssi.representation_dataframe = human_representation_dataframe
        ssi.representation_name = representation_name
        ssi.protein_names = ssi.representation_dataframe['Entry'].tolist()
        # Hand over a copy so the shared default list of this function can
        # never be mutated through the benchmark module.
        ssi.similarity_tasks = list(similarity_tasks)
        ssi.detailed_output = detailed_output
        result['similarity'] = ssi.calculate_all_correlations()

    if is_selected("function"):
        print("\n\nOntology-based protein function prediction benchmark is running...\n")
        fp.aspect_type = function_prediction_aspect
        fp.dataset_type = function_prediction_dataset
        fp.representation_dataframe = human_representation_dataframe
        fp.representation_name = representation_name
        fp.detailed_output = detailed_output
        result['function'] = fp.pred_output()

    if is_selected("family"):
        print("\n\nDrug target protein family classification benchmark is running...\n")
        tfc.representation_path = representation_file_human
        tfc.representation_name = representation_name
        tfc.detailed_output = detailed_output
        result['family'] = {}
        for dataset in family_prediction_dataset:
            # Key by the dataset name so every dataset keeps its own score
            # instead of overwriting a single literal 'dataset' entry.
            result['family'][dataset] = tfc.score_protein_rep(dataset)

    if is_selected("affinity"):
        print("\n\nProtein-protein binding affinity estimation benchmark is running...\n")
        bae.skempi_vectors_path = representation_file_affinity
        bae.representation_name = representation_name
        result['affinity'] = bae.predict_affinities_and_report_results()

    print("\n\nPROBE (Protein RepresentatiOn Benchmark) run is finished...\n")
    return result