PROBE

Sleeping

App Files Files Community

PROBE / src /bin /PROBE.py

mgyigit

Update src/bin/PROBE.py

4db2d24 verified 4 months ago

raw

history blame

5.32 kB

	import yaml
	import pandas as pd
	import tqdm
	from . import semantic_similarity_infer as ssi
	from . import target_family_classifier as tfc
	from . import function_predictor as fp
	from . import binding_affinity_estimator as bae

	# Announce start-up before any configuration or data is read.
	print("\n\nPROBE (Protein RepresentatiOn Benchmark) run is started...\n\n")

	# Run settings are read from ``probe_config.yaml``, resolved relative to the
	# current working directory.
	# NOTE(review): if this config can ever come from an untrusted source,
	# prefer ``yaml.safe_load`` over ``FullLoader`` -- confirm with the owners.
	with open('probe_config.yaml') as f:
	    args = yaml.load(f, Loader=yaml.FullLoader)

	# Reject unsupported benchmark names up front. The error is raised
	# explicitly because no argparse ``parser`` object is defined in this
	# module; calling ``parser.error`` here would only produce a NameError.
	if args["benchmark"] not in ["similarity","family","function","affinity","all"]:
	    raise ValueError('At least one benchmark type should be selected')

	# Echo the effective configuration so it is visible in the run log.
	print(args)

	def load_representation(multi_col_representation_vector_file_path):
	    """Load a multi-column representation CSV into an Entry/Vector frame.

	    The CSV must contain an ``Entry`` column. Every column from the second
	    one onwards is treated as one component of the representation vector.

	    Args:
	        multi_col_representation_vector_file_path: Location of the CSV file,
	            passed unchanged to ``pandas.read_csv``.

	    Returns:
	        A DataFrame with one row per CSV row and two columns: ``Entry``
	        (taken from the CSV) and ``Vector`` (a list of Python floats).

	    Raises:
	        KeyError: If the CSV has no ``Entry`` column.
	        ValueError: If a vector cell cannot be converted to ``float``.
	    """
	    frame = pd.read_csv(multi_col_representation_vector_file_path)
	    entries = frame['Entry'].tolist()
	    components = frame.iloc[:, 1:]

	    # Collect every vector first and build the result frame once. Growing a
	    # DataFrame row by row with ``.loc`` reallocates on each insertion, and
	    # pairing ``iterrows`` labels with positional ``iloc`` is only correct
	    # for a default RangeIndex.
	    rows = components.itertuples(index=False, name=None)
	    vectors = [
	        [float(item) for item in row]
	        for row in tqdm.tqdm(rows, total=len(components))
	    ]

	    # An explicit object Series keeps each list as a single cell value.
	    return pd.DataFrame({
	        'Entry': entries,
	        'Vector': pd.Series(vectors, dtype='object'),
	    })

	# ---- Benchmark dispatch, driven by the ``benchmark`` entry of ``args`` ----

	# Only the similarity and function benchmarks are handed the in-memory
	# frame; the family and affinity benchmarks are only given file paths.
	if args["benchmark"] in ("similarity", "function", "all"):
	    print("\nRepresentation vectors are loading...\n")
	    representation_dataframe = load_representation(args["representation_file_human"])

	# Each benchmark module is configured through module-level attributes
	# before its entry point is called.
	if args["benchmark"] in ("similarity", "all"):
	    print("\nSemantic similarity Inference Benchmark is running...\n")
	    ssi.representation_name = args["representation_name"]
	    ssi.similarity_tasks = args["similarity_tasks"]
	    ssi.detailed_output = args["detailed_output"]
	    ssi.representation_dataframe = representation_dataframe
	    ssi.protein_names = representation_dataframe['Entry'].tolist()
	    ssi.calculate_all_correlations()

	if args["benchmark"] in ("function", "all"):
	    print("\n\nOntology-based protein function prediction benchmark is running...\n")
	    fp.representation_name = args["representation_name"]
	    fp.representation_dataframe = representation_dataframe
	    fp.aspect_type = args["function_prediction_aspect"]
	    fp.dataset_type = args["function_prediction_dataset"]
	    fp.detailed_output = args["detailed_output"]
	    fp.pred_output()

	if args["benchmark"] in ("family", "all"):
	    print("\n\nDrug target protein family classification benchmark is running...\n")
	    tfc.representation_name = args["representation_name"]
	    tfc.representation_path = args["representation_file_human"]
	    tfc.detailed_output = args["detailed_output"]
	    # One scoring pass per configured family dataset.
	    for dataset in args["family_prediction_dataset"]:
	        tfc.score_protein_rep(dataset)

	if args["benchmark"] in ("affinity", "all"):
	    print("\n\nProtein-protein binding affinity estimation benchmark is running...\n")
	    bae.representation_name = args["representation_name"]
	    bae.skempi_vectors_path = args["representation_file_affinity"]
	    bae.predict_affinities_and_report_results()

	print("\n\nPROBE (Protein RepresentatiOn Benchmark) run is finished...\n")

	def run_probe(benchmarks, representation_name, representation_file_human, representation_file_affinity, similarity_tasks=["Sparse","200","500"], function_prediction_aspec="All_Aspects", function_prediction_dataset="All_Data_Sets", family_prediction_dataset=["nc","uc50","uc30","mm15"], detailed_output=False):
	    """Run the selected PROBE benchmarks for one representation.

	    Each benchmark module is configured through module-level attributes and
	    then started through its entry-point function.

	    Args:
	        benchmarks: Iterable of benchmark names, any of ``"similarity"``,
	            ``"function"``, ``"family"``, ``"affinity"``; ``"all"`` selects
	            all four. A single name given as a plain string is accepted.
	            Names outside this set are ignored.
	        representation_name: Name assigned to every benchmark module's
	            ``representation_name``.
	        representation_file_human: Path of the representation CSV; loaded
	            with ``load_representation`` for the similarity and function
	            benchmarks and passed as a path to the family benchmark.
	        representation_file_affinity: Path assigned to
	            ``bae.skempi_vectors_path``.
	        similarity_tasks: Values assigned to ``ssi.similarity_tasks``.
	        function_prediction_aspec: Value assigned to ``fp.aspect_type``.
	            The misspelled parameter name is kept so existing keyword
	            callers continue to work.
	        function_prediction_dataset: Value assigned to ``fp.dataset_type``.
	        family_prediction_dataset: Dataset names; ``tfc.score_protein_rep``
	            is called once per name.
	        detailed_output: Flag assigned to each module's ``detailed_output``.

	    Returns:
	        ``0`` once all selected benchmarks have been run.
	    """
	    print("\n\nPROBE (Protein RepresentatiOn Benchmark) run is started...\n\n")

	    # A bare string would otherwise be scanned character by character.
	    if isinstance(benchmarks, str):
	        benchmarks = [benchmarks]
	    selected = set(benchmarks)
	    # "all" must expand to every benchmark; testing only for the individual
	    # names would load the data and then run nothing.
	    if "all" in selected:
	        selected = {"similarity", "function", "family", "affinity"}

	    # Copy the list arguments so the shared default objects are never
	    # handed to (and possibly mutated by) the benchmark modules.
	    similarity_tasks = list(similarity_tasks)
	    family_prediction_dataset = list(family_prediction_dataset)

	    # Only the similarity and function benchmarks use the in-memory frame;
	    # the family benchmark is given the file path instead.
	    if selected & {"similarity", "function"}:
	        print("\nRepresentation vectors are loading...\n")
	        human_representation_dataframe = load_representation(representation_file_human)

	    if "similarity" in selected:
	        print("\nSemantic similarity Inference Benchmark is running...\n")
	        ssi.representation_dataframe = human_representation_dataframe
	        ssi.representation_name = representation_name
	        ssi.protein_names = ssi.representation_dataframe['Entry'].tolist()
	        ssi.similarity_tasks = similarity_tasks
	        ssi.detailed_output = detailed_output
	        ssi.calculate_all_correlations()

	    if "function" in selected:
	        print("\n\nOntology-based protein function prediction benchmark is running...\n")
	        # Read the parameter under its actual (misspelled) name; the
	        # correctly spelled name is not defined in this scope.
	        fp.aspect_type = function_prediction_aspec
	        fp.dataset_type = function_prediction_dataset
	        fp.representation_dataframe = human_representation_dataframe
	        fp.representation_name = representation_name
	        fp.detailed_output = detailed_output
	        fp.pred_output()

	    if "family" in selected:
	        print("\n\nDrug target protein family classification benchmark is running...\n")
	        tfc.representation_path = representation_file_human
	        tfc.representation_name = representation_name
	        tfc.detailed_output = detailed_output
	        for dataset in family_prediction_dataset:
	            tfc.score_protein_rep(dataset)

	    if "affinity" in selected:
	        print("\n\nProtein-protein binding affinity estimation benchmark is running...\n")
	        bae.skempi_vectors_path = representation_file_affinity
	        bae.representation_name = representation_name
	        bae.predict_affinities_and_report_results()

	    print("\n\nPROBE (Protein RepresentatiOn Benchmark) run is finished...\n")
	    return 0