diff --git "a/code/pdb_featureVector.py" "b/code/pdb_featureVector.py" --- "a/code/pdb_featureVector.py" +++ "b/code/pdb_featureVector.py" @@ -25,16 +25,13 @@ from Bio.PDB import PDBList from Bio import Align from Bio import SeqIO from Bio.PDB import * -import streamlit as st -from urllib.error import HTTPError -import Bio - warnings.filterwarnings("ignore") start = timer() # FUNCTIONS + # FUNCTIONS from calc_pc_property import * from add_domains import * @@ -52,6 +49,7 @@ from uniprotSequenceMatch import uniprotSequenceMatch from process_input import clean_data + def pdb(input_set, mode, impute): aligner = Align.PairwiseAligner() """ @@ -60,1166 +58,1046 @@ def pdb(input_set, mode, impute): Add datapoint identifier and remove non-standard input. """ data = clean_data(input_set) - path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files( - mode) + path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode) + out_path = path_to_output_files / 'log.txt' + sys.stdout = open(out_path, 'w') print('Creating directories...') annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand', - 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', - 'region', + 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region', 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide', 'transitPeptide', 'glycosylation', 'propeptide'] print('Feature vector generation started...\n') - cont = True - try: - if cont == False: - print('Feature vectore generation terminated.') - else: - """ - STEP 2 - Add physicochemical properties. - """ - print('Adding physicochemical properties...\n') - - data = add_physicochemical(data) - - """ - STEP 3 - Add domain-related information. - """ - print('Adding domains\n') - - data = add_domains(data, path_to_domains) - data = data.astype(str) - data = data.replace({'NaN': 'nan'}) - data.domain = data.domain.replace({'nan': '-1'}) - data.domStart = data.domStart.replace({'nan': '-1'}) - data.domEnd = data.domEnd.replace({'nan': '-1'}) - data.distance = data.distance.replace({'nan': '-1'}) - - """ - STEP 4 - Retrieve canonical and isoform UniProt sequences. - Add to the data frame. 
- """ - print('Retrieving UniProt sequences...\n') - - canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence']) - up_list = list(set(data['uniprotID'].to_list())) - for i in range(len(up_list)): - canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i]) - canonical_fasta.at[i, 'uniprotID'] = up_list[i] - canonical_fasta = canonical_fasta.drop_duplicates() - isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence']) - iso_dict = [] - for i in range(len(up_list)): - iso_dict.append(get_isoforms(up_list[i])) - - index = 0 - for i in iso_dict: - for key, val in i.items(): - isoform_fasta.at[index, 'uniprotID'] = key - isoform_fasta.at[index, 'isoformSequence'] = val - index += 1 - isoform_fasta = isoform_fasta.drop_duplicates() - for i in isoform_fasta.index: - isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip() - isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6] - print('Sequence files created...\n') - - data = data.merge(canonical_fasta, on='uniprotID', how='left') - data = data.astype(str) - data['whichIsoform'] = 'nan' - data.replace({'': 'nan'}, inplace=True) - data['wt_sequence_match'] = '' - for i in data.index: - if len(data.at[i, 'uniprotSequence']) >= int(data.at[i, 'pos']): - wt = data.at[i, 'wt'] - can = str(data.at[i, 'uniprotSequence'])[int(data.at[i, 'pos']) - 1] - if wt == can: - data.at[i, 'wt_sequence_match'] = 'm' - elif wt != can: - isoList = isoform_fasta[ - isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() - for k in isoList: - if len(k) >= int(data.at[i, 'pos']): - resInIso = k[int(int(data.at[i, 'pos']) - 1)] - if wt == resInIso: - whichIsoform = \ - isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0] - data.at[i, 'wt_sequence_match'] = 'i' - data.at[i, 'whichIsoform'] = whichIsoform - break - - elif len(data.at[i, 'uniprotSequence']) < int(data.at[i, 'pos']): - isoList = isoform_fasta[ - isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() + if len(data) == 0: + print('Feature vectore generation terminated.') + else: + """ + STEP 2 + Add physicochemical properties. + """ + print('Adding physicochemical properties...\n') + + data = add_physicochemical(data) + + """ + STEP 3 + Add domain-related information. + """ + print('Adding domains\n') + + data = add_domains(data, path_to_domains) + + data = data.astype(str) + data = data.replace({'NaN': 'nan'}) + data.domain = data.domain.replace({'nan': '-1'}) + data.domStart = data.domStart.replace({'nan': '-1'}) + data.domEnd = data.domEnd.replace({'nan': '-1'}) + data.distance = data.distance.replace({'nan': '-1'}) + + """ + STEP 4 + Retrieve canonical and isoform UniProt sequences. + Add to the data frame. 
+ """ + print('Retrieving UniProt sequences...\n') + + canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence']) + up_list = list(set(data['uniprotID'].to_list())) + for i in range(len(up_list)): + canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i]) + canonical_fasta.at[i, 'uniprotID'] = up_list[i] + + canonical_fasta = canonical_fasta.drop_duplicates() + isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence']) + iso_dict = [] + for i in range(len(up_list)): + iso_dict.append(get_isoforms(up_list[i])) + + index = 0 + for i in iso_dict: + for key, val in i.items(): + isoform_fasta.at[index, 'uniprotID'] = key + isoform_fasta.at[index, 'isoformSequence'] = val + index += 1 + isoform_fasta = isoform_fasta.drop_duplicates() + + for i in isoform_fasta.index: + isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip() + isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6] + print('Sequence files created...\n') + + data = data.merge(canonical_fasta, on='uniprotID', how='left') + data = data.astype(str) + data['whichIsoform'] = 'nan' + data.replace({'': 'nan'}, inplace=True) + data['wt_sequence_match'] = '' + for i in data.index: + if len(data.at[i, 'uniprotSequence']) >= int(data.at[i, 'pos']): + wt = data.at[i, 'wt'] + can = str(data.at[i, 'uniprotSequence'])[int(data.at[i, 'pos']) - 1] + if wt == can: + data.at[i, 'wt_sequence_match'] = 'm' + elif wt != can: + isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() for k in isoList: if len(k) >= int(data.at[i, 'pos']): resInIso = k[int(int(data.at[i, 'pos']) - 1)] - wt = data.at[i, 'wt'] if wt == resInIso: - whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[ - 0] + whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0] data.at[i, 'wt_sequence_match'] = 'i' data.at[i, 'whichIsoform'] = whichIsoform break - data.wt_sequence_match = data.wt_sequence_match.astype('str') - data.replace({'': 'nan'}, inplace=True) - data_size = len(data.drop_duplicates(['datapoint'])) - not_match_in_uniprot = data[(data.uniprotSequence == 'nan') | (data.wt_sequence_match == 'nan')] - uniprot_matched = data[(data.uniprotSequence != 'nan') & (data.wt_sequence_match != 'nan')] - data = None - - print( - 'You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n' - % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])), - len(uniprot_matched.drop_duplicates(['datapoint'])))) - - """ - STEP 5 - Retrieve related PDB sequences, extract their sequences. - Add to the data frame. 
- """ - - pdb_fasta = pd.DataFrame(columns=['pdbID', 'chain', 'pdbSequence']) - pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution']) - - print('Retrieving PDB structures...\n') - pdbs = [] - protein = uniprot_matched.uniprotID.to_list() - protein = list(set(protein)) - - - for prot in protein: - pdbs.append(get_pdb_ids(prot)) - st.write(pdbs) + elif len(data.at[i, 'uniprotSequence']) < int(data.at[i, 'pos']): + isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() + for k in isoList: + if len(k) >= int(data.at[i, 'pos']): + resInIso = k[int(int(data.at[i, 'pos']) - 1)] + wt = data.at[i, 'wt'] + if wt == resInIso: + whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0] + data.at[i, 'wt_sequence_match'] = 'i' + data.at[i, 'whichIsoform'] = whichIsoform + break + + data.wt_sequence_match = data.wt_sequence_match.astype('str') + data.replace({'': 'nan'}, inplace=True) + data_size = len(data.drop_duplicates(['datapoint'])) + not_match_in_uniprot = data[(data.uniprotSequence == 'nan') | (data.wt_sequence_match == 'nan')] + uniprot_matched = data[(data.uniprotSequence != 'nan') & (data.wt_sequence_match != 'nan')] + data = None + + print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n' + % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])), + len(uniprot_matched.drop_duplicates(['datapoint'])))) + + """ + STEP 5 + Retrieve related PDB sequences, extract their sequences. + Add to the data frame. + """ + from urllib.error import HTTPError + pdb_fasta = pd.DataFrame(columns=['pdbID', 'chain', 'pdbSequence']) + pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution']) + + print('Retrieving PDB structures...\n') + pdbs = [] + protein = uniprot_matched.uniprotID.to_list() + protein = list(set(protein)) + + for prot in protein: + pdbs.append(get_pdb_ids(prot)) + print('PDBs', pdbs) + if len(pdbs)>=1: + print('pdbs not empty') pdbs = [item for sublist in pdbs for item in sublist] - print('Processing PDB structures...\n') - if pdbs == []: - print('No PDB structure found for the query. ') - - print('Starting PDB structures download...\n') - pdbs = list(filter(None, pdbs)) - pdbs = (set(pdbs)) - pdbs = [i.lower() for i in pdbs] - pdbl = PDBList() - parser = PDBParser() - index = 0 + print('NEW', pdbs) + else: + print('pdbs empty') + pdbs =[] + print('Processing PDB structures...\n') + if pdbs == []: + print('No PDB structure found for the query. ') + """ + try: + pdbs = [j.strip('[').strip(']').strip().strip('\'').strip('\"') for j in + ((',').join([str(item) for item in pdbs])).split(',')] + except IndexError: + pdbs = [] + print('No PDB structure found for the query. 
')
+            """
+            print('Starting PDB structures download...\n')
+            pdbs = list(filter(None, pdbs))
+            pdbs = set(pdbs)
+            pdbs = [i.lower() for i in pdbs]
+            pdbl = PDBList()
+            parser = PDBParser()
+            index = 0
+
+            try:
+                shutil.rmtree('obsolete')
+            except OSError:
+                pass
+            pdb_structures_path = path_to_output_files / 'pdb_structures'
+            existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*"))
+            existing_pdb = [str(i) for i in existing_pdb]
+            existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
+            cnt = 0
+            for search in pdbs:
+                try:
+                    if search.lower() not in existing_pdb:
+                        file = pdbl.retrieve_pdb_file(search, pdir=Path(path_to_output_files / 'pdb_structures'), file_format="pdb")
+                    else:
+                        print('PDB structure file exists..')
+                    for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
+                        filename_replace_ext = filename.with_suffix(".pdb")
+                        filename.rename(filename_replace_ext)
+
+                    file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
+
+                    base = os.path.splitext(str(file))[0]
+                    base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
+                    os.rename(file, base + ".ent")
+                    file = base + '.ent'
+
+                    resolution_method = parser.get_structure(search, file)
+                    for record in SeqIO.parse(file, "pdb-seqres"):
+                        if record.dbxrefs[0].split(':')[0] == 'UNP':
+                            pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
+                            pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
+                            pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
+                            pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1]
+                            pdb_info.at[index, 'pdbID'] = record.id.split(':')[0]
+                            pdb_info.at[index, 'chain'] = record.annotations["chain"]
+                            pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
+                            index += 1
+                except Exception:
+                    # Download or parsing failed; record placeholders for this entry.
+                    pdb_info.at[index, 'uniprotID'] = 'nan'
+                    pdb_info.at[index, 'pdbID'] = 'nan'
+                    pdb_info.at[index, 'chain'] = 'nan'
+                    pdb_info.at[index, 'resolution'] = 'nan'
+                cnt += 1
+            print()
+            print('PDB file processing finished..')
+            for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
+                try:
+                    filename_replace_ext = filename.with_suffix(".pdb")
+                    filename.rename(filename_replace_ext)
+                except FileNotFoundError:
+                    pass
             for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
                 try:
-                    shutil.rmtree('obsolete')
-                except OSError as e:
-                    pass
+                    if filename.stem.startswith("pdb"):
+                        filename_replace_ext = filename.with_name(filename.stem[3:])
+                        filename.rename(filename_replace_ext.with_suffix('.pdb'))
+                except FileNotFoundError:
+                    pass
+
+            uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
+            uniprot_matched = uniprot_matched.astype(str)
+            uniprot_matched = uniprot_matched.drop_duplicates()
+
+            uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
+            uniprot_matched = uniprot_matched.astype(str)
+
+            with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
+                    (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
+                    uniprot_matched.resolution != 'None'))].drop_duplicates()
+            no_pdb = uniprot_matched[(uniprot_matched.pdbID == 'nan') | (
+                    (uniprot_matched.resolution == 'nan') | (uniprot_matched.resolution == 'OT') | (
+                    uniprot_matched.resolution == 'None'))]
+            no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
+            no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
+
+            print(
+                'PDB Information successfully added...\nPDB structures are found for %d of 
%d.\n%d of %d failed to match with PDB structure.\n' + % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])), + len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])))) + + with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') + with_pdb.replace({'': 'nan'}, inplace=True) + + if len(with_pdb) == 0: + with_pdb['pdbInfo'] = '' + else: + for i in with_pdb.index: + try: + res = str(with_pdb.at[i, 'resolution']) + chain = with_pdb.at[i, 'chain'] + new = with_pdb.at[i, 'pdbID'] + ':' + chain + ':' + res + with_pdb.at[i, 'pdbInfo'] = new + except: + TypeError + with_pdb.at[i, 'pdbInfo'] = 'nan' - existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*")) - existing_pdb = [str(i) for i in existing_pdb] - existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb] + with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence', + 'wt_sequence_match', + 'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']] - cnt = 0 - for search in pdbs: - st.write('pdb',pdb) - try: - if search.lower() not in existing_pdb: - - # Specify the URL of the PDB file you want to download - pdb_url = f"https://files.rcsb.org/download/{search}.pdb" - # Set the path within your Hugging Face space where you want to store the PDB files - pdb_folder_path = Path(path_to_output_files / 'pdb_structures') - st.write(pdb_folder_path) - # Extract the PDB filename from the URL - pdb_filename = pdb_url.split("/")[-1] - - # Set the path for the downloaded file - pdb_file_path = os.path.join(pdb_folder_path, pdb_filename) - - # Send a GET request to download the PDB file - response = requests.get(pdb_url) - if response.status_code == 200: - # Save the file to the specified path - with open(pdb_file_path, "wb") as file: - file.write(response.content) - print("PDB file downloaded successfully!") - else: - print("Failed to download the PDB file.") - - else: - print('PDB structure file exists..') - for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): - filename_replace_ext = filename.with_suffix(".pdb") - filename.rename(filename_replace_ext) - - file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb') - - base = os.path.splitext(str(file))[0] - base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1] - os.rename(file, base + ".ent") - file = base + '.ent' - - # Parse the PDB file - structure = parser.get_structure("structure", file) - # Get the resolution from the Structure object - resolution = structure.header["resolution"] - - for record in SeqIO.parse(file, "pdb-seqres"): - if record.dbxrefs[0].split(':')[0] == 'UNP': - pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0] - pdb_fasta.at[index, 'chain'] = record.id.split(':')[1] - pdb_fasta.at[index, 'pdbSequence'] = str(record.seq) - pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1] - pdb_info.at[index, 'pdbID'] = record.id.split(':')[0] - pdb_info.at[index, 'chain'] = record.annotations["chain"] - pdb_info.at[index, 'resolution'] = resolution - index += 1 - except: - IndexError - pdb_info.at[index, 'uniprotID'] = 'nan' - pdb_info.at[index, 'pdbID'] = 'nan' - pdb_info.at[index, 'chain'] = 'nan' - pdb_info.at[index, 
'resolution'] = 'nan' - index += 1 - cnt += 1 - st.write('pdb_info') - - st.write(pdb_info) - print('PDB file processing finished..') - for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): - try: - filename_replace_ext = filename.with_suffix(".pdb") - filename.rename(filename_replace_ext) - except: - FileNotFoundError - for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): - try: - if filename.stem.startswith("pdb"): - filename_replace_ext = filename.with_name(filename.stem[3:]) - filename.rename(filename_replace_ext.with_suffix('.pdb')) - except: - FileNotFoundError - - uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left') - uniprot_matched = uniprot_matched.astype(str) - uniprot_matched = uniprot_matched.drop_duplicates() - - uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left') - uniprot_matched = uniprot_matched.astype(str) - st.write('uniprot_matched') - st.write(uniprot_matched) - with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & ( - (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & ( - uniprot_matched.resolution != 'None'))].drop_duplicates() - no_pdb = uniprot_matched[(uniprot_matched.pdbID == 'nan') | ( - (uniprot_matched.resolution == 'nan') | (uniprot_matched.resolution == 'OT') | ( - uniprot_matched.resolution == 'None'))] - no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())] - no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True) - st.write('with_pdb') - st.write(with_pdb) - print( - 'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n' - % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])), - len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])))) - - with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) - with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') - with_pdb.replace({'': 'nan'}, inplace=True) - - if len(with_pdb) == 0: - with_pdb['pdbInfo'] = '' - else: - for i in with_pdb.index: - try: - res = str(with_pdb.at[i, 'resolution']) - chain = with_pdb.at[i, 'chain'] - new = with_pdb.at[i, 'pdbID'] + ':' + chain + ':' + res - with_pdb.at[i, 'pdbInfo'] = new - except: - TypeError - with_pdb.at[i, 'pdbInfo'] = 'nan' - - with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence', - 'wt_sequence_match', - 'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']] - - # If the query data points are found in no_match_in_uniprot data frame, it will not give any results. - # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps. - # If the query data points are found in with_pdb data frame, it will be searched in the following steps. - - """ - STEP 6 - Retrieve sequence annotations. - Add to the data frame. 
- """ - - if len(with_pdb) > 0: - with_pdb = add_annotations(with_pdb) - else: - new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', - 'dnaBinding', - 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', - 'crosslink', 'mutagenesis', 'strand', - 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', - 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', - 'coiledCoil', 'peptide', - 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfideBinary', - 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', - 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary'] - with_pdb = pd.DataFrame(columns=new_cols) - try: - with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str') - except: - AttributeError - with_pdb['whichIsoform'] = '' - - with_pdb = with_pdb.astype(str) - with_pdb = with_pdb.replace({'NaN': 'nan'}) - with_pdb.replace({'[]': 'nan'}, inplace=True) - with_pdb.replace({'nan-nan': 'nan'}, inplace=True) - with_pdb.replace({'': 'nan'}, inplace=True) - - """ - STEP 7 - Do alignment for PDB - """ - # Canonical matches, i.e. labelled as m, canonical sequences will be aligned with PDB sequences. - # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences. - with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C') - with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C') - - dfM = with_pdb[with_pdb.wt_sequence_match == 'm'] - dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) - dfM = dfM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') - - dfNM = with_pdb[with_pdb.wt_sequence_match == 'i'] - dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) - dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') - dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True) - - dfM = dfM.astype(str) - dfNM = dfNM.astype(str) - - dfM.reset_index(inplace=True) - dfM.drop(['index'], axis=1, inplace=True) - dfNM.reset_index(inplace=True) - dfNM.drop(['index'], axis=1, inplace=True) - - uniprot_matched_size = len(uniprot_matched.drop_duplicates(['datapoint'])) - uniprot_matched = None - pdb_fasta = None - pdb_info = None - pdbs = None - existing_pdb = None - with_pdb_size = len(with_pdb.drop_duplicates(['datapoint'])) - with_pdb = None + # If the query data points are found in no_match_in_uniprot data frame, it will not give any results. + # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps. + # If the query data points are found in with_pdb data frame, it will be searched in the following steps. - print('Aligning sequences...\n') + """ + STEP 6 + Retrieve sequence annotations. + Add to the data frame. 
+ """ - aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files')) - aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files')) - # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them. - for i in aligned_m.index: - if aligned_m.at[i, 'pdbSequence'] == 'nan': - aligned_m.at[i, 'mutationPositionOnPDB'] = 'nan' - aligned_m.at[i, 'domainStartonPDB'] = 'nan' - aligned_m.at[i, 'domainEndonPDB'] = 'nan' - aligned_m.at[i, 'pdb_alignStatus'] = 'nan' - - for i in aligned_nm.index: - if aligned_nm.at[i, 'pdbSequence'] == 'nan': - aligned_nm.at[i, 'mutationPositionOnPDB'] = 'nan' - aligned_nm.at[i, 'domainStartonPDB'] = 'nan' - aligned_nm.at[i, 'domainEndonPDB'] = 'nan' - aligned_nm.at[i, 'pdb_alignStatus'] = 'nan' - - # Check if they the same column name before merging. - aligned_m = aligned_m.astype(str) - aligned_nm = aligned_nm.astype(str) - - frames = [aligned_m, aligned_nm] - after_up_pdb_alignment = pd.concat(frames, sort=False) - if len(after_up_pdb_alignment) == 0: - after_up_pdb_alignment['pdb_alignStatus'] = '' - after_up_pdb_alignment['mutationPositionOnPDB'] = '' - after_up_pdb_alignment['domainStartonPDB'] = '' - after_up_pdb_alignment['domainEndonPDB'] = '' - - after_up_pdb_alignment = after_up_pdb_alignment.sort_values( - by=['uniprotID', 'wt', 'mut', 'pos', 'pdb_alignStatus', 'resolution', 'chain'], - ascending=[True, True, True, True, True, True, True]) - - after_up_pdb_alignment = after_up_pdb_alignment.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos'], - keep='first') - - after_up_pdb_alignment = after_up_pdb_alignment.astype('str') - - pdb_aligned = after_up_pdb_alignment[ - (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB != 'nan')] - yes_pdb_no_match = after_up_pdb_alignment[ - (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')] - no_pdb = no_pdb.copy() - - print('PDB matching is completed...\n') - print('SUMMARY') - print('-------') - print('%d data points that failed to match a UniProt Sequence are discarded.' % len( - not_match_in_uniprot.drop_duplicates(['datapoint']))) - print('Of the remaining %d:' % uniprot_matched_size) - print('--%d of %d successfully aligned with PDB structures.' % ( - len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--%d of %d not found on the covered area by the structure.' % ( - len(yes_pdb_no_match.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--PDB structures not found for %d datapoints.' % len(no_pdb.drop_duplicates(['datapoint']))) - print('--%d will be searched in Swiss-Model database.\n' % ( - len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint'])))) - - dfM = None - dfNM = None - aligned_nm = None - aligned_m = None - after_up_pdb_alignment = None - - print('Proceeding to SwissModel search...') - print('------------------------------------\n') - - # At this point we have 4 dataframes - # 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that wasnt found matching to after the alignment. Will be searched in other databases as well. - # 1a. aligned --- we are done with this. - # 1b. yes_pdb_no_match --- They have PDB structures but not matched, so will be searched in the other databases. - # 2. not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. - # 3. 
no_pdb --- No PDB structures were found for them. Will be searched in other databases. - - """ - Step 8 - Neutralize data points that are to be searched in Swiss-Model - # One point is that yes_pdb_no_match's annotations are the adjusted according to the PDBs they are matched before. - # They need to be converted to their old original UniProt annotation positions. - """ - yes_pdb_no_match.drop(['disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'caBinding', 'topologicalDomain', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary', - 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary', 'pdbSequence', 'pdbInfo', 'pdbID', - 'chain', 'resolution', 'pdb_alignStatus', 'mutationPositionOnPDB', - 'domainStartonPDB', 'domainEndonPDB'], axis=1, inplace=True) - - to_swiss = pd.concat( - [yes_pdb_no_match.drop_duplicates(['datapoint']), no_pdb.drop_duplicates(['datapoint'])]) - no_pdb = None - to_swiss.reset_index(inplace=True) - to_swiss.drop(['index'], axis=1, inplace=True) - to_swiss = to_swiss.astype('str') - to_swiss = to_swiss.replace({'NaN': 'nan'}) - # Create model summary dataframe. 
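The bookkeeping described in the comments above (pdb_aligned done, yes_pdb_no_match and no_pdb rolled into to_swiss, not_match_in_uniprot set aside) is meant to partition the input datapoints without overlap or loss. A minimal sanity-check sketch of that invariant; the helper name and the frame list are illustrative, not part of the pipeline:

    def check_datapoint_partition(all_datapoints, frames):
        # frames: DataFrames that together should partition the input datapoints
        seen = set()
        for frame in frames:
            points = set(frame['datapoint'])
            assert not seen & points, 'datapoint assigned to more than one frame'
            seen |= points
        assert seen == set(all_datapoints), 'some datapoints were silently dropped'

    # e.g. check_datapoint_partition(data['datapoint'],
    #                                [pdb_aligned, not_match_in_uniprot, to_swiss])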
- if len(to_swiss) != 0: - # import zipfile - # with zipfile.ZipFile(Path(path_to_input_files / 'swissmodel_structures.txt.zip'),"r") as zip_ref: - # zip_ref.extractall(Path(path_to_input_files)) - - print('Generating SwissModel file...\n') - - swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t', - dtype=str, header=None, skiprows=1, - names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', - 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', - 'qmean_norm', 'seqid', 'url']) + if len(with_pdb) > 0: + with_pdb = add_annotations(with_pdb) + else: + new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', + 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', + 'crosslink', 'mutagenesis', 'strand', + 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', + 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', + 'coiledCoil', 'peptide', + 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary', + 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary'] + with_pdb = pd.DataFrame(columns = new_cols) + try: + with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str') + except: + AttributeError + with_pdb['whichIsoform'] = '' + + with_pdb = with_pdb.astype(str) + with_pdb = with_pdb.replace({'NaN': 'nan'}) + with_pdb.replace({'[]': 'nan'}, inplace=True) + with_pdb.replace({'nan-nan': 'nan'}, inplace=True) + with_pdb.replace({'': 'nan'}, inplace=True) + + """ + STEP 7 + Do alignment for PDB + """ + # Canonical matches, i.e. labelled as m, canonical sequences will be aligned with PDB sequences. + # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences. 
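STEP 7 re-indexes each mutation from UniProt numbering onto the PDB SEQRES sequence via pairwise alignment (the pipeline's own version lives in final_stage, called below). A self-contained sketch of that re-indexing with Biopython's PairwiseAligner; map_position is a hypothetical helper, not a pipeline function:

    from Bio import Align

    def map_position(uniprot_seq, pdb_seq, pos):
        """Map the 1-based index pos on uniprot_seq onto pdb_seq, or None if gapped."""
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        best = aligner.align(uniprot_seq, pdb_seq)[0]
        # best.aligned pairs up matched coordinate blocks of the two sequences
        for (u_start, u_end), (p_start, p_end) in zip(*best.aligned):
            if u_start <= pos - 1 < u_end:
                return p_start + (pos - 1 - u_start) + 1
        return None  # the mutated residue is not covered by the structure

    print(map_position('MKTAYIAKQR', 'TAYIAKQR', 4))  # -> 2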
+ with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C') + with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C') + + dfM = with_pdb[with_pdb.wt_sequence_match == 'm'] + dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + dfM = dfM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') + + dfNM = with_pdb[with_pdb.wt_sequence_match == 'i'] + dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') + dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True) + + dfM = dfM.astype(str) + dfNM = dfNM.astype(str) + + dfM.reset_index(inplace=True) + dfM.drop(['index'], axis=1, inplace=True) + dfNM.reset_index(inplace=True) + dfNM.drop(['index'], axis=1, inplace=True) + + uniprot_matched_size = len(uniprot_matched.drop_duplicates(['datapoint'])) + uniprot_matched = None + pdb_fasta = None + pdb_info = None + pdbs = None + existing_pdb = None + with_pdb_size = len(with_pdb.drop_duplicates(['datapoint'])) + with_pdb = None + + print('Aligning sequences...\n') + aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files')) + aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files')) + + # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them. + for i in aligned_m.index: + if aligned_m.at[i, 'pdbSequence'] == 'nan': + aligned_m.at[i, 'mutationPositionOnPDB'] = 'nan' + aligned_m.at[i, 'domainStartonPDB'] = 'nan' + aligned_m.at[i, 'domainEndonPDB'] = 'nan' + aligned_m.at[i, 'pdb_alignStatus'] = 'nan' + + for i in aligned_nm.index: + if aligned_nm.at[i, 'pdbSequence'] == 'nan': + aligned_nm.at[i, 'mutationPositionOnPDB'] = 'nan' + aligned_nm.at[i, 'domainStartonPDB'] = 'nan' + aligned_nm.at[i, 'domainEndonPDB'] = 'nan' + aligned_nm.at[i, 'pdb_alignStatus'] = 'nan' + + # Check if they the same column name before merging. + aligned_m = aligned_m.astype(str) + aligned_nm = aligned_nm.astype(str) + + + frames = [aligned_m, aligned_nm] + after_up_pdb_alignment = pd.concat(frames, sort=False) + if len(after_up_pdb_alignment) == 0: + after_up_pdb_alignment['pdb_alignStatus'] = '' + after_up_pdb_alignment['mutationPositionOnPDB'] = '' + after_up_pdb_alignment['domainStartonPDB'] = '' + after_up_pdb_alignment['domainEndonPDB'] = '' + + after_up_pdb_alignment = after_up_pdb_alignment.sort_values( + by=['uniprotID', 'wt', 'mut', 'pos', 'pdb_alignStatus', 'resolution', 'chain'], + ascending=[True, True, True, True, True, True, True]) + + after_up_pdb_alignment = after_up_pdb_alignment.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos'], keep='first') + + after_up_pdb_alignment = after_up_pdb_alignment.astype('str') + + pdb_aligned = after_up_pdb_alignment[ + (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB != 'nan')] + yes_pdb_no_match = after_up_pdb_alignment[ + (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')] + no_pdb = no_pdb.copy() + + + print('PDB matching is completed...\n') + print('SUMMARY') + print('-------') + print('%d data points that failed to match a UniProt Sequence are discarded.' % len( + not_match_in_uniprot.drop_duplicates(['datapoint']))) + print('Of the remaining %d:' % uniprot_matched_size) + print('--%d of %d successfully aligned with PDB structures.' 
% ( + len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--%d of %d not found on the covered area by the structure.' % ( + len(yes_pdb_no_match.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--PDB structures not found for %d datapoints.' % len(no_pdb.drop_duplicates(['datapoint']))) + print('--%d will be searched in Swiss-Model database.\n' % ( + len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint'])))) + + + dfM = None + dfNM = None + aligned_nm = None + aligned_m = None + after_up_pdb_alignment = None + + print('Proceeding to SwissModel search...') + print('------------------------------------\n') + + # At this point we have 4 dataframes + # 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that wasnt found matching to after the alignment. Will be searched in other databases as well. + # 1a. aligned --- we are done with this. + # 1b. yes_pdb_no_match --- They have PDB structures but not matched, so will be searched in the other databases. + # 2. not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. + # 3. no_pdb --- No PDB structures were found for them. Will be searched in other databases. + + """ + Step 8 + Neutralize data points that are to be searched in Swiss-Model + # One point is that yes_pdb_no_match's annotations are the adjusted according to the PDBs they are matched before. + # They need to be converted to their old original UniProt annotation positions. + """ + yes_pdb_no_match.drop(['disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'caBinding', 'topologicalDomain', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary', + 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary', 'pdbSequence', 'pdbInfo', 'pdbID', + 'chain', 'resolution', 'pdb_alignStatus', 'mutationPositionOnPDB', + 'domainStartonPDB', 'domainEndonPDB'], axis=1, inplace=True) + + to_swiss = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), no_pdb.drop_duplicates(['datapoint'])]) + no_pdb = None + to_swiss.reset_index(inplace=True) + to_swiss.drop(['index'], axis=1, inplace=True) + to_swiss = to_swiss.astype('str') + to_swiss = to_swiss.replace({'NaN': 'nan'}) + # Create model summary dataframe. 
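The SWISS-MODEL index read just below is a plain tab-separated table with one row per model. A standalone sketch of the same read-and-rank step, assuming a local copy of swissmodel_structures.txt; the column names are taken from the read_csv call that follows, and qmean_norm stays a string here exactly as in the pipeline:

    import pandas as pd

    cols = ['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
            'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
            'qmean_norm', 'seqid', 'url']
    swiss = pd.read_csv('swissmodel_structures.txt', sep='\t', dtype=str,
                        header=None, skiprows=1, names=cols)
    swiss = swiss[swiss['provider'] == 'SWISSMODEL']
    # Rank so the best model (highest qmean_norm, then seqid) comes first per protein.
    swiss = swiss.sort_values(['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
    best_per_protein = swiss.drop_duplicates('UniProtKB_ac', keep='first')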
+ if len(to_swiss) != 0: + print('Generating SwissModel file...\n') + + swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t', + dtype=str, header=None, skiprows=1, + names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', + 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm','seqid', 'url']) + else: + swiss_model = pd.DataFrame( + columns=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', 'coordinate_id', + 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm', 'seqid', 'url', 'whichIsoform']) + swiss_model = swiss_model.astype('str') + try: + swiss_model.iso_id = swiss_model.iso_id.astype('str') + except: + AttributeError + swiss_model['iso_id'] = 'nan' + swiss_model = swiss_model[swiss_model.UniProtKB_ac != 'nan'] + for ind in swiss_model.index: + swiss_model.at[ind, 'UniProtKB_ac'] = swiss_model.at[ind, 'UniProtKB_ac'].split('-')[0] + if swiss_model.at[ind, 'iso_id'] != 'nan': + + swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1] else: - swiss_model = pd.DataFrame( - columns=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', 'coordinate_id', - 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm', 'seqid', 'url', - 'whichIsoform']) - swiss_model = swiss_model.astype('str') + swiss_model.at[ind, 'whichIsoform'] = 'nan' +# swiss_model.drop(['input'], axis=1, inplace=True) + swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL'] + print('Index File Processed...\n') + + + # Get relevant columns + swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']] + # Sort models on qmean score and identity. Some proteins have more than one models, we will pick one. + swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False) + swiss_model.reset_index(inplace=True) + swiss_model.drop(['index'], axis=1, inplace=True) + + # Get protein IDs for which there exist models. + swiss_model_ids = set(swiss_model.UniProtKB_ac.to_list()) + to_swiss = to_swiss.astype(str) + no_swiss_models = pd.DataFrame() + for i in to_swiss.index: + if to_swiss.at[i, 'uniprotID'] not in swiss_model_ids: + k = pd.Series(to_swiss.iloc[i]) + no_swiss_models = no_swiss_models.append(k, ignore_index=True) + + no_swiss_models = no_swiss_models.astype(str) + if len(no_swiss_models) == 0: + no_swiss_models = pd.DataFrame(columns=to_swiss.columns) + else: + no_swiss_models = no_swiss_models[to_swiss.columns] + no_swiss_models.reset_index(inplace=True) + no_swiss_models.drop('index', axis=1, inplace=True) + + with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False) + with_swiss_models = with_swiss_models[to_swiss.columns] + + # Add model info. 
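The no_swiss_models loop above relies on DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0. A vectorized equivalent of the same split, assuming the to_swiss and swiss_model_ids names from this step:

    # Same split without per-row appends (DataFrame.append no longer exists in pandas 2.x).
    mask = to_swiss['uniprotID'].isin(swiss_model_ids)
    no_swiss_models = to_swiss[~mask].copy()
    with_swiss_models = to_swiss[mask].copy()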
+ + with_swiss_models = with_swiss_models.astype(str) + swiss_model = swiss_model.astype(str) + swiss_models_with_data = pd.merge(with_swiss_models, swiss_model, left_on=['uniprotID', 'whichIsoform'], + right_on=['UniProtKB_ac', 'whichIsoform'], + how='left') + swiss_models_with_data = swiss_models_with_data.astype(str) + swiss_models_with_data = swiss_models_with_data.sort_values(by=['uniprotID', 'wt', 'mut', 'pos', 'qmean_norm'], + ascending=False) + swiss_models_with_data = swiss_models_with_data.drop_duplicates() + swiss_models_with_data = swiss_models_with_data.drop(['UniProtKB_ac', 'seqid'], axis=1) + swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') + swiss_models_with_data = swiss_models_with_data.astype(str) + + # Get the ones in the list but without model url and add to the list to go to modbase. + url_nan = swiss_models_with_data[swiss_models_with_data.url == 'nan'] + + # Add this nan's to no_model. These will be searched in MODBASE because here they dont have urls. + url_nan = url_nan.drop(['from', 'qmean_norm', 'template', 'to', 'url'], axis=1) + + no_swiss_models_2 = pd.concat([no_swiss_models, url_nan]) + swiss_models_with_data = swiss_models_with_data[swiss_models_with_data.url != 'nan'] + for i in swiss_models_with_data.index: try: - swiss_model.iso_id = swiss_model.iso_id.astype('str') + swiss_models_with_data.at[i, 'chain'] = swiss_models_with_data.at[i, 'template'].split('.')[2] + swiss_models_with_data.at[i, 'template'] = swiss_models_with_data.at[i, 'template'].split('.')[0] except: - AttributeError - swiss_model['iso_id'] = 'nan' - swiss_model = swiss_model[swiss_model.UniProtKB_ac != 'nan'] - for ind in swiss_model.index: - swiss_model.at[ind, 'UniProtKB_ac'] = swiss_model.at[ind, 'UniProtKB_ac'].split('-')[0] - if swiss_model.at[ind, 'iso_id'] != 'nan': - - swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1] - else: - swiss_model.at[ind, 'whichIsoform'] = 'nan' - # swiss_model.drop(['input'], axis=1, inplace=True) - swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL'] - print('Index File Processed...\n') - - # Get relevant columns - swiss_model = swiss_model[ - ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']] - # Sort models on qmean score and identity. Some proteins have more than one models, we will pick one. - swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False) - swiss_model.reset_index(inplace=True) - swiss_model.drop(['index'], axis=1, inplace=True) - - # Get protein IDs for which there exist models. 
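The try/except in the block above pulls the PDB template id and chain out of the SWISS-MODEL 'template' field; judging by the split indices, those values look like '1xyz.1.A', though that exact shape is an assumption here. A small helper sketch:

    def split_template(template):
        """Split a SWISS-MODEL template string such as '1xyz.1.A' into (pdb_id, chain)."""
        parts = template.split('.')
        pdb_id = parts[0]
        chain = parts[2] if len(parts) > 2 else ''  # some entries carry no chain field
        return pdb_id, chain

    print(split_template('1xyz.1.A'))  # -> ('1xyz', 'A')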
- swiss_model_ids = set(swiss_model.UniProtKB_ac.to_list()) - to_swiss = to_swiss.astype(str) - no_swiss_models = pd.DataFrame() - for i in to_swiss.index: - if to_swiss.at[i, 'uniprotID'] not in swiss_model_ids: - k = pd.Series(to_swiss.iloc[i]) - no_swiss_models = no_swiss_models.append(k, ignore_index=True) - - no_swiss_models = no_swiss_models.astype(str) - if len(no_swiss_models) == 0: - no_swiss_models = pd.DataFrame(columns=to_swiss.columns) + IndexError + if len(swiss_models_with_data) == 0: + swiss_models_with_data['chain'] = '' + swiss_models_with_data['template'] = '' + + swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('str') + swiss_models_with_data.chain = swiss_models_with_data.chain.astype('str') + swiss_models_with_data['qmean_norm'] = swiss_models_with_data.qmean_norm.apply(lambda x: round(float(x), 2)) + swiss_models_with_data = swiss_models_with_data.astype(str) + + # swiss_models_with_data: These data points will be aligned with their corresponding model sequences. + # Add sequences + + no_swiss_models_2.reset_index(inplace=True) + no_swiss_models_2.drop('index', axis=1, inplace=True) + + swiss_models_with_data.reset_index(inplace=True) + swiss_models_with_data.drop('index', axis=1, inplace=True) + + swiss_model_ids = None + with_swiss_models = None + swiss_model = None + no_swiss_models = None + url_nan = None + + # At this point we have: + # pdb_aligned --- Align in the PDB phase + # not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. + # to_swiss (no_pdb + yes_pdb_no_match) --- to be searched in SwissModel database + # to_swiss (with_swiss_models & no_swiss_models) + # swiss_models_with_data --- We found swiss models for them. + # no_swiss_models_2 (no_swiss_models + url_nan)--- to be searched in modbase (the ones having swissmodels but not matching with the boundaries & broken_swiss will be added here) + + """ + STEP 9 + Associated model IDs are added. + Download model files. + """ + print('Beginning SwissModel files download...') + existing_swiss = list(Path(path_to_output_files / 'swissmodel_structures').glob("*")) + existing_swiss = [str(i) for i in existing_swiss] + existing_swiss = ['.'.join(i.split('/')[-1].split('.')[:-1]) for i in existing_swiss] + swissmodels_fasta = pd.DataFrame() + + for i in swiss_models_with_data.index: + protein = swiss_models_with_data.at[i, 'uniprotID'] + template = swiss_models_with_data.at[i, 'template'].split('.')[0] + qmean_norm = str(round(float(swiss_models_with_data.at[i, 'qmean_norm']), 2)) + if protein + '_' + template + '_' + qmean_norm not in existing_swiss: + url = swiss_models_with_data.at[i, 'url'].strip('\"').strip('}').replace('\\', '').strip('\"').replace( + 'https', + 'https:') + req = requests.get(url) + name = Path(path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') + print('Downloading for Protein:', protein + ' Model: ' + template) + with open(name, 'wb') as f: + f.write(req.content) else: - no_swiss_models = no_swiss_models[to_swiss.columns] - no_swiss_models.reset_index(inplace=True) - no_swiss_models.drop('index', axis=1, inplace=True) - - with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False) - with_swiss_models = with_swiss_models[to_swiss.columns] - - # Add model info. 
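The merge this comment introduces keys on both the protein and the isoform matched in STEP 4, then keeps the best-scoring model per datapoint. A distilled, self-contained sketch with toy frames (the accession and all values are dummies):

    import pandas as pd

    variants = pd.DataFrame({'datapoint': ['P12345_A10G'], 'uniprotID': ['P12345'],
                             'whichIsoform': ['nan']})
    models = pd.DataFrame({'UniProtKB_ac': ['P12345'], 'whichIsoform': ['nan'],
                           'template': ['1xyz.1.A'], 'qmean_norm': ['0.78']})
    merged = variants.merge(models, left_on=['uniprotID', 'whichIsoform'],
                            right_on=['UniProtKB_ac', 'whichIsoform'], how='left')
    merged['qmean_norm'] = merged['qmean_norm'].astype(float)
    best = (merged.sort_values('qmean_norm', ascending=False)
                  .drop_duplicates('datapoint', keep='first'))
    print(best[['datapoint', 'template', 'qmean_norm']])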
- - with_swiss_models = with_swiss_models.astype(str) - swiss_model = swiss_model.astype(str) - swiss_models_with_data = pd.merge(with_swiss_models, swiss_model, left_on=['uniprotID', 'whichIsoform'], - right_on=['UniProtKB_ac', 'whichIsoform'], - how='left') - swiss_models_with_data = swiss_models_with_data.astype(str) - swiss_models_with_data = swiss_models_with_data.sort_values( - by=['uniprotID', 'wt', 'mut', 'pos', 'qmean_norm'], - ascending=False) - swiss_models_with_data = swiss_models_with_data.drop_duplicates() - swiss_models_with_data = swiss_models_with_data.drop(['UniProtKB_ac', 'seqid'], axis=1) - swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') - swiss_models_with_data = swiss_models_with_data.astype(str) - - # Get the ones in the list but without model url and add to the list to go to modbase. - url_nan = swiss_models_with_data[swiss_models_with_data.url == 'nan'] - - # Add this nan's to no_model. These will be searched in MODBASE because here they dont have urls. - url_nan = url_nan.drop(['from', 'qmean_norm', 'template', 'to', 'url'], axis=1) - - no_swiss_models_2 = pd.concat([no_swiss_models, url_nan]) - swiss_models_with_data = swiss_models_with_data[swiss_models_with_data.url != 'nan'] - for i in swiss_models_with_data.index: - try: - swiss_models_with_data.at[i, 'chain'] = swiss_models_with_data.at[i, 'template'].split('.')[2] - swiss_models_with_data.at[i, 'template'] = swiss_models_with_data.at[i, 'template'].split('.')[0] - except: - IndexError - if len(swiss_models_with_data) == 0: - swiss_models_with_data['chain'] = '' - swiss_models_with_data['template'] = '' - - swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('str') - swiss_models_with_data.chain = swiss_models_with_data.chain.astype('str') - swiss_models_with_data['qmean_norm'] = swiss_models_with_data.qmean_norm.apply(lambda x: round(float(x), 2)) - swiss_models_with_data = swiss_models_with_data.astype(str) - - # swiss_models_with_data: These data points will be aligned with their corresponding model sequences. - # Add sequences - - no_swiss_models_2.reset_index(inplace=True) - no_swiss_models_2.drop('index', axis=1, inplace=True) - - swiss_models_with_data.reset_index(inplace=True) - swiss_models_with_data.drop('index', axis=1, inplace=True) - - swiss_model_ids = None - with_swiss_models = None - swiss_model = None - no_swiss_models = None - url_nan = None - - # At this point we have: - # pdb_aligned --- Align in the PDB phase - # not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. - # to_swiss (no_pdb + yes_pdb_no_match) --- to be searched in SwissModel database - # to_swiss (with_swiss_models & no_swiss_models) - # swiss_models_with_data --- We found swiss models for them. - # no_swiss_models_2 (no_swiss_models + url_nan)--- to be searched in modbase (the ones having swissmodels but not matching with the boundaries & broken_swiss will be added here) - - """ - STEP 9 - Associated model IDs are added. - Download model files. 
- """ - print('Beginning SwissModel files download...') - existing_swiss = list(Path(path_to_output_files / 'swissmodel_structures').glob("*")) - existing_swiss = [str(i) for i in existing_swiss] - existing_swiss = ['.'.join(i.split('/')[-1].split('.')[:-1]) for i in existing_swiss] - swissmodels_fasta = pd.DataFrame() - - for i in swiss_models_with_data.index: - protein = swiss_models_with_data.at[i, 'uniprotID'] - template = swiss_models_with_data.at[i, 'template'].split('.')[0] - qmean_norm = str(round(float(swiss_models_with_data.at[i, 'qmean_norm']), 2)) - if protein + '_' + template + '_' + qmean_norm not in existing_swiss: - url = swiss_models_with_data.at[i, 'url'].strip('\"').strip('}').replace('\\', '').strip( - '\"').replace( - 'https', - 'https:') + print('Model exists.') + name = Path(path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') + with open(name, encoding="utf8") as f: + fasta = '' + lines = f.readlines() + chain = '' + for row in lines: + if row[0:4] == 'ATOM' and row[13:15] == 'CA': + chain = row[20:22].strip() + fasta += threeToOne(row[17:20]) + if row[0:3] == 'TER': + k = pd.Series([protein, template, qmean_norm, chain.upper(), fasta]) + swissmodels_fasta = swissmodels_fasta.append(k, ignore_index=True) + fasta = '' + + if len(swissmodels_fasta) == 0: + swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']) + else: + swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'] + + swissmodels_fasta = swissmodels_fasta.astype(str) + + swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float) + swissmodels_fasta.qmean_norm = swissmodels_fasta.qmean_norm.astype(float) + + swissmodels_fasta = swissmodels_fasta.sort_values(['uniprotID', 'template', 'qmean_norm', 'chain'], + axis=0) # example = 3gdh + swissmodels_fasta.reset_index(inplace=True) + swissmodels_fasta.drop(['index'], axis=1, inplace=True) + swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'qmean_norm', 'chain']) + swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'chain', 'fasta']) + swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'fasta']) + # Some files were broken, thus their PDBs couldnt be recorded. + swissmodels_fasta = swissmodels_fasta.drop_duplicates() + swissmodels_fasta = swissmodels_fasta.astype(str) + + swiss_models_with_data = swiss_models_with_data.astype(str) + swissmodels_fasta = swissmodels_fasta.astype(str) + swiss_models_with_data1 = swiss_models_with_data.merge(swissmodels_fasta, + on=['uniprotID', 'template', 'qmean_norm', 'chain']) + + swiss_models_with_data1 = swiss_models_with_data1.sort_values(['datapoint', 'fasta'], axis=0, + ascending=[True, False]) + swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template']) + + + swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list())) + swiss_models_with_data.reset_index(inplace=True) + swiss_models_with_data.drop(['index'], axis=1, inplace=True) + broken_swiss = pd.DataFrame() + c = 0 + for i in swiss_models_with_data.index: # en baştaki dfde var ama model gelende yok. 
+ if swiss_models_with_data.at[i, 'datapoint'] not in swiss_models_with_data1_dp: + k = pd.Series(swiss_models_with_data.iloc[i]) + broken_swiss = broken_swiss.append(k, ignore_index=True) + c += 1 + + if len(broken_swiss) == 0: + broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list()) + + swiss_models_with_data = swiss_models_with_data1.copy() + + + swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float') + swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'], + axis=0, ascending=[True, True, True, False]) + + # Delete the same model sequence with lower quality + swiss_models_with_data = swiss_models_with_data.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], + keep='first') + swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str') + swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') + len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(broken_swiss.drop_duplicates(['datapoint'])) + len( + no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint'])) + # This printed data here includes all possible models with different qualities, + # because we may get a hit in either of them. + swiss_models_with_data.rename({'fasta': 'pdbSequence'}, axis=1, inplace=True) # for convenience. + + # NOW DO ALIGNMENT HERE + + swiss_models_with_data = swiss_models_with_data.replace({'[\'?\']': 'nan'}) + swiss_models_with_data = swiss_models_with_data.replace({'[]': 'nan'}) + swiss_models_with_data.rename({'template': 'pdbID'}, axis=1, + inplace=True) # Only to be able use the alignment code above. + swiss_models_with_data = swiss_models_with_data.astype(str) + swiss_models_with_data.pdbSequence = swiss_models_with_data.pdbSequence.astype('str') + swiss_models_with_data = add_annotations(swiss_models_with_data) + swiss_models_with_data = swiss_models_with_data.astype(str) + swiss_models_with_data.replace({'NaN': 'nan'}, inplace=True) + swiss_models_with_data_copy = swiss_models_with_data.copy() + swiss_models_with_data1_dp = None + swiss_models_with_data1 = None + existing_swiss = None + swissmodels_fasta = None + + print('Aligning sequences...\n') + + swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C') + swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C') + swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, path_to_output_files / 'alignment_files') + swiss_models_with_data = None + + + if len(swiss_model_aligned) == 0: + swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns) + swiss_model_aligned['qmean_norm'] = 'nan' + else: + swiss_model_aligned = swiss_model_aligned.astype(str) + swiss_model_aligned.replace({'NaN': 'nan'}, inplace=True) + + # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. 
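The nan / not_nan handling just below leans on row order plus drop_duplicates: rows with a real alignment hit are concatenated first, so keep='first' prefers them whenever a datapoint appears in both groups. A toy demonstration of that trick:

    import pandas as pd

    df = pd.DataFrame({'datapoint': ['A', 'A', 'B'],
                       'mutationPositionOnPDB': ['17', 'nan', 'nan']})
    not_nan = df[df.mutationPositionOnPDB != 'nan']
    nan = df[df.mutationPositionOnPDB == 'nan']
    picked = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
    print(picked)  # A keeps position 17; B stays 'nan'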
+ nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB == 'nan'] + not_nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB != 'nan'] + not_nan.qmean_norm = not_nan.qmean_norm.astype('float') + not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'qmean_norm'], ascending=[True, True, False], inplace=True) + + which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') + swiss_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] + swiss_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] + + swiss_match.qmean_norm = swiss_match.qmean_norm.astype('float') + swiss_match.sort_values(['uniprotID', 'wt', 'pos', 'mut', 'pdb_alignStatus', 'qmean_norm'], + ascending=[True, True, True, True, True, False], inplace=True) + swiss_match.drop_duplicates(['uniprotID', 'wt', 'pos', 'mut'], keep='first', inplace=True) + swiss_not_match = swiss_not_match[no_swiss_models_2.columns] + broken_swiss = broken_swiss[no_swiss_models_2.columns] + swiss_not_match = swiss_not_match.drop_duplicates(['datapoint']) + broken_swiss = broken_swiss.drop_duplicates(['datapoint']) + + to_modbase = pd.concat([no_swiss_models_2, broken_swiss]).drop_duplicates() + to_modbase = pd.concat([to_modbase, swiss_not_match]).drop_duplicates() + to_modbase = to_modbase.astype(str) + to_swiss_columns = to_swiss.columns + to_swiss_size = len(to_swiss.drop_duplicates(['datapoint'])) + to_swiss = None + + # CONTROL + + """ + # This should be the whole data. + len(swiss_match.drop_duplicates(['datapoint'])) + len(aligned.drop_duplicates(['datapoint'])) + len(to_modbase.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) ,len(data) + len(aligned.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) +len(to_swiss.drop_duplicates(['datapoint']))== len(data) + """ + print('SwissModel matching is completed...\n') + print('SUMMARY') + print('-------') + print('%d data points that failed to match a UniProt Sequence are discarded.' % len( + not_match_in_uniprot.drop_duplicates(['datapoint']))) + print('Of the remaining %d:' % uniprot_matched_size) + print('--%d of %d successfully aligned with PDB structures.' % ( + len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--%d of %d successfully aligned with SwissModels structures.' % ( + len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) + print('--%d will be searched in ModBase database.\n' % len(to_modbase.drop_duplicates(['datapoint']))) + + print('Proceeding to ModBase search...') + print('------------------------------------\n') + no_swiss_models_2 = None + broken_swiss = None + swiss_model_aligned = None + nan = None + not_nan = None + which_ones_are_match = None + swiss_not_match = None + + # STEP : GO TO MODBASE + # Should not include anything related to prev models. 
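# --- editor's note ----------------------------------------------------------
# The ModBase branch below downloads one concatenated response per UniProt ID
# from salilab.org and splits it into individual model files by parsing the
# <pdbfile> elements with BeautifulSoup. A condensed sketch of that retrieval
# step; the slicing of pdb.contents mirrors the patch and is assumed, not
# verified against the current ModBase response format:
import requests
from bs4 import BeautifulSoup

def fetch_modbase_models(uniprot_id):
    url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + uniprot_id
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    models = {}
    for pdb in soup.findAll('pdbfile'):
        model_id = str(pdb.contents[1])[10:-11]            # strip the surrounding tags
        models[model_id] = str(pdb.contents[3])[10:-11].strip()
    return models
# --- end editor's note --------------------------------------------------------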
+ if len(to_modbase) != 0: + to_modbase = to_modbase.astype(str) + + # GET MODBASE MODELS + + # Get IDs from data to retrieve only their models from MODBASE + to_modbase.reset_index(inplace=True) + to_modbase.drop(['index'], axis=1, inplace=True) + + existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*")) + existing_modbase_models = [str(i) for i in existing_modbase_models] + existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models] + + existing_modbase_models_ind = list(Path(path_to_output_files / 'modbase_structures_individual').glob("*")) + existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind] + existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind] + + modbase_reduced = pd.DataFrame() + modbase_fasta = pd.DataFrame() + + print('Retrieving ModBase models...\n') + # Get model files associated with each UniProtID + for protein in list(set(to_modbase.uniprotID.to_list())): + if protein not in existing_modbase_models: + print('Downloading Modbase models for ', protein) + url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein + print(url) req = requests.get(url) - name = Path( - path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') - print('Downloading for Protein:', protein + ' Model: ' + template) + name = path_to_output_files / 'modbase_structures' / f'{protein}.txt' with open(name, 'wb') as f: f.write(req.content) else: - print('Model exists.') - name = Path( - path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') + print('Model exists for', protein) + name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt') with open(name, encoding="utf8") as f: - fasta = '' - lines = f.readlines() - chain = '' - for row in lines: - if row[0:4] == 'ATOM' and row[13:15] == 'CA': - chain = row[20:22].strip() - fasta += threeToOne(row[17:20]) - if row[0:3] == 'TER': - k = pd.Series([protein, template, qmean_norm, chain.upper(), fasta]) - swissmodels_fasta = swissmodels_fasta.append(k, ignore_index=True) + a = open(name, 'r').read() + soup = BeautifulSoup(a, 'lxml') + for pdb in soup.findAll('pdbfile'): + model_id = str(pdb.contents[1])[10:-11] + if model_id not in existing_modbase_models_ind: + with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', 'w', + encoding="utf8") as individual: + individual.write(str('UniProt ID: ' + protein)) + individual.write('\n') + individual.write(str(pdb.contents[3])[10:-11].strip()) + with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', + encoding="utf8") as f: fasta = '' + chain = '' + template_chain = '' + score = -999 + for ind_line in f.readlines(): + if ind_line[0:10] == 'UniProt ID': + uniprot_id = ind_line.split(':')[1].strip() + if ind_line[0:23] == 'REMARK 220 TARGET BEGIN': + target_begin = ind_line[40:43].strip() + if ind_line[0:21] == 'REMARK 220 TARGET END': + target_end = ind_line[40:43].strip() + if ind_line[0:25] == 'REMARK 220 TEMPLATE BEGIN': + pdb_begin = ind_line[40:43].strip() + if ind_line[0:23] == 'REMARK 220 TEMPLATE END': + pdb_end = ind_line[40:43].strip() + if ind_line[0:23] == 'REMARK 220 TEMPLATE PDB': + pdb_code = ind_line[40:43].strip() + if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': + pdb_chain = ind_line[40:43].strip() + if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': + quality_score = ind_line[40:].strip() + if 
ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
+                                        model_id = ind_line[40:].strip()
+                                    if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN':
+                                        template_chain = ind_line[40:42].strip()
+                                    if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA':
+                                        fasta += threeToOne(ind_line[17:20])
+                                    if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score':
+                                        try:
+                                            score = float(ind_line[40:].strip())
+                                        except ValueError:
+                                            score = -999
+                                    if ind_line[0:3] == 'TER' or ind_line[0:3] == 'END':
+                                        k = pd.Series([uniprot_id, model_id, str(score), template_chain, fasta])
+                                        modbase_fasta = modbase_fasta.append(k, ignore_index=True)
+                                        fasta = ''
+                                try:
+                                    k = pd.Series(
+                                        [uniprot_id, target_begin, target_end, pdb_code, pdb_chain, pdb_begin, pdb_end,
+                                         quality_score,
+                                         model_id])
+                                    modbase_reduced = modbase_reduced.append(k, ignore_index=True)
+                                except NameError:
+                                    print("This file doesn't have a Quality Score. Replacing with -999:", model_id)
+                                    quality_score = -999

-            if len(swissmodels_fasta) == 0:
-                swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
+            print()
+            if len(modbase_fasta) != 0:
+                modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
             else:
-                swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
-
-            swissmodels_fasta = swissmodels_fasta.astype(str)
-
-            swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
-            swissmodels_fasta.qmean_norm = swissmodels_fasta.qmean_norm.astype(float)
-
-            swissmodels_fasta = swissmodels_fasta.sort_values(['uniprotID', 'template', 'qmean_norm', 'chain'],
-                                                              axis=0)  # example = 3gdh
-            swissmodels_fasta.reset_index(inplace=True)
-            swissmodels_fasta.drop(['index'], axis=1, inplace=True)
-            swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'qmean_norm', 'chain'])
-            swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'chain', 'fasta'])
-            swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'fasta'])
-            # Some files were broken, so their PDB sequences couldn't be recorded.
-            swissmodels_fasta = swissmodels_fasta.drop_duplicates()
-            swissmodels_fasta = swissmodels_fasta.astype(str)
-
-            swiss_models_with_data = swiss_models_with_data.astype(str)
-            swissmodels_fasta = swissmodels_fasta.astype(str)
-            swiss_models_with_data1 = swiss_models_with_data.merge(swissmodels_fasta,
-                                                                   on=['uniprotID', 'template', 'qmean_norm', 'chain'])
-
-            swiss_models_with_data1 = swiss_models_with_data1.sort_values(['datapoint', 'fasta'], axis=0,
-                                                                          ascending=[True, False])
-            swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
-
-            swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
-            swiss_models_with_data.reset_index(inplace=True)
-            swiss_models_with_data.drop(['index'], axis=1, inplace=True)
-            broken_swiss = pd.DataFrame()
-            c = 0
-            for i in swiss_models_with_data.index:  # present in the initial dataframe but missing from the model results.
- if swiss_models_with_data.at[i, 'datapoint'] not in swiss_models_with_data1_dp: - k = pd.Series(swiss_models_with_data.iloc[i]) - broken_swiss = broken_swiss.append(k, ignore_index=True) - c += 1 - - if len(broken_swiss) == 0: - broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list()) - - swiss_models_with_data = swiss_models_with_data1.copy() - - swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float') - swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'], - axis=0, ascending=[True, True, True, False]) - - # Delete the same model sequence with lower quality - swiss_models_with_data = swiss_models_with_data.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], - keep='first') - swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str') - swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') - len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len( - broken_swiss.drop_duplicates(['datapoint'])) + len( - no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint'])) - # This printed data here includes all possible models with different qualities, - # because we may get a hit in either of them. - swiss_models_with_data.rename({'fasta': 'pdbSequence'}, axis=1, inplace=True) # for convenience. - - # NOW DO ALIGNMENT HERE - - swiss_models_with_data = swiss_models_with_data.replace({'[\'?\']': 'nan'}) - swiss_models_with_data = swiss_models_with_data.replace({'[]': 'nan'}) - swiss_models_with_data.rename({'template': 'pdbID'}, axis=1, - inplace=True) # Only to be able use the alignment code above. - swiss_models_with_data = swiss_models_with_data.astype(str) - swiss_models_with_data.pdbSequence = swiss_models_with_data.pdbSequence.astype('str') - swiss_models_with_data = add_annotations(swiss_models_with_data) - swiss_models_with_data = swiss_models_with_data.astype(str) - swiss_models_with_data.replace({'NaN': 'nan'}, inplace=True) - swiss_models_with_data_copy = swiss_models_with_data.copy() - swiss_models_with_data1_dp = None - swiss_models_with_data1 = None - existing_swiss = None - swissmodels_fasta = None - - print('Aligning sequences...\n') - - swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C') - swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C') - swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, - path_to_output_files / 'alignment_files') - swiss_models_with_data = None - - if len(swiss_model_aligned) == 0: - swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns) - swiss_model_aligned['qmean_norm'] = 'nan' + modbase_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'score', 'chain', 'fasta']) + modbase_fasta = modbase_fasta.astype(str) + modbase_fasta = modbase_fasta.replace({'': 'nan'}) + modbase_fasta = modbase_fasta.replace({'NaN': 'nan'}) + modbase_fasta = modbase_fasta[modbase_fasta.fasta != 'nan'] + + print('Modbase model frame constructed.\n') + if len(modbase_reduced) != 0: + modbase_reduced.columns = ['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', + 'PDBEnd', + 'ModPipeQualityScore', 'ModelID'] else: - swiss_model_aligned = swiss_model_aligned.astype(str) - swiss_model_aligned.replace({'NaN': 'nan'}, inplace=True) + modbase_reduced = pd.DataFrame( + columns=['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 
'PDBChain', 'PDBBegin', 'PDBEnd', + 'ModPipeQualityScore', 'ModelID']) - # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. - nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB == 'nan'] - not_nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB != 'nan'] - not_nan.qmean_norm = not_nan.qmean_norm.astype('float') - not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'qmean_norm'], ascending=[True, True, False], - inplace=True) + to_modbase = add_annotations(to_modbase) - which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') - swiss_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] - swiss_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - - swiss_match.qmean_norm = swiss_match.qmean_norm.astype('float') - swiss_match.sort_values(['uniprotID', 'wt', 'pos', 'mut', 'pdb_alignStatus', 'qmean_norm'], - ascending=[True, True, True, True, True, False], inplace=True) - swiss_match.drop_duplicates(['uniprotID', 'wt', 'pos', 'mut'], keep='first', inplace=True) - swiss_not_match = swiss_not_match[no_swiss_models_2.columns] - broken_swiss = broken_swiss[no_swiss_models_2.columns] - swiss_not_match = swiss_not_match.drop_duplicates(['datapoint']) - broken_swiss = broken_swiss.drop_duplicates(['datapoint']) - - to_modbase = pd.concat([no_swiss_models_2, broken_swiss]).drop_duplicates() - to_modbase = pd.concat([to_modbase, swiss_not_match]).drop_duplicates() to_modbase = to_modbase.astype(str) - to_swiss_columns = to_swiss.columns - to_swiss_size = len(to_swiss.drop_duplicates(['datapoint'])) - to_swiss = None - - # CONTROL - - """ - # This should be the whole data. - len(swiss_match.drop_duplicates(['datapoint'])) + len(aligned.drop_duplicates(['datapoint'])) + len(to_modbase.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) ,len(data) - len(aligned.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) +len(to_swiss.drop_duplicates(['datapoint']))== len(data) - """ - print('SwissModel matching is completed...\n') - print('SUMMARY') - print('-------') - print('%d data points that failed to match a UniProt Sequence are discarded.' % len( - not_match_in_uniprot.drop_duplicates(['datapoint']))) - print('Of the remaining %d:' % uniprot_matched_size) - print('--%d of %d successfully aligned with PDB structures.' % ( - len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--%d of %d successfully aligned with SwissModels structures.' % ( - len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) - print('--%d will be searched in ModBase database.\n' % len(to_modbase.drop_duplicates(['datapoint']))) - - print('Proceeding to ModBase search...') - print('------------------------------------\n') - no_swiss_models_2 = None - broken_swiss = None - swiss_model_aligned = None - nan = None - not_nan = None - which_ones_are_match = None - swiss_not_match = None - - # STEP : GO TO MODBASE - # Should not include anything related to prev models. 
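# --- editor's note ----------------------------------------------------------
# The per-model metadata gathered in the new parser above comes from ModBase's
# PDB-style REMARK 220 records, read at fixed column offsets. A standalone
# sketch of the same idea, with the numeric parse done where ValueError can
# actually occur (the patch's original try/except wrapped a plain .strip(),
# which never raises it):
def parse_remark_220(lines):
    fields = {'score': -999.0}                  # sentinel used throughout the patch
    for line in lines:
        if line.startswith('REMARK 220 TEMPLATE PDB'):
            fields['template_pdb'] = line[40:43].strip()
        elif line.startswith('REMARK 220 TEMPLATE CHAIN'):
            fields['template_chain'] = line[40:42].strip()
        elif line.startswith('REMARK 220 ModPipe Quality Score'):
            try:
                fields['score'] = float(line[40:].strip())
            except ValueError:
                pass                            # keep the -999 sentinel
    return fields
# --- end editor's note --------------------------------------------------------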
- if len(to_modbase) != 0: - to_modbase = to_modbase.astype(str) - - # GET MODBASE MODELS - - # Get IDs from data to retrieve only their models from MODBASE - to_modbase.reset_index(inplace=True) - to_modbase.drop(['index'], axis=1, inplace=True) - - existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*")) - existing_modbase_models = [str(i) for i in existing_modbase_models] - existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models] - - existing_modbase_models_ind = list( - Path(path_to_output_files / 'modbase_structures_individual').glob("*")) - existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind] - existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind] - - modbase_reduced = pd.DataFrame() - modbase_fasta = pd.DataFrame() - - print('Retrieving ModBase models...\n') - # Get model files associated with each UniProtID - for protein in list(set(to_modbase.uniprotID.to_list())): - if protein not in existing_modbase_models: - print('Downloading Modbase models for ', protein) - url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein - print(url) - req = requests.get(url) - name = path_to_output_files / 'modbase_structures' / f'{protein}.txt' - with open(name, 'wb') as f: - f.write(req.content) - else: - print('Model exists for', protein) - name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt') - with open(name, encoding="utf8") as f: - a = open(name, 'r').read() - soup = BeautifulSoup(a, 'lxml') - for pdb in soup.findAll('pdbfile'): - model_id = str(pdb.contents[1])[10:-11] - if model_id not in existing_modbase_models_ind: - with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', - 'w', - encoding="utf8") as individual: - individual.write(str('UniProt ID: ' + protein)) - individual.write('\n') - individual.write(str(pdb.contents[3])[10:-11].strip()) - with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', - encoding="utf8") as f: - fasta = '' - chain = '' - template_chain = '' - score = -999 - for ind_line in f.readlines(): - if ind_line[0:10] == 'UniProt ID': - uniprot_id = ind_line.split(':')[1].strip() - if ind_line[0:23] == 'REMARK 220 TARGET BEGIN': - target_begin = ind_line[40:43].strip() - if ind_line[0:21] == 'REMARK 220 TARGET END': - target_end = ind_line[40:43].strip() - if ind_line[0:25] == 'REMARK 220 TEMPLATE BEGIN': - pdb_begin = ind_line[40:43].strip() - if ind_line[0:23] == 'REMARK 220 TEMPLATE END': - pdb_end = ind_line[40:43].strip() - if ind_line[0:23] == 'REMARK 220 TEMPLATE PDB': - pdb_code = ind_line[40:43].strip() - if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': - pdb_chain = ind_line[40:43].strip() - if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': - quality_score = ind_line[40:].strip() - if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID': - model_id = ind_line[40:].strip() - if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': - template_chain = ind_line[40:42].strip() - if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA': - fasta += threeToOne(ind_line[17:20]) - if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': - try: - score = ind_line[40:].strip() - except (ValueError): - score = -999 - if ind_line[0:3] == 'TER' or ind_line[0:3] == 'END': - k = pd.Series([uniprot_id, model_id, str(score), template_chain, fasta]) - modbase_fasta = modbase_fasta.append(k, ignore_index=True) - fasta = '' - try: - k = pd.Series( - 
[uniprot_id, target_begin, target_end, pdb_code, pdb_chain, pdb_begin, pdb_end, - quality_score, - model_id]) - modbase_reduced = modbase_reduced.append(k, ignore_index=True) - except: - NameError - print('This file doesnt have Quality Score. Replacer: -999', model_id) - quality_score = -999 - - print() - if len(modbase_fasta) != 0: - modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta'] - else: - modbase_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'score', 'chain', 'fasta']) - modbase_fasta = modbase_fasta.astype(str) - modbase_fasta = modbase_fasta.replace({'': 'nan'}) - modbase_fasta = modbase_fasta.replace({'NaN': 'nan'}) - modbase_fasta = modbase_fasta[modbase_fasta.fasta != 'nan'] - - print('Modbase model frame constructed.\n') - if len(modbase_reduced) != 0: - modbase_reduced.columns = ['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', - 'PDBEnd', - 'ModPipeQualityScore', 'ModelID'] - else: - modbase_reduced = pd.DataFrame( - columns=['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', 'PDBEnd', - 'ModPipeQualityScore', 'ModelID']) - - to_modbase = add_annotations(to_modbase) - - to_modbase = to_modbase.astype(str) - to_modbase.fillna('nan', inplace=True) - to_modbase = to_modbase.replace({'NaN': 'nan'}) - to_modbase.replace({'[]': 'nan'}, inplace=True) - to_modbase.replace({'nan-nan': 'nan'}, inplace=True) - to_modbase.replace({'': 'nan'}, inplace=True) - model_info_added = to_modbase.merge(modbase_reduced, right_on='UniprotID', left_on='uniprotID', - how='left') - modbase_reduced = None - existing_modbase_models = None - existing_modbase_models_ind = None - - model_info_added = model_info_added.drop(['UniprotID'], axis=1) - model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to', - 'PDBCode': 'template', 'PDBChain': 'chain', - 'ModPipeQualityScore': 'score', - 'ModelID': 'pdbID'}) - model_info_added.drop(['PDBEnd', 'PDBBegin'], axis=1, inplace=True) - model_info_added.score = model_info_added.score.astype(float) - model_info_added = model_info_added.sort_values(by=['datapoint', 'score'], - ascending=False) - model_info_added.reset_index(inplace=True) - model_info_added.drop(['index'], axis=1, inplace=True) - model_info_added = model_info_added.drop_duplicates() - - model_info_added = model_info_added.astype(str) - model_info_added = model_info_added.replace({'NaN': 'nan'}) - no_info = model_info_added[model_info_added.pdbID == 'nan'] - with_modbase_info = model_info_added[model_info_added.pdbID != 'nan'] - model_info_added = None - - len(no_info.drop_duplicates(['datapoint'])), len(with_modbase_info.drop_duplicates(['datapoint'])) - len(no_info.drop_duplicates(['datapoint'])) + len( - with_modbase_info.drop_duplicates(['datapoint'])) == len( - to_modbase.drop_duplicates(['datapoint'])) - - # Add no_info to the rest down below! - no_info = no_info[to_swiss_columns] - - with_modbase_info.score = with_modbase_info.score.astype(float) - modbase_fasta.score = modbase_fasta.score.astype(float) - - modbase_fasta = modbase_fasta.sort_values(['uniprotID', 'score', 'template', 'chain'], - ascending=[True, False, True, True], axis=0) # example = 3gdh - - # I added this newly downloaded ones to the main model file. 
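# --- editor's note ----------------------------------------------------------
# Both the removed and the added code grow modbase_fasta / modbase_reduced row
# by row with DataFrame.append, which was deprecated in pandas 1.4 and removed
# in 2.0 (each call also copies the whole frame). Collecting rows in a list
# and concatenating once is the supported, linear-time pattern:
import pandas as pd

rows = []
# inside the parsing loop, instead of modbase_fasta.append(...):
#     rows.append({'uniprotID': uniprot_id, 'template': model_id,
#                  'score': score, 'chain': template_chain, 'fasta': fasta})
modbase_fasta = pd.DataFrame(rows, columns=['uniprotID', 'template', 'score', 'chain', 'fasta'])
# --- end editor's note --------------------------------------------------------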
- - modbase_fasta = modbase_fasta.rename(columns={'template': 'pdbID'}) - with_modbase_info.pos = with_modbase_info.pos.astype('int') - with_modbase_info.score = with_modbase_info.score.astype(float) - with_modbase_info.score = with_modbase_info.score.apply(lambda x: round(x, 2)) - modbase_fasta.score = modbase_fasta.score.astype(float) - modbase_fasta.score = modbase_fasta.score.apply(lambda x: round(x, 2)) - - with_modbase_info = with_modbase_info.merge(modbase_fasta, on='pdbID', how='left') - - with_modbase_info.drop(['score_y'], axis=1, inplace=True) - with_modbase_info.rename(columns={'score_x': 'score'}, inplace=True) - with_modbase_info.drop(['uniprotID_y', 'chain_y'], axis=1, inplace=True) - with_modbase_info.rename(columns={'uniprotID_x': 'uniprotID', 'chain_x': 'chain'}, inplace=True) - - with_modbase_info.score = with_modbase_info.score.astype('float') - with_modbase_info = with_modbase_info.sort_values( - ['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'], - axis=0, - ascending=[True, True, True, True, False, True, False]) - with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], - keep='first') - - with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'}) - with_modbase_info = with_modbase_info.replace({'[]': 'nan'}) - with_modbase_info = with_modbase_info.replace({'\'?\', ': ''}) - with_modbase_info = with_modbase_info.replace({', \'?\'': ''}) - with_modbase_info = with_modbase_info.replace({'(': ''}) - with_modbase_info = with_modbase_info.replace( - {')': ''}) - with_modbase_info = with_modbase_info.astype(str) - with_modbase_info.fasta = with_modbase_info.fasta.astype('str') - with_modbase_info.reset_index(inplace=True) - with_modbase_info.drop('index', axis=1, inplace=True) - - align = with_modbase_info[ - with_modbase_info.fasta != 'nan'] - yes_pdb_no_match = with_modbase_info[ - with_modbase_info.fasta == 'nan'] - yes_pdb_no_match = yes_pdb_no_match[~yes_pdb_no_match.datapoint.isin(align.datapoint.to_list())] - - align.rename(columns={'fasta': 'pdbSequence'}, inplace=True) - align['uniprotSequence'] = align['uniprotSequence'].str.replace('U', 'C') - align['pdbSequence'] = align['pdbSequence'].str.replace('U', 'C') - - to_modbase_size = len(to_modbase.drop_duplicates(['datapoint'])) - modbase_fasta = None - to_modbase = None - print('Aligning sequences...\n') - modbase_aligned = alignment(align, annotation_list, path_to_output_files / 'alignment_files') - modbase_aligned = modbase_aligned.astype(str) - modbase_aligned = modbase_aligned.replace({'NaN': 'nan'}) - - # Get the ones whose models couldn't be found. Add to no_modbase (yani hiçbir şey de eşleşmemiş artık.) 
- if len(with_modbase_info) != 0: - not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']), - with_modbase_info.drop_duplicates(['datapoint'])]).drop_duplicates( - ['datapoint'], - keep=False) - else: - not_in_aligned = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', - 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', - 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', - 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfide', - 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', - 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', - 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', - 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', - 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) - with_modbase_info = None - if len(not_in_aligned) != 0: - not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), - not_in_aligned.drop_duplicates(['datapoint'])]).drop_duplicates( - ['datapoint'], - keep='first') - # Retain the best model among the aligned ones. - else: - not_models = pd.DataFrame(columns=not_in_aligned.columns) - - yes_pdb_no_match = None - # # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. 
- modbase_aligned = modbase_aligned.astype(str) - if len(modbase_aligned) != 0: - nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan'] - not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan'] - not_nan.score = not_nan.score.astype(float) - not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], - inplace=True) - - not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], - ascending=[True, True, False]) - not_nan = not_nan.drop_duplicates(['datapoint'], keep='first') - else: - nan = pd.DataFrame(columns=modbase_aligned.columns) - not_nan = pd.DataFrame(columns=modbase_aligned.columns) - modbase_aligned = None - which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') - if len(which_ones_are_match) == 0: - which_ones_are_match = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', - 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', - 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) - modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] - modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - - else: - modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] - modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - - which_ones_are_match = None - modbase_match.score = modbase_match.score.astype('float') - modbase_match = modbase_match.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], - ascending=[True, True, False]) - modbase_match.drop_duplicates(['datapoint'], keep='first', inplace=True) - not_nan = None - nan = None - - # merge not_in_align and modbase_not_match as they were both excluded from modbase match. 
- - # No model - no_info = no_info[to_swiss_columns] - no_info = no_info.drop_duplicates() - - # Model present, no sequence - not_models = not_models[to_swiss_columns] - not_models = not_models.drop_duplicates() - - # Modbase model and sequence present, no match in PDB - modbase_not_match = modbase_not_match[to_swiss_columns] - modbase_not_match = modbase_not_match.drop_duplicates() - if len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) != 0: - rest = pd.concat([not_in_aligned, modbase_not_match, no_info]) - elif len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) == 0: - rest = pd.concat([not_in_aligned, modbase_not_match]) - elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) != 0: - rest = pd.concat([modbase_not_match, no_info]) - elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) != 0: - rest = pd.concat([not_in_aligned, no_info]) - elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) == 0: - rest = not_in_aligned - elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) == 0: - rest = modbase_not_match - elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0: - rest = no_info - else: - rest = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint']) - - rest = rest[to_swiss_columns] - rest = rest.drop_duplicates() - - rest.reset_index(inplace=True) - rest.drop(['index'], axis=1, inplace=True) - rest = rest.astype('str') + to_modbase.fillna('nan', inplace=True) + to_modbase = to_modbase.replace({'NaN': 'nan'}) + to_modbase.replace({'[]': 'nan'}, inplace=True) + to_modbase.replace({'nan-nan': 'nan'}, inplace=True) + to_modbase.replace({'': 'nan'}, inplace=True) + model_info_added = to_modbase.merge(modbase_reduced, right_on='UniprotID', left_on='uniprotID', + how='left') + modbase_reduced = None + existing_modbase_models = None + existing_modbase_models_ind = None + + + model_info_added = model_info_added.drop(['UniprotID'], axis=1) + model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to', + 'PDBCode': 'template', 'PDBChain': 'chain', + 'ModPipeQualityScore': 'score', + 'ModelID': 'pdbID'}) + model_info_added.drop(['PDBEnd', 'PDBBegin'], axis=1, inplace=True) + model_info_added.score = model_info_added.score.astype(float) + model_info_added = model_info_added.sort_values(by=['datapoint', 'score'], + ascending=False) + model_info_added.reset_index(inplace=True) + model_info_added.drop(['index'], axis=1, inplace=True) + model_info_added = model_info_added.drop_duplicates() + + model_info_added = model_info_added.astype(str) + model_info_added = model_info_added.replace({'NaN': 'nan'}) + no_info = model_info_added[model_info_added.pdbID == 'nan'] + with_modbase_info = model_info_added[model_info_added.pdbID != 'nan'] + model_info_added = None + + len(no_info.drop_duplicates(['datapoint'])), len(with_modbase_info.drop_duplicates(['datapoint'])) + len(no_info.drop_duplicates(['datapoint'])) + len(with_modbase_info.drop_duplicates(['datapoint'])) == len( + to_modbase.drop_duplicates(['datapoint'])) + + # Add no_info to the rest down below! 
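# --- editor's note ----------------------------------------------------------
# The two bare len(...) expressions a few lines up compute a consistency check
# and then discard the result. If the invariant is worth computing, an assert
# makes it enforceable (same variable names as the patch):
n_no_info = len(no_info.drop_duplicates(['datapoint']))
n_with_info = len(with_modbase_info.drop_duplicates(['datapoint']))
assert n_no_info + n_with_info == len(to_modbase.drop_duplicates(['datapoint'])), \
    'every to_modbase datapoint must land in exactly one of no_info / with_modbase_info'
# --- end editor's note --------------------------------------------------------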
+            no_info = no_info[to_swiss_columns]
+
+            with_modbase_info.score = with_modbase_info.score.astype(float)
+            modbase_fasta.score = modbase_fasta.score.astype(float)
+
+            modbase_fasta = modbase_fasta.sort_values(['uniprotID', 'score', 'template', 'chain'],
+                                                      ascending=[True, False, True, True], axis=0)  # example = 3gdh
+
+            # The newly downloaded models are added to the main model file.
+
+            modbase_fasta = modbase_fasta.rename(columns={'template': 'pdbID'})
+            with_modbase_info.pos = with_modbase_info.pos.astype('int')
+            with_modbase_info.score = with_modbase_info.score.astype(float)
+            with_modbase_info.score = with_modbase_info.score.apply(lambda x: round(x, 2))
+            modbase_fasta.score = modbase_fasta.score.astype(float)
+            modbase_fasta.score = modbase_fasta.score.apply(lambda x: round(x, 2))
+
+            with_modbase_info = with_modbase_info.merge(modbase_fasta, on='pdbID', how='left')
+
+            with_modbase_info.drop(['score_y'], axis=1, inplace=True)
+            with_modbase_info.rename(columns={'score_x': 'score'}, inplace=True)
+            with_modbase_info.drop(['uniprotID_y', 'chain_y'], axis=1, inplace=True)
+            with_modbase_info.rename(columns={'uniprotID_x': 'uniprotID', 'chain_x': 'chain'}, inplace=True)
+
+            with_modbase_info.score = with_modbase_info.score.astype('float')
+            with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
+                                                              axis=0,
+                                                              ascending=[True, True, True, True, False, True, False])
+            with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], keep='first')
+
+            with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
+            with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
+            with_modbase_info = with_modbase_info.replace({'\'?\', ': ''})
+            with_modbase_info = with_modbase_info.replace({', \'?\'': ''})
+            with_modbase_info = with_modbase_info.replace({'(': ''})
+            with_modbase_info = with_modbase_info.replace({')': ''})
+            with_modbase_info = with_modbase_info.astype(str)
+            with_modbase_info.fasta = with_modbase_info.fasta.astype('str')
+            with_modbase_info.reset_index(inplace=True)
+            with_modbase_info.drop('index', axis=1, inplace=True)
+
+
+            align = with_modbase_info[with_modbase_info.fasta != 'nan']
+            yes_pdb_no_match = with_modbase_info[with_modbase_info.fasta == 'nan']
+            yes_pdb_no_match = yes_pdb_no_match[~yes_pdb_no_match.datapoint.isin(align.datapoint.to_list())]
+
+            align.rename(columns={'fasta': 'pdbSequence'}, inplace=True)
+            align['uniprotSequence'] = align['uniprotSequence'].str.replace('U', 'C')
+            align['pdbSequence'] = align['pdbSequence'].str.replace('U', 'C')
+
+            to_modbase_size = len(to_modbase.drop_duplicates(['datapoint']))
+            modbase_fasta = None
+            to_modbase = None
+            print('Aligning sequences...\n')
+            modbase_aligned = alignment(align, annotation_list, path_to_output_files / 'alignment_files')
+            modbase_aligned = modbase_aligned.astype(str)
+            modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
+            # Get the ones whose models couldn't be found and add them to no_modbase (i.e., datapoints that matched nothing at all).
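# --- editor's note ----------------------------------------------------------
# The not_in_aligned computation below is a set-difference idiom: after each
# frame is deduplicated by datapoint, concat + drop_duplicates(keep=False)
# keeps only the keys that occur in exactly one of the two frames. Since every
# aligned datapoint also exists in with_modbase_info, the survivors are the
# datapoints that never made it through alignment. A toy illustration:
import pandas as pd

aligned = pd.DataFrame({'datapoint': ['A', 'B']})
candidates = pd.DataFrame({'datapoint': ['A', 'B', 'C']})
missing = pd.concat([aligned, candidates]).drop_duplicates(['datapoint'], keep=False)
# -> only 'C', the candidate with no aligned counterpart, remains
# --- end editor's note --------------------------------------------------------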
+ if len(with_modbase_info) != 0: + not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']), + with_modbase_info.drop_duplicates(['datapoint'])]).drop_duplicates( + ['datapoint'], + keep=False) else: - - modbase_match = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', + 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', + 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', + 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', + 'disulfide', + 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', + 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', + 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', + 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', + 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) + with_modbase_info = None + if len(not_in_aligned) != 0: + not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), + not_in_aligned.drop_duplicates(['datapoint'])]).drop_duplicates(['datapoint'], + keep='first') + # Retain the best model among the aligned ones. + else: + not_models = pd.DataFrame(columns=not_in_aligned.columns) + + yes_pdb_no_match = None + # # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. 
+ modbase_aligned = modbase_aligned.astype(str) + if len(modbase_aligned) != 0: + nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan'] + not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan'] + not_nan.score = not_nan.score.astype(float) + not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], inplace=True) + + not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], + ascending=[True, True, False]) + not_nan = not_nan.drop_duplicates(['datapoint'], keep='first') + else: + nan = pd.DataFrame(columns=modbase_aligned.columns) + not_nan = pd.DataFrame(columns=modbase_aligned.columns) + modbase_aligned = None + which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') + if len(which_ones_are_match) == 0: + which_ones_are_match = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', @@ -1240,483 +1118,547 @@ def pdb(input_set, mode, impute): 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) - not_in_aligned = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide', - 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', - 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) - no_info = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint']) - rest = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint']) - - rest = rest[to_swiss_columns] - rest = rest.drop_duplicates() - - rest.reset_index(inplace=True) - rest.drop(['index'], axis=1, inplace=True) - rest = rest.astype('str') - to_modbase_size = 0 - - print('Modbase matching is completed...\n') - print('SUMMARY') - print('-------') - print('%d data points that failed to match a UniProt Sequence are discarded.' 
% len( - not_match_in_uniprot.drop_duplicates(['datapoint']))) - print('Of the remaining %d:' % uniprot_matched_size) - print('--%d of %d successfully aligned with PDB structures.' % ( - len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--%d of %d successfully aligned with SwissModels structures.' % ( - len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) - print('--%d of %d successfully aligned with Modbase structures.\n' % ( - len(modbase_match.drop_duplicates(['datapoint'])), to_modbase_size)) - print('--Remaining %d not found to match any models.' % len(rest.drop_duplicates(['datapoint']))) - print('--A total of %d datapoints will not be evaluated.\n' % ( - len(rest.drop_duplicates(['datapoint'])) + len( - not_match_in_uniprot.drop_duplicates(['datapoint'])))) - - print('FOR CHECKING : ', - len(rest.drop_duplicates(['datapoint'])) + len( - not_match_in_uniprot.drop_duplicates(['datapoint'])) + len( - pdb_aligned.drop_duplicates(['datapoint'])) + len( - swiss_match.drop_duplicates(['datapoint'])) + len( - modbase_match.drop_duplicates(['datapoint'])) == data_size) - no_info = None - align = None - not_in_aligned = None - not_models = None - modbase_not_match = None - - # Final corrections - - # Now 3D alignment. - pdb = pdb_aligned.copy() - swiss = swiss_match.copy() - modbase = modbase_match.copy() - - pdb_aligned = None - swiss_match = None - modbase_match = None - - """ - WHAT DO WE HAVE NOW? - - uniprot sequence not found - - pdb aligned - - swiss aligned - - modbase aligned - - not aligned with anything (rest) - """ - - # Fix the axes and merge all data. - - pdb.drop(['pdbInfo'], axis=1, inplace=True) - pdb.rename(columns={'resolution': 'score'}, inplace=True) - swiss.rename(columns={'qmean_norm': 'score'}, inplace=True) - modbase.rename(columns={'qmean_norm': 'score'}, inplace=True) - - swiss = swiss[pdb.columns] - modbase = modbase[pdb.columns] - pdb['source'] = 'PDB' - swiss['source'] = 'SWISSMODEL' - modbase['source'] = 'MODBASE' - data = pd.concat([swiss, modbase, pdb]) - - data.reset_index(inplace=True) - data.drop(['index'], axis=1, inplace=True) - data = data.astype('str') - data_spare = pd.concat([not_match_in_uniprot, rest]) - not_match_in_uniprot = None - pdb = None - swiss = None - modbase = None - rest = None - - print('Generating FreeSASA files...') - print('------------------------------------\n') - # Folder to calculated RSA values. 
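# --- editor's note ----------------------------------------------------------
# Both copies of the distance loop (the old one removed just below and the new
# one re-added near the end of this section) repeatedly use
#     except:
#         ValueError
# A bare except catches every exception, and the bare name ValueError on the
# next line is a no-op expression, not a filter. The intended spelling, shown
# with a hypothetical stand-in for the patch's get_coords helper:
def get_coords(pos):
    # stand-in: the real helper returns coordinates or raises on bad input
    raise ValueError('position not resolvable')

try:
    coordMut = get_coords('nan')
except ValueError:                 # catch only the failure we expect
    coordMut = 'nan'
# --- end editor's note --------------------------------------------------------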
- - existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*")) - - existing_free_sasa = [str(i) for i in existing_free_sasa] - existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa] - - print('Calculation RSA for PDB Structure Files...\n') - - pdb_only = data[data.source == 'PDB'] - for pdbID in pdb_only.pdbID.to_list(): - if pdbID not in existing_free_sasa: - (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'), - Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), - include_hetatms=True, - outdir=None, force_rerun=False, file_type='pdb')) - - print('Calculation RSA for SwissModel Files...\n') - swiss_only = data[data.source == 'SWISSMODEL'] - swiss_dp = [] - for i in swiss_only.index: - swiss_dp.append(swiss_only.at[i, 'uniprotID'] + '_' + swiss_only.at[i, 'pdbID'].lower() + '_' + str( - round(float(swiss_only.at[i, 'score']), 2))) - for pdbID in swiss_dp: - if pdbID not in existing_free_sasa: - (run_freesasa(Path(path_to_output_files / 'swissmodel_structures' / f'{pdbID}.txt'), - Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt'), include_hetatms=True, - outdir=None, force_rerun=False, file_type='pdb')) - - print('Calculation RSA for Modbase Model Files...\n') - modbase_only = data[data.source == 'MODBASE'] - for pdbID in modbase_only.pdbID.to_list(): - if pdbID not in existing_free_sasa: - (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'), - Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), - include_hetatms=True, - outdir=None, force_rerun=False, file_type='pdb')) - - # This annotation list is different than the prev one, keep it. - - annotation_list += ['domainStartonPDB', 'domainEndonPDB'] - - folder_path = path_to_output_files / 'freesasa_files' - - aligner = Align.PairwiseAligner() - print('Proceeding to 3D distance calculation...\n') - - data.domainEndonPDB = data.domainEndonPDB.astype(str) - data.domainStartonPDB = data.domainStartonPDB.astype(str) - - existing_free_sasa = None - swiss_dp = None - pdb_only = None - swiss_only = None - modbase_only = None - data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C') - data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C') - for i in data.index: - id_ = data.at[i, 'pdbID'].lower() - up_id_ = data.at[i, 'uniprotID'] - score_ = str(data.at[i, 'score']) - if data.at[i, 'source'] == 'PDB': - pdb_path = Path(path_to_output_files / 'pdb_structures' / f'{id_}.pdb') - elif data.at[i, 'source'] == 'MODBASE': - pdb_path = Path(path_to_output_files / 'modbase_structures_individual' / f'{id_}.txt') - elif data.at[i, 'source'] == 'SWISSMODEL': - pdb_path = Path(path_to_output_files / 'swissmodel_structures' / f'{up_id_}_{id_}_{score_}.txt') - - pdbSequence = data.at[i, 'pdbSequence'] - source = data.at[i, 'source'] - chain = data.at[i, 'chain'] - uniprotID = data.at[i, 'uniprotID'] - pdbID = data.at[i, 'pdbID'] - alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, - Path(path_to_output_files / '3D_alignment'), file_format='gzip') - mutPos = data.at[i, 'mutationPositionOnPDB'] - try: - coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0] - except: - ValueError - coordMut = 'nan' - try: - sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2] - data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], - sasa_pos, data.at[i, 'wt'], mode, 
path_to_output_files, file_type='pdb') - except: - ValueError - data.at[i, 'sasa'] = 'nan' # mutation position is nan - for annot in annotation_list: - annotx = [] - try: - positions_of_annotations = data.at[i, annot].split(',') - for pos in positions_of_annotations: - pos = pos.strip().strip('\'').strip('[\'').strip('\']') - try: - if '-' not in pos: - pos = int(float(pos)) - coordAnnot = get_coords(pos, alignments, 'nan', 'nan', mode)[0] - try: - annotx.append(find_distance(coordMut, coordAnnot)) - except: - ValueError - - else: - for r in range(int(pos.split('-')[0]), int(pos.split('-')[1]) + 1): - coordAnnot = get_coords(r, alignments, 'nan', 'nan', mode)[0] - annotx.append(find_distance(coordMut, coordAnnot)) - except: - ValueError - try: - data.at[i, annot] = min([float(i) for i in annotx]) - except: - ValueError - data.at[i, annot] = 'nan' + modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] + modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - except: - ValueError + else: + modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] + modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - if (str(data.at[i, 'domainStartonPDB']) == 'NaN' or str(data.at[i, 'domainStartonPDB']) == 'nan') and ( - str(data.at[i, 'domainEndonPDB']) != 'NaN' and str(data.at[i, 'domainEndonPDB']) != 'nan'): - data.at[i, 'domainStartonPDB'] = 100000 - elif (str(data.at[i, 'domainEndonPDB']) == 'NaN' or str(data.at[i, 'domainEndonPDB']) == 'nan') and ( - str(data.at[i, 'domainStartonPDB']) != 'NaN' and str(data.at[i, 'domainStartonPDB']) != 'nan'): - data.at[i, 'domainEndonPDB'] = 100000 - elif (str(data.at[i, 'domainStartonPDB']) == 'NaN' and str(data.at[i, 'domainEndonPDB']) == 'nan'): - data.at[i, 'domaindistance3D'] = 'nan' + which_ones_are_match = None + modbase_match.score = modbase_match.score.astype('float') + modbase_match = modbase_match.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], + ascending=[True, True, False]) + modbase_match.drop_duplicates(['datapoint'], keep='first', inplace=True) + not_nan = None + nan = None - data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']), - float(data.at[i, 'domainEndonPDB'])) - data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']), - float(data.at[i, 'domainEndonPDB'])) - data = data.astype(str) - data.replace({'NaN': 'nan'}, inplace=True) + # merge not_in_align and modbase_not_match as they were both excluded from modbase match. 
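# --- editor's note ----------------------------------------------------------
# The eight-branch if/elif ladder just below only decides which of the three
# frames are non-empty before concatenating them. pd.concat accepts any number
# of frames, so the same logic fits in two lines. A sketch using the patch's
# own variables; the fallback assumes to_swiss_columns matches the literal
# column list in the ladder's else branch:
frames = [df for df in (not_in_aligned, modbase_not_match, no_info) if len(df) != 0]
rest = pd.concat(frames) if frames else pd.DataFrame(columns=to_swiss_columns)
# --- end editor's note --------------------------------------------------------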
+ + # No model + no_info = no_info[to_swiss_columns] + no_info = no_info.drop_duplicates() + + # Model present, no sequence + not_models = not_models[to_swiss_columns] + not_models = not_models.drop_duplicates() + + # Modbase model and sequence present, no match in PDB + modbase_not_match = modbase_not_match[to_swiss_columns] + modbase_not_match = modbase_not_match.drop_duplicates() + if len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) != 0: + rest = pd.concat([not_in_aligned, modbase_not_match, no_info]) + elif len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) == 0: + rest = pd.concat([not_in_aligned, modbase_not_match]) + elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) != 0: + rest = pd.concat([modbase_not_match, no_info]) + elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) != 0: + rest = pd.concat([not_in_aligned, no_info]) + elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) == 0: + rest = not_in_aligned + elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) == 0: + rest = modbase_not_match + elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0: + rest = no_info + else: + rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint']) - # Now unify all 3 separate data. We have with_pdb. The ones that have pdb structyres, swiss, modbase, the ones didnt match with ant and the ones didnt have wt seq match. + rest = rest[to_swiss_columns] + rest = rest.drop_duplicates() - # Get interface positions from ECLAIR. 
Download HQ human - print() - print('Assigning surface regions...') - print('------------------------------------\n') + rest.reset_index(inplace=True) + rest.drop(['index'], axis=1, inplace=True) + rest = rest.astype('str') - print('Extracting interface residues...\n') - data_interface = pd.read_csv(path_to_interfaces, sep='\t') - positions = get_interface_positions(data_interface, 'P1', 'P2') + else: - interface_dataframe = pd.DataFrame() + modbase_match = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', + 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', + 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', + 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) + not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide', + 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', + 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) + no_info = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint']) + rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint']) + 
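# --- editor's note ----------------------------------------------------------
# The empty-schema literals above repeat the same ~70 column names several
# times, and the variants differ only in their tails. Building them from
# annotation_list, which is already in scope, would keep the copies in sync.
# A sketch, assuming the groupings below match the literals:
base_cols = ['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity',
             'volume', 'granthamScore', 'domain', 'domStart', 'domEnd',
             'distance', 'uniprotSequence', 'wt_sequence_match',
             'whichIsoform', 'datapoint']
binary_cols = [a + 'Binary' for a in annotation_list]
model_cols = ['from', 'to', 'template', 'chain', 'score', 'pdbID',
              'pdbSequence', 'pdb_alignStatus', 'mutationPositionOnPDB',
              'domainStartonPDB', 'domainEndonPDB']
modbase_match_cols = base_cols + annotation_list + binary_cols + model_cols
# --- end editor's note --------------------------------------------------------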
+ rest = rest[to_swiss_columns] + rest = rest.drop_duplicates() + + rest.reset_index(inplace=True) + rest.drop(['index'], axis=1, inplace=True) + rest = rest.astype('str') + to_modbase_size = 0 + + print('Modbase matching is completed...\n') + print('SUMMARY') + print('-------') + print('%d data points that failed to match a UniProt Sequence are discarded.' % len( + not_match_in_uniprot.drop_duplicates(['datapoint']))) + print('Of the remaining %d:' % uniprot_matched_size) + print('--%d of %d successfully aligned with PDB structures.' % ( + len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--%d of %d successfully aligned with SwissModels structures.' % ( + len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) + print('--%d of %d successfully aligned with Modbase structures.\n' % ( + len(modbase_match.drop_duplicates(['datapoint'])), to_modbase_size)) + print('--Remaining %d not found to match any models.' % len(rest.drop_duplicates(['datapoint']))) + print('--A total of %d datapoints will not be evaluated.\n' % ( + len(rest.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])))) + + print('FOR CHECKING : ', + len(rest.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) + len( + pdb_aligned.drop_duplicates(['datapoint'])) + len(swiss_match.drop_duplicates(['datapoint'])) + len( + modbase_match.drop_duplicates(['datapoint'])) == data_size) + no_info = None + align = None + not_in_aligned = None + not_models = None + modbase_not_match = None + + + # Final corrections + + # Now 3D alignment. + pdb = pdb_aligned.copy() + swiss = swiss_match.copy() + modbase = modbase_match.copy() + pdb_aligned = None + swiss_match = None + modbase_match = None + + """ + WHAT DO WE HAVE NOW? + - uniprot sequence not found + - pdb aligned + - swiss aligned + - modbase aligned + - not aligned with anything (rest) + """ + + # Fix the axes and merge all data. + + + pdb.drop(['pdbInfo'], axis=1, inplace=True) + pdb.rename(columns={'resolution': 'score'}, inplace=True) + swiss.rename(columns={'qmean_norm': 'score'}, inplace=True) + modbase.rename(columns={'qmean_norm': 'score'}, inplace=True) + + swiss = swiss[pdb.columns] + modbase = modbase[pdb.columns] + pdb['source'] = 'PDB' + swiss['source'] = 'SWISSMODEL' + modbase['source'] = 'MODBASE' + data = pd.concat([swiss, modbase, pdb]) + + + data.reset_index(inplace=True) + data.drop(['index'], axis=1, inplace=True) + data = data.astype('str') + data_spare = pd.concat([not_match_in_uniprot, rest]) + not_match_in_uniprot = None + pdb = None + swiss = None + modbase = None + rest = None + + print('Generating FreeSASA files...') + print('------------------------------------\n') + # Folder to calculated RSA values. 
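# --- editor's note ----------------------------------------------------------
# The skip-if-already-computed check below recovers file stems with
# i.split('/')[-1].split('.')[0], which is POSIX-only and truncates at the
# first dot. SwissModel identifiers embed the rounded score (a hypothetical
# P12345_1abc_0.89 reduces to P12345_1abc_0), so those lookups never match and
# the RSA files are recomputed every run. pathlib's stem strips only the final
# suffix and is portable:
from pathlib import Path

existing_free_sasa = {p.stem for p in (path_to_output_files / 'freesasa_files').glob('*')}
# membership tests like `pdbID in existing_free_sasa` then work unchanged,
# and the set makes each lookup O(1) instead of scanning a list.
# --- end editor's note --------------------------------------------------------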
+
+ existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
+ existing_free_sasa = [str(f) for f in existing_free_sasa]
+ existing_free_sasa = [f.split('/')[-1].split('.')[0] for f in existing_free_sasa]
+
+ print('Calculating RSA for PDB structure files...\n')
+
+ pdb_only = data[data.source == 'PDB']
+ for pdbID in pdb_only.pdbID.to_list():
+ if pdbID not in existing_free_sasa:
+ run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
+ outdir=None, force_rerun=False, file_type='pdb')
+
+ print('Calculating RSA for SwissModel files...\n')
+ swiss_only = data[data.source == 'SWISSMODEL']
+ swiss_dp = []
+ for i in swiss_only.index:
+ swiss_dp.append(swiss_only.at[i, 'uniprotID'] + '_' + swiss_only.at[i, 'pdbID'].lower() + '_' + str(
+ round(float(swiss_only.at[i, 'score']), 2)))
+ for pdbID in swiss_dp:
+ if pdbID not in existing_free_sasa:
+ run_freesasa(Path(path_to_output_files / 'swissmodel_structures' / f'{pdbID}.txt'),
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt'), include_hetatms=True,
+ outdir=None, force_rerun=False, file_type='pdb')
+
+ print('Calculating RSA for Modbase model files...\n')
+ modbase_only = data[data.source == 'MODBASE']
+ for pdbID in modbase_only.pdbID.to_list():
+ if pdbID not in existing_free_sasa:
+ run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
+ outdir=None, force_rerun=False, file_type='pdb')
+
+ # This annotation list differs from the previous one; keep it.
+
+ annotation_list += ['domainStartonPDB', 'domainEndonPDB']
+
+ folder_path = path_to_output_files / 'freesasa_files'
+
+ aligner = Align.PairwiseAligner()
+ print('Proceeding to 3D distance calculation...\n')
+
+ data.domainEndonPDB = data.domainEndonPDB.astype(str)
+ data.domainStartonPDB = data.domainStartonPDB.astype(str)
+
+ existing_free_sasa = None
+ swiss_dp = None
+ pdb_only = None
+ swiss_only = None
+ modbase_only = None
+ data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
+ data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
+ for i in data.index:
+ id_ = data.at[i, 'pdbID'].lower()
+ up_id_ = data.at[i, 'uniprotID']
+ score_ = str(data.at[i, 'score'])
+ if data.at[i, 'source'] == 'PDB':
+ pdb_path = Path(path_to_output_files / 'pdb_structures' / f'{id_}.pdb')
+ elif data.at[i, 'source'] == 'MODBASE':
+ pdb_path = Path(path_to_output_files / 'modbase_structures_individual' / f'{id_}.txt')
+ elif data.at[i, 'source'] == 'SWISSMODEL':
+ pdb_path = Path(path_to_output_files / 'swissmodel_structures' / f'{up_id_}_{id_}_{score_}.txt')
+
+ pdbSequence = data.at[i, 'pdbSequence']
+ source = data.at[i, 'source']
+ chain = data.at[i, 'chain']
+ uniprotID = data.at[i, 'uniprotID']
+ pdbID = data.at[i, 'pdbID']
+ alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format='gzip')
+ mutPos = data.at[i, 'mutationPositionOnPDB']
+ try:
+ coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
+ except Exception:  # most commonly raised when the mutation position is 'nan'
+ coordMut = 'nan'
+ try:
+ sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
+ data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos, data.at[i, 'wt'], mode,
+ path_to_output_files, file_type='pdb')
+ except Exception:
+ data.at[i, 'sasa'] = 'nan'  # mutation position is nan
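+ # The loop below keeps, for each annotation class, the 3D distance from the
+ # mutated residue to the closest annotated residue. A minimal sketch of that
+ # rule, assuming coordinates are (x, y, z) tuples (min_annotation_distance
+ # is an illustrative name, not a pipeline function):
+ #
+ # def min_annotation_distance(coord_mut, annot_coords):
+ #     return min(sum((a - b) ** 2 for a, b in zip(c, coord_mut)) ** 0.5
+ #                for c in annot_coords)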
+ for annot in annotation_list:
+ annotx = []
+ try:
+ positions_of_annotations = data.at[i, annot].split(',')
+ for pos in positions_of_annotations:
+ pos = pos.strip().strip('\'').strip('[\'').strip('\']')
+ try:
+ if '-' not in pos:
+ pos = int(float(pos))
+ coordAnnot = get_coords(pos, alignments, 'nan', 'nan', mode)[0]
+ try:
+ annotx.append(find_distance(coordMut, coordAnnot))
+ except Exception:  # coordMut or coordAnnot may be 'nan'
+ pass

- for key, val in positions.items():
- k = pd.Series((key, str(list(set(val)))))
- interface_dataframe = interface_dataframe.append(k, ignore_index=True)
- interface_dataframe.columns = ['uniprotID', 'positions']

+ else:
+ for r in range(int(pos.split('-')[0]), int(pos.split('-')[1]) + 1):
+ coordAnnot = get_coords(r, alignments, 'nan', 'nan', mode)[0]
+ annotx.append(find_distance(coordMut, coordAnnot))
+ except Exception:
+ pass
+ try:
+ data.at[i, annot] = min([float(d) for d in annotx])
+ except Exception:  # annotx may be empty
+ data.at[i, annot] = 'nan'

- if len(data) == 0:
- data = pd.DataFrame(
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
- 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
- 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
- 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
- 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
- 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
- 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
- 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
- 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
- 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
- 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
- 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
- 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
- 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
- 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
- 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
- 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
- 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
- else:
- data.sasa = data.sasa.astype('str')
-
- for i in data.index:
- if '*' in data.at[i, 'sasa']:
- data.at[i, 'sasa'] = data.at[i, 'sasa'].split('*')[0]
-
- data.sasa = data.sasa.replace({'N/A': 'nan'})
- data.sasa = data.sasa.replace({'None': 'nan'})
- data.replace({' N/A': 'nan'}, inplace=True)
- data.replace({'None': 'nan'}, inplace=True)
- data.sasa = data.sasa.astype(float)
- data = data.astype(str)
- for i in data.index:
- if float(data.at[i, 'sasa']) < 5:
- data.at[i, 'trsh4'] = 'core'
- elif float(data.at[i, 'sasa']) >= 5:
- data.at[i, 'trsh4'] = 'surface'
- elif data.at[i, 'sasa'] == 'nan':
- data.at[i, 'trsh4'] = 'nan'
-
- data = data.merge(interface_dataframe, on='uniprotID', how='left')
- data.positions = data.positions.astype('str')
- for i in data.index:
- if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
- print((str(data.at[i, 'pos']) in data.at[i, 'positions']))
- data.at[i,
'threeState_trsh4_HQ'] = 'interface' - elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface': - data.at[i, 'threeState_trsh4_HQ'] = 'surface' - elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': - data.at[i, 'threeState_trsh4_HQ'] = 'core' - elif (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': - data.at[i, 'threeState_trsh4_HQ'] = 'conflict' - elif data.at[i, 'trsh4'] == 'nan': - data.at[i, 'threeState_trsh4_HQ'] = 'nan' - - data.drop(['positions'], axis=1, inplace=True) - - # OPTIONAL - # DOMAIN SELECTION - # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most - # significant domains and 53th category will be NULL. - - fisherResult = pd.read_csv(fisher_path, sep='\t') - - significant_domains = fisherResult.domain.to_list() - for i in data.index: - if data.at[i, 'domain'] in significant_domains: - data.at[i, 'domain_fisher'] = data.at[i, 'domain'] - else: - data.at[i, 'domain_fisher'] = 'NULL' - - # Change the numbering for binary annotations and create 3 classes: - # nan--> 0, 0 -->1 and 1 -->2 - - print('Final adjustments are being done...\n') - binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', - 'dnaBindingBinary', - 'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'caBindingBinary', 'topologicalDomainBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary'] - data = data.astype(str) - data.replace({'NaN': 'nan'}, inplace=True) - for i in data.index: - for j in binaryCols: - data[j] = data[j].astype('str') - if (data.at[i, j] == '0') or (data.at[i, j] == '0.0'): - data.at[i, j] = '1' - elif data.at[i, j] == 'nan': - data.at[i, j] = '0' - elif (data.at[i, j] == '1') or (data.at[i, j] == '1.0'): - data.at[i, j] = '2' - - annotCols = ['disulfide', 'intMet', 'intramembrane', - 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding', - 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', - 'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding', - 'topologicalDomain', 'bindingSite', 'region', 'signalPeptide', - 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide', - 'transitPeptide', 'glycosylation', 'propeptide'] - - for i in data.index: - for annot in annotCols: - binaryName = str(annot) + 'Binary' - if data.at[i, binaryName] == '2': - data.at[i, annot] = '0.0' - data.replace({'100000': 'nan'}, inplace=True) - data = add_physicochemical(data) - data.rename( - columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue', - 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db', - 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig', - 'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state', - 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin', - 'intramembraneBinary': 'intramembrane_bin', - 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin', - 'activeSiteBinary': 'activeSite_bin', - 
'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin', - 'siteBinary': 'site_bin', - 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin', - 'mutagenesisBinary': 'mutagenesis_bin', - 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin', - 'metalBindingBinary': 'metalBinding_bin', - 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin', - 'caBindingBinary': 'caBinding_bin', - 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin', - 'signalPeptideBinary': 'signalPeptide_bin', - 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin', - 'motifBinary': 'motif_bin', - 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin', - 'transitPeptideBinary': 'transitPeptide_bin', - 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin', - 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist', - 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist', - 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist', - 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist', - 'site': 'site_dist', - 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist', - 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist', - 'turn': 'turn_dist', - 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist', - 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist', - 'bindingSite': 'bindingSite_dist', 'region': 'region_dist', - 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist', - 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist', - 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist', - 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True) - - data = data[ - ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity', - 'volume', - 'granthamScore', 'domains_all', - 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin', - 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin', - 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin', - 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin', - 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin', - 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin', - 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin', - 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin', - 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist', - 'intramembrane_dist', - 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist', - 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist', - 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist', - 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist', - 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist', - 'bindingSite_dist', 'region_dist', 'signalPeptide_dist', - 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist', - 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist', - 'glycosylation_dist', 'propeptide_dist']] - - ready = data.copy() - # Imputation - if (impute == 'True') or (impute == 'true') or (impute == True): - filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, - 
15.99, 16.82,
- 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33,
- 22.36]
- col_index = 0
- for col_ in ready.columns[-30:]:
- ready[col_] = ready[col_].fillna(filler[col_index])
- ready[col_] = ready[col_].replace({'nan': filler[col_index]})
- col_index += 1
- ready['domains_3Ddist'] = ready['domains_3Ddist'].fillna(24.5)
- ready['sasa'] = ready['sasa'].fillna(29.5)
- ready['location_3state'] = ready['location_3state'].fillna('unknown')
- elif (impute == 'False') or (impute == 'false') or (impute == False):
- pass
- ready = ready.replace({'nan': np.NaN})
- ready = ready.astype(str)
- ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
- if len(ready) == 0:
- print(
- 'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
- #st.write(ready)
- print('Feature vector successfully created...')
- end = timer()
- hours, rem = divmod(end - start, 3600)
- minutes, seconds = divmod(rem, 60)
- print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
+ except Exception:
+ pass
+ if (str(data.at[i, 'domainStartonPDB']) in ('NaN', 'nan')) and (
+ str(data.at[i, 'domainEndonPDB']) not in ('NaN', 'nan')):
+ data.at[i, 'domainStartonPDB'] = 100000
+ elif (str(data.at[i, 'domainEndonPDB']) in ('NaN', 'nan')) and (
+ str(data.at[i, 'domainStartonPDB']) not in ('NaN', 'nan')):
+ data.at[i, 'domainEndonPDB'] = 100000
+ if (str(data.at[i, 'domainStartonPDB']) in ('NaN', 'nan')) and (
+ str(data.at[i, 'domainEndonPDB']) in ('NaN', 'nan')):
+ data.at[i, 'domaindistance3D'] = 'nan'
+ else:
+ data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
+ float(data.at[i, 'domainEndonPDB']))
+
+ data = data.astype(str)
+ data.replace({'NaN': 'nan'}, inplace=True)
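+ # The 100000 sentinel above only lets min() pick the one known endpoint
+ # distance; any sentinel left in the domainStartonPDB/domainEndonPDB columns
+ # is mapped back to 'nan' further below via data.replace({'100000': 'nan'}).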
+
+ # Now unify the three sources: datapoints aligned to PDB structures, SwissModel
+ # models and Modbase models; the ones that matched no structure and the ones
+ # without a wild-type sequence match are kept aside.
+
+ # Get interface positions from ECLAIR. Download the HQ human interface set.
+ print()
+ print('Assigning surface regions...')
+ print('------------------------------------\n')
+
+ print('Extracting interface residues...\n')
+ data_interface = pd.read_csv(path_to_interfaces, sep='\t')
+
+ positions = get_interface_positions(data_interface, 'P1', 'P2')
+
+ # DataFrame.append was removed in pandas 2.0; build from a list of rows instead.
+ interface_rows = []
+ for key, val in positions.items():
+ interface_rows.append((key, str(list(set(val)))))
+ interface_dataframe = pd.DataFrame(interface_rows, columns=['uniprotID', 'positions'])
+
+ if len(data) == 0:
+ data = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+ 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
+ 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
+ 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
+ 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+ 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+ 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
+ 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+ 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+ 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+ 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
+ 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
+ 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
+ else:
+ data.sasa = data.sasa.astype('str')
+
+ for i in data.index:
+ if '*' in data.at[i, 'sasa']:
+ data.at[i, 'sasa'] = data.at[i, 'sasa'].split('*')[0]
+
+ data.sasa = data.sasa.replace({'N/A': 'nan'})
+ data.sasa = data.sasa.replace({'None': 'nan'})
+ data.replace({' N/A': 'nan'}, inplace=True)
+ data.replace({'None': 'nan'}, inplace=True)
+ data.sasa = data.sasa.astype(float)
+ data = data.astype(str)
+ for i in data.index:
+ if float(data.at[i, 'sasa']) < 5:
+ data.at[i, 'trsh4'] = 'core'
+ elif float(data.at[i, 'sasa']) >= 5:
+ data.at[i, 'trsh4'] = 'surface'
+ elif data.at[i, 'sasa'] == 'nan':
+ data.at[i, 'trsh4'] = 'nan'
+
+ data = data.merge(interface_dataframe, on='uniprotID', how='left')
+ data.positions = data.positions.astype('str')
+ for i in data.index:
+ # 'positions' holds a stringified list (e.g. "[12, 40]"); parse it so the
+ # membership test is exact rather than a substring match.
+ pos_list = [p.strip(" '") for p in data.at[i, 'positions'].strip('[]').split(',') if p.strip()]
+ in_interface = str(data.at[i, 'pos']) in pos_list
+ if in_interface and data.at[i, 'trsh4'] == 'surface':
+ data.at[i, 'threeState_trsh4_HQ'] = 'interface'
+ elif (not in_interface) and data.at[i, 'trsh4'] == 'surface':
+ data.at[i, 'threeState_trsh4_HQ'] = 'surface'
+ elif (not in_interface) and data.at[i, 'trsh4'] == 'core':
+ data.at[i, 'threeState_trsh4_HQ'] = 'core'
+ elif in_interface and data.at[i, 'trsh4'] == 'core':
+ data.at[i, 'threeState_trsh4_HQ'] = 'conflict'
+ elif data.at[i, 'trsh4'] == 'nan':
+ data.at[i, 'threeState_trsh4_HQ'] = 'nan'
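+ # The classification above in one place, as a sketch (classify_location is an
+ # illustrative helper, not a pipeline function; 5 is the same RSA threshold):
+ #
+ # def classify_location(rsa, is_interface_pos):
+ #     if rsa != rsa:  # float('nan') compares unequal to itself
+ #         return 'nan'
+ #     if rsa < 5:
+ #         return 'conflict' if is_interface_pos else 'core'
+ #     return 'interface' if is_interface_pos else 'surface'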
+
+ data.drop(['positions'], axis=1, inplace=True)
+
+ # OPTIONAL
+ # DOMAIN SELECTION
+ # Replace all non-significant domains with 'NULL': R can handle 53 categories,
+ # so the 52 most significant domains are kept and 'NULL' becomes the 53rd.
+
+ fisherResult = pd.read_csv(fisher_path, sep='\t')
+
+ significant_domains = fisherResult.domain.to_list()
+ for i in data.index:
+ if data.at[i, 'domain'] in significant_domains:
+ data.at[i, 'domain_fisher'] = data.at[i, 'domain']
+ else:
+ data.at[i, 'domain_fisher'] = 'NULL'
+
+ # Recode the binary annotations into three classes: nan -> 0, 0 -> 1, 1 -> 2.
+
+ print('Final adjustments are being done...\n')
+ binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary',
+ 'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+ 'repeatBinary', 'caBindingBinary', 'topologicalDomainBinary',
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+ 'glycosylationBinary', 'propeptideBinary']
+ data = data.astype(str)
+ data.replace({'NaN': 'nan'}, inplace=True)
+ for j in binaryCols:
+ data[j] = data[j].astype('str')
+ for i in data.index:
+ for j in binaryCols:
+ if (data.at[i, j] == '0') or (data.at[i, j] == '0.0'):
+ data.at[i, j] = '1'
+ elif data.at[i, j] == 'nan':
+ data.at[i, j] = '0'
+ elif (data.at[i, j] == '1') or (data.at[i, j] == '1.0'):
+ data.at[i, j] = '2'
+
+ annotCols = ['disulfide', 'intMet', 'intramembrane',
+ 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
+ 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+ 'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding',
+ 'topologicalDomain', 'bindingSite', 'region', 'signalPeptide',
+ 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
+ 'transitPeptide', 'glycosylation', 'propeptide']
+
+ for i in data.index:
+ for annot in annotCols:
+ binaryName = str(annot) + 'Binary'
+ if data.at[i, binaryName] == '2':
+ data.at[i, annot] = '0.0'
+ data.replace({'100000': 'nan'}, inplace=True)
+ data = add_physicochemical(data)
+ data.rename(
+ columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
+ 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
+ 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
+ 'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
+ 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
+ 'intramembraneBinary': 'intramembrane_bin',
+ 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
+ 'activeSiteBinary': 'activeSite_bin',
+ 'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
+ 'siteBinary': 'site_bin',
+ 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
+ 'mutagenesisBinary': 'mutagenesis_bin',
+ 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
+ 'metalBindingBinary': 'metalBinding_bin',
+ 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
+ 'caBindingBinary': 'caBinding_bin',
+ 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
+ 'signalPeptideBinary': 'signalPeptide_bin',
+ 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
+ 'motifBinary': 'motif_bin',
+ 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
+ 'transitPeptideBinary': 'transitPeptide_bin',
+ 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
+ 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
+ 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
+ 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
+ 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist',
+ 'site': 'site_dist',
+ 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
+ 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist',
+ 'turn': 'turn_dist',
+ 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
+ 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
+ 'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
+ 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
+ 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
+ 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
+ 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
+
+ data = data[
+ ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity',
+ 'volume',
+ 'granthamScore', 'domains_all',
+ 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
+ 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
+ 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
+ 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
+ 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
+ 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
+ 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
+ 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
+ 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
+ 'intramembrane_dist',
+ 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
+ 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
+ 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
+ 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
+ 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
+ 'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
+ 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
+ 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
+ 'glycosylation_dist', 'propeptide_dist']]
+ ready = data.copy()
+ # Imputation (str() also covers a boolean impute argument)
+ if str(impute).lower() == 'true':
+ filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, 16.82,
+ 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
+ col_index = 0
+ for col_ in ready.columns[-30:]:
+ ready[col_] = ready[col_].fillna(filler[col_index])
+ ready[col_] = ready[col_].replace({'nan': filler[col_index]})
+ col_index += 1
+ ready['domains_3Ddist'] = ready['domains_3Ddist'].fillna(24.5)
+ ready['sasa'] = ready['sasa'].fillna(29.5)
+ ready['location_3state'] = ready['location_3state'].fillna('unknown')
+ elif str(impute).lower() == 'false':
+ pass
+
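+ # The 30 filler values above pair positionally with the 30 *_dist columns at
+ # the end of the frame; an equivalent, more explicit form for the NaN case
+ # would be a dict keyed by column name (sketch only):
+ #
+ # ready.fillna(dict(zip(ready.columns[-30:], filler)), inplace=True)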
+ ready = ready.replace({'nan': np.NaN})
+ ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
+ if len(ready) == 0:
+ print('No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
+ print(ready)
+ print('Feature vector successfully created...')
- return ready
- except:
- AttributeError
-
-
+ end = timer()
+ hours, rem = divmod(end - start, 3600)
+ minutes, seconds = divmod(rem, 60)
+ print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
+ sys.stdout.close()
+ return ready
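+
+ # Minimal usage sketch (illustrative only: the file name and argument values
+ # are hypothetical; the accepted input formats are defined by clean_data and
+ # manage_files):
+ #
+ # if __name__ == '__main__':
+ #     feature_vector = pdb('sample_input.txt', mode=1, impute='True')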