Spaces:

HUBioDataLab
/

ASCARIS

Sleeping

App Files Files Community

fatmacankara committed on Jul 25, 2023

Commit

d955409

1 Parent(s): 26f6c21

Update code/pdb_featureVector.py

Browse files

Files changed (1) hide show

code/pdb_featureVector.py +0 -18

code/pdb_featureVector.py CHANGED Viewed

@@ -60,7 +60,6 @@ def pdb(input_set, mode, impute):
     data = clean_data(input_set)
     path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer =  manage_files(mode)
     out_path = path_to_output_files / 'log.txt'
-    st.write(out_path)
     sys.stdout = open(out_path, 'w')
     print('Creating directories...')
@@ -226,24 +225,18 @@ def pdb(input_set, mode, impute):
         existing_pdb = [str(i) for i in existing_pdb]
         existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
         cnt = 0
-        st.write('existing_pdb', existing_pdb)
         for search in pdbs:
-            st.write('PDBS', search)
             try:
                 if search.lower() not in existing_pdb:
-                    st.write(Path(path_to_output_files / 'pdb_structures'))
                     file = pdbl.retrieve_pdb_file(search, pdir=Path(path_to_output_files / 'pdb_structures'), file_format="pdb")
-                    st.write(file)
                 else:
                     print('PDB structure file exists..')
                     for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
-                        st.write('filename', filename)
                         filename_replace_ext = filename.with_suffix(".pdb")
                         filename.rename(filename_replace_ext)
                     file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
-                    st.write('file', file)
                     base = os.path.splitext(str(file))[0]
                     base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
@@ -253,7 +246,6 @@ def pdb(input_set, mode, impute):
                 resolution_method = parser.get_structure(search, file)
                 for record in SeqIO.parse(file, "pdb-seqres"):
                     if record.dbxrefs[0].split(':')[0] == 'UNP':
-                        st.write('RECORD', record)
                         pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
                         pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
                         pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
@@ -263,7 +255,6 @@ def pdb(input_set, mode, impute):
                         pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
                     index += 1
             except:
-                st.write('ERROR INDEX')
                 IndexError
                 pdb_info.at[index, 'uniprotID'] = 'nan'
                 pdb_info.at[index, 'pdbID'] = 'nan'
@@ -288,13 +279,10 @@ def pdb(input_set, mode, impute):
                 FileNotFoundError
         uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
-        st.write('pdb_info', pdb_info)
         uniprot_matched = uniprot_matched.astype(str)
         uniprot_matched = uniprot_matched.drop_duplicates()
-        st.write('pdb_fasta', pdb_fasta)
         uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
         uniprot_matched = uniprot_matched.astype(str)
-        st.write('uniprot_matched', uniprot_matched)
         with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
                 (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
@@ -304,12 +292,10 @@ def pdb(input_set, mode, impute):
                 uniprot_matched.resolution == 'None'))]
         no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
         no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
-        st.write('with_pdb', with_pdb)
         print(
             'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
             % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])),
                len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint']))))
-        st.write('with_pdb1', with_pdb)
         with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
         with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
@@ -388,7 +374,6 @@ def pdb(input_set, mode, impute):
         # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences.
         with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C')
         with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C')
-        st.write('with_pdb2', with_pdb)
         dfM = with_pdb[with_pdb.wt_sequence_match == 'm']
         dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
@@ -415,13 +400,11 @@ def pdb(input_set, mode, impute):
         existing_pdb = None
         with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
         with_pdb = None
-        st.write('dfM', dfM)
         print('Aligning sequences...\n')
         aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
-        st.write('aligned_m', aligned_m)
         # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
         for i in aligned_m.index:
             if aligned_m.at[i, 'pdbSequence'] == 'nan':
@@ -463,7 +446,6 @@ def pdb(input_set, mode, impute):
         yes_pdb_no_match = after_up_pdb_alignment[
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
-        st.write('pdb_aligned', pdb_aligned)
         print('PDB matching is completed...\n')

     data = clean_data(input_set)
     path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer =  manage_files(mode)
     out_path = path_to_output_files / 'log.txt'
     sys.stdout = open(out_path, 'w')
     print('Creating directories...')
         existing_pdb = [str(i) for i in existing_pdb]
         existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
         cnt = 0
         for search in pdbs:
             try:
                 if search.lower() not in existing_pdb:
                     file = pdbl.retrieve_pdb_file(search, pdir=Path(path_to_output_files / 'pdb_structures'), file_format="pdb")
                 else:
                     print('PDB structure file exists..')
                     for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
                         filename_replace_ext = filename.with_suffix(".pdb")
                         filename.rename(filename_replace_ext)
                     file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
                     base = os.path.splitext(str(file))[0]
                     base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
                 resolution_method = parser.get_structure(search, file)
                 for record in SeqIO.parse(file, "pdb-seqres"):
                     if record.dbxrefs[0].split(':')[0] == 'UNP':
                         pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
                         pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
                         pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
                         pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
                     index += 1
             except:
                 IndexError
                 pdb_info.at[index, 'uniprotID'] = 'nan'
                 pdb_info.at[index, 'pdbID'] = 'nan'
                 FileNotFoundError
         uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
         uniprot_matched = uniprot_matched.astype(str)
         uniprot_matched = uniprot_matched.drop_duplicates()
         uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
         uniprot_matched = uniprot_matched.astype(str)
         with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
                 (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
                 uniprot_matched.resolution == 'None'))]
         no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
         no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
         print(
             'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
             % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])),
                len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint']))))
         with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
         with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
         # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences.
         with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C')
         with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C')
         dfM = with_pdb[with_pdb.wt_sequence_match == 'm']
         dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
         existing_pdb = None
         with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
         with_pdb = None
         print('Aligning sequences...\n')
         aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
         for i in aligned_m.index:
             if aligned_m.at[i, 'pdbSequence'] == 'nan':
         yes_pdb_no_match = after_up_pdb_alignment[
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
         print('PDB matching is completed...\n')