Spaces:
Sleeping
Sleeping
fatmacankara
committed on
Commit
·
7c5bea8
1
Parent(s):
80cfda5
Update code/pdb_featureVector.py
Browse files
- code/pdb_featureVector.py +13 -10
code/pdb_featureVector.py
CHANGED
@@ -172,6 +172,7 @@ def pdb(input_set, mode, impute):
|
|
172 |
print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
|
173 |
% (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
|
174 |
len(uniprot_matched.drop_duplicates(['datapoint']))))
|
|
|
175 |
|
176 |
"""
|
177 |
STEP 5
|
@@ -262,6 +263,7 @@ def pdb(input_set, mode, impute):
|
|
262 |
cnt +=1
|
263 |
print()
|
264 |
print('PDB file processing finished..')
|
|
|
265 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
266 |
try:
|
267 |
filename_replace_ext = filename.with_suffix(".pdb")
|
@@ -325,7 +327,7 @@ def pdb(input_set, mode, impute):
|
|
325 |
# If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
|
326 |
# If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
|
327 |
# If the query data points are found in with_pdb data frame, it will be searched in the following steps.
|
328 |
-
|
329 |
"""
|
330 |
STEP 6
|
331 |
Retrieve sequence annotations.
|
@@ -366,7 +368,7 @@ def pdb(input_set, mode, impute):
|
|
366 |
with_pdb.replace({'[]': 'nan'}, inplace=True)
|
367 |
with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
|
368 |
with_pdb.replace({'': 'nan'}, inplace=True)
|
369 |
-
|
370 |
"""
|
371 |
STEP 7
|
372 |
Do alignment for PDB
|
@@ -448,7 +450,7 @@ def pdb(input_set, mode, impute):
|
|
448 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
449 |
no_pdb = no_pdb.copy()
|
450 |
|
451 |
-
|
452 |
print('PDB matching is completed...\n')
|
453 |
print('SUMMARY')
|
454 |
print('-------')
|
@@ -543,7 +545,7 @@ def pdb(input_set, mode, impute):
|
|
543 |
swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
|
544 |
print('Index File Processed...\n')
|
545 |
|
546 |
-
|
547 |
# Get relevant columns
|
548 |
swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
|
549 |
# Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
|
@@ -623,7 +625,7 @@ def pdb(input_set, mode, impute):
|
|
623 |
swiss_model = None
|
624 |
no_swiss_models = None
|
625 |
url_nan = None
|
626 |
-
|
627 |
# At this point we have:
|
628 |
# pdb_aligned --- Align in the PDB phase
|
629 |
# not_match_in_uniprot --- This won't be aligned with anything because these proteins don't have a UniProt ID. Only basic info is present.
|
@@ -713,7 +715,7 @@ def pdb(input_set, mode, impute):
|
|
713 |
k = pd.Series(swiss_models_with_data.iloc[i])
|
714 |
broken_swiss = broken_swiss.append(k, ignore_index=True)
|
715 |
c += 1
|
716 |
-
|
717 |
if len(broken_swiss) == 0:
|
718 |
broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
|
719 |
|
@@ -821,7 +823,7 @@ def pdb(input_set, mode, impute):
|
|
821 |
not_nan = None
|
822 |
which_ones_are_match = None
|
823 |
swiss_not_match = None
|
824 |
-
|
825 |
# STEP : GO TO MODBASE
|
826 |
# Should not include anything related to prev models.
|
827 |
if len(to_modbase) != 0:
|
@@ -843,7 +845,7 @@ def pdb(input_set, mode, impute):
|
|
843 |
|
844 |
modbase_reduced = pd.DataFrame()
|
845 |
modbase_fasta = pd.DataFrame()
|
846 |
-
|
847 |
print('Retrieving ModBase models...\n')
|
848 |
# Get model files associated with each UniProtID
|
849 |
for protein in list(set(to_modbase.uniprotID.to_list())):
|
@@ -919,6 +921,7 @@ def pdb(input_set, mode, impute):
|
|
919 |
quality_score = -999
|
920 |
|
921 |
print()
|
|
|
922 |
if len(modbase_fasta) != 0:
|
923 |
modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
|
924 |
else:
|
@@ -1227,7 +1230,7 @@ def pdb(input_set, mode, impute):
|
|
1227 |
rest.drop(['index'], axis=1, inplace=True)
|
1228 |
rest = rest.astype('str')
|
1229 |
to_modbase_size = 0
|
1230 |
-
|
1231 |
print('Modbase matching is completed...\n')
|
1232 |
print('SUMMARY')
|
1233 |
print('-------')
|
@@ -1299,7 +1302,7 @@ def pdb(input_set, mode, impute):
|
|
1299 |
swiss = None
|
1300 |
modbase = None
|
1301 |
rest = None
|
1302 |
-
|
1303 |
print('Generating FreeSASA files...')
|
1304 |
print('------------------------------------\n')
|
1305 |
# Folder for the calculated RSA values.
|
|
|
172 |
print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
|
173 |
% (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
|
174 |
len(uniprot_matched.drop_duplicates(['datapoint']))))
|
175 |
+
st.write('Checkpoint1')
|
176 |
|
177 |
"""
|
178 |
STEP 5
|
|
|
263 |
cnt +=1
|
264 |
print()
|
265 |
print('PDB file processing finished..')
|
266 |
+
st.write('Checkpoint2')
|
267 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
268 |
try:
|
269 |
filename_replace_ext = filename.with_suffix(".pdb")
|
|
|
327 |
# If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
|
328 |
# If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
|
329 |
# If the query data points are found in with_pdb data frame, it will be searched in the following steps.
|
330 |
+
st.write('Checkpoint3')
|
331 |
"""
|
332 |
STEP 6
|
333 |
Retrieve sequence annotations.
|
|
|
368 |
with_pdb.replace({'[]': 'nan'}, inplace=True)
|
369 |
with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
|
370 |
with_pdb.replace({'': 'nan'}, inplace=True)
|
371 |
+
st.write('Checkpoint4')
|
372 |
"""
|
373 |
STEP 7
|
374 |
Do alignment for PDB
|
|
|
450 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
451 |
no_pdb = no_pdb.copy()
|
452 |
|
453 |
+
st.write('Checkpoint5')
|
454 |
print('PDB matching is completed...\n')
|
455 |
print('SUMMARY')
|
456 |
print('-------')
|
|
|
545 |
swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
|
546 |
print('Index File Processed...\n')
|
547 |
|
548 |
+
st.write('Checkpoint6')
|
549 |
# Get relevant columns
|
550 |
swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
|
551 |
# Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
|
|
|
625 |
swiss_model = None
|
626 |
no_swiss_models = None
|
627 |
url_nan = None
|
628 |
+
st.write('Checkpoint7')
|
629 |
# At this point we have:
|
630 |
# pdb_aligned --- Align in the PDB phase
|
631 |
# not_match_in_uniprot --- This won't be aligned with anything because these proteins don't have a UniProt ID. Only basic info is present.
|
|
|
715 |
k = pd.Series(swiss_models_with_data.iloc[i])
|
716 |
broken_swiss = broken_swiss.append(k, ignore_index=True)
|
717 |
c += 1
|
718 |
+
st.write('Checkpoint7')
|
719 |
if len(broken_swiss) == 0:
|
720 |
broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
|
721 |
|
|
|
823 |
not_nan = None
|
824 |
which_ones_are_match = None
|
825 |
swiss_not_match = None
|
826 |
+
st.write('Checkpoint8')
|
827 |
# STEP : GO TO MODBASE
|
828 |
# Should not include anything related to prev models.
|
829 |
if len(to_modbase) != 0:
|
|
|
845 |
|
846 |
modbase_reduced = pd.DataFrame()
|
847 |
modbase_fasta = pd.DataFrame()
|
848 |
+
st.write('Checkpoint9')
|
849 |
print('Retrieving ModBase models...\n')
|
850 |
# Get model files associated with each UniProtID
|
851 |
for protein in list(set(to_modbase.uniprotID.to_list())):
|
|
|
921 |
quality_score = -999
|
922 |
|
923 |
print()
|
924 |
+
st.write('Checkpoint10')
|
925 |
if len(modbase_fasta) != 0:
|
926 |
modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
|
927 |
else:
|
|
|
1230 |
rest.drop(['index'], axis=1, inplace=True)
|
1231 |
rest = rest.astype('str')
|
1232 |
to_modbase_size = 0
|
1233 |
+
st.write('Checkpoint11')
|
1234 |
print('Modbase matching is completed...\n')
|
1235 |
print('SUMMARY')
|
1236 |
print('-------')
|
|
|
1302 |
swiss = None
|
1303 |
modbase = None
|
1304 |
rest = None
|
1305 |
+
st.write('Checkpoint12')
|
1306 |
print('Generating FreeSASA files...')
|
1307 |
print('------------------------------------\n')
|
1308 |
# Folder for the calculated RSA values.
|