fatmacankara committed on
Commit
7c5bea8
·
1 Parent(s): 80cfda5

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +13 -10
code/pdb_featureVector.py CHANGED
@@ -172,6 +172,7 @@ def pdb(input_set, mode, impute):
172
  print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
173
  % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
174
  len(uniprot_matched.drop_duplicates(['datapoint']))))
 
175
 
176
  """
177
  STEP 5
@@ -262,6 +263,7 @@ def pdb(input_set, mode, impute):
262
  cnt +=1
263
  print()
264
  print('PDB file processing finished..')
 
265
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
266
  try:
267
  filename_replace_ext = filename.with_suffix(".pdb")
@@ -325,7 +327,7 @@ def pdb(input_set, mode, impute):
325
  # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
326
  # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
327
  # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
328
-
329
  """
330
  STEP 6
331
  Retrieve sequence annotations.
@@ -366,7 +368,7 @@ def pdb(input_set, mode, impute):
366
  with_pdb.replace({'[]': 'nan'}, inplace=True)
367
  with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
368
  with_pdb.replace({'': 'nan'}, inplace=True)
369
-
370
  """
371
  STEP 7
372
  Do alignment for PDB
@@ -448,7 +450,7 @@ def pdb(input_set, mode, impute):
448
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
449
  no_pdb = no_pdb.copy()
450
 
451
-
452
  print('PDB matching is completed...\n')
453
  print('SUMMARY')
454
  print('-------')
@@ -543,7 +545,7 @@ def pdb(input_set, mode, impute):
543
  swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
544
  print('Index File Processed...\n')
545
 
546
-
547
  # Get relevant columns
548
  swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
549
  # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
@@ -623,7 +625,7 @@ def pdb(input_set, mode, impute):
623
  swiss_model = None
624
  no_swiss_models = None
625
  url_nan = None
626
-
627
  # At this point we have:
628
  # pdb_aligned --- Align in the PDB phase
629
  # not_match_in_uniprot --- This won't be aligned with anything because these proteins don't have a UniProt ID. Only basic info is present.
@@ -713,7 +715,7 @@ def pdb(input_set, mode, impute):
713
  k = pd.Series(swiss_models_with_data.iloc[i])
714
  broken_swiss = broken_swiss.append(k, ignore_index=True)
715
  c += 1
716
-
717
  if len(broken_swiss) == 0:
718
  broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
719
 
@@ -821,7 +823,7 @@ def pdb(input_set, mode, impute):
821
  not_nan = None
822
  which_ones_are_match = None
823
  swiss_not_match = None
824
-
825
  # STEP : GO TO MODBASE
826
  # Should not include anything related to prev models.
827
  if len(to_modbase) != 0:
@@ -843,7 +845,7 @@ def pdb(input_set, mode, impute):
843
 
844
  modbase_reduced = pd.DataFrame()
845
  modbase_fasta = pd.DataFrame()
846
-
847
  print('Retrieving ModBase models...\n')
848
  # Get model files associated with each UniProtID
849
  for protein in list(set(to_modbase.uniprotID.to_list())):
@@ -919,6 +921,7 @@ def pdb(input_set, mode, impute):
919
  quality_score = -999
920
 
921
  print()
 
922
  if len(modbase_fasta) != 0:
923
  modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
924
  else:
@@ -1227,7 +1230,7 @@ def pdb(input_set, mode, impute):
1227
  rest.drop(['index'], axis=1, inplace=True)
1228
  rest = rest.astype('str')
1229
  to_modbase_size = 0
1230
-
1231
  print('Modbase matching is completed...\n')
1232
  print('SUMMARY')
1233
  print('-------')
@@ -1299,7 +1302,7 @@ def pdb(input_set, mode, impute):
1299
  swiss = None
1300
  modbase = None
1301
  rest = None
1302
-
1303
  print('Generating FreeSASA files...')
1304
  print('------------------------------------\n')
1305
  # Folder for calculated RSA values.
 
172
  print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
173
  % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
174
  len(uniprot_matched.drop_duplicates(['datapoint']))))
175
+ st.write('Checkpoint1')
176
 
177
  """
178
  STEP 5
 
263
  cnt +=1
264
  print()
265
  print('PDB file processing finished..')
266
+ st.write('Checkpoint2')
267
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
268
  try:
269
  filename_replace_ext = filename.with_suffix(".pdb")
 
327
  # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
328
  # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
329
  # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
330
+ st.write('Checkpoint3')
331
  """
332
  STEP 6
333
  Retrieve sequence annotations.
 
368
  with_pdb.replace({'[]': 'nan'}, inplace=True)
369
  with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
370
  with_pdb.replace({'': 'nan'}, inplace=True)
371
+ st.write('Checkpoint4')
372
  """
373
  STEP 7
374
  Do alignment for PDB
 
450
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
451
  no_pdb = no_pdb.copy()
452
 
453
+ st.write('Checkpoint5')
454
  print('PDB matching is completed...\n')
455
  print('SUMMARY')
456
  print('-------')
 
545
  swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
546
  print('Index File Processed...\n')
547
 
548
+ st.write('Checkpoint6')
549
  # Get relevant columns
550
  swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
551
  # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
 
625
  swiss_model = None
626
  no_swiss_models = None
627
  url_nan = None
628
+ st.write('Checkpoint7')
629
  # At this point we have:
630
  # pdb_aligned --- Align in the PDB phase
631
  # not_match_in_uniprot --- This won't be aligned with anything because these proteins don't have a UniProt ID. Only basic info is present.
 
715
  k = pd.Series(swiss_models_with_data.iloc[i])
716
  broken_swiss = broken_swiss.append(k, ignore_index=True)
717
  c += 1
718
+ st.write('Checkpoint7')
719
  if len(broken_swiss) == 0:
720
  broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
721
 
 
823
  not_nan = None
824
  which_ones_are_match = None
825
  swiss_not_match = None
826
+ st.write('Checkpoint8')
827
  # STEP : GO TO MODBASE
828
  # Should not include anything related to prev models.
829
  if len(to_modbase) != 0:
 
845
 
846
  modbase_reduced = pd.DataFrame()
847
  modbase_fasta = pd.DataFrame()
848
+ st.write('Checkpoint9')
849
  print('Retrieving ModBase models...\n')
850
  # Get model files associated with each UniProtID
851
  for protein in list(set(to_modbase.uniprotID.to_list())):
 
921
  quality_score = -999
922
 
923
  print()
924
+ st.write('Checkpoint10')
925
  if len(modbase_fasta) != 0:
926
  modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
927
  else:
 
1230
  rest.drop(['index'], axis=1, inplace=True)
1231
  rest = rest.astype('str')
1232
  to_modbase_size = 0
1233
+ st.write('Checkpoint11')
1234
  print('Modbase matching is completed...\n')
1235
  print('SUMMARY')
1236
  print('-------')
 
1302
  swiss = None
1303
  modbase = None
1304
  rest = None
1305
+ st.write('Checkpoint12')
1306
  print('Generating FreeSASA files...')
1307
  print('------------------------------------\n')
1308
  # Folder for calculated RSA values.