fatmacankara committed on
Commit
d955409
·
1 Parent(s): 26f6c21

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +0 -18
code/pdb_featureVector.py CHANGED
@@ -60,7 +60,6 @@ def pdb(input_set, mode, impute):
60
  data = clean_data(input_set)
61
  path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode)
62
  out_path = path_to_output_files / 'log.txt'
63
- st.write(out_path)
64
  sys.stdout = open(out_path, 'w')
65
  print('Creating directories...')
66
 
@@ -226,24 +225,18 @@ def pdb(input_set, mode, impute):
226
  existing_pdb = [str(i) for i in existing_pdb]
227
  existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
228
  cnt = 0
229
- st.write('existing_pdb', existing_pdb)
230
  for search in pdbs:
231
- st.write('PDBS', search)
232
 
233
  try:
234
  if search.lower() not in existing_pdb:
235
- st.write(Path(path_to_output_files / 'pdb_structures'))
236
  file = pdbl.retrieve_pdb_file(search, pdir=Path(path_to_output_files / 'pdb_structures'), file_format="pdb")
237
- st.write(file)
238
  else:
239
  print('PDB structure file exists..')
240
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
241
- st.write('filename', filename)
242
  filename_replace_ext = filename.with_suffix(".pdb")
243
  filename.rename(filename_replace_ext)
244
 
245
  file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
246
- st.write('file', file)
247
 
248
  base = os.path.splitext(str(file))[0]
249
  base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
@@ -253,7 +246,6 @@ def pdb(input_set, mode, impute):
253
  resolution_method = parser.get_structure(search, file)
254
  for record in SeqIO.parse(file, "pdb-seqres"):
255
  if record.dbxrefs[0].split(':')[0] == 'UNP':
256
- st.write('RECORD', record)
257
  pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
258
  pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
259
  pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
@@ -263,7 +255,6 @@ def pdb(input_set, mode, impute):
263
  pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
264
  index += 1
265
  except:
266
- st.write('ERROR INDEX')
267
  IndexError
268
  pdb_info.at[index, 'uniprotID'] = 'nan'
269
  pdb_info.at[index, 'pdbID'] = 'nan'
@@ -288,13 +279,10 @@ def pdb(input_set, mode, impute):
288
  FileNotFoundError
289
 
290
  uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
291
- st.write('pdb_info', pdb_info)
292
  uniprot_matched = uniprot_matched.astype(str)
293
  uniprot_matched = uniprot_matched.drop_duplicates()
294
- st.write('pdb_fasta', pdb_fasta)
295
  uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
296
  uniprot_matched = uniprot_matched.astype(str)
297
- st.write('uniprot_matched', uniprot_matched)
298
 
299
  with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
300
  (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
@@ -304,12 +292,10 @@ def pdb(input_set, mode, impute):
304
  uniprot_matched.resolution == 'None'))]
305
  no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
306
  no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
307
- st.write('with_pdb', with_pdb)
308
  print(
309
  'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
310
  % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])),
311
  len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint']))))
312
- st.write('with_pdb1', with_pdb)
313
 
314
  with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
315
  with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
@@ -388,7 +374,6 @@ def pdb(input_set, mode, impute):
388
  # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences.
389
  with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C')
390
  with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C')
391
- st.write('with_pdb2', with_pdb)
392
 
393
  dfM = with_pdb[with_pdb.wt_sequence_match == 'm']
394
  dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
@@ -415,13 +400,11 @@ def pdb(input_set, mode, impute):
415
  existing_pdb = None
416
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
417
  with_pdb = None
418
- st.write('dfM', dfM)
419
 
420
 
421
  print('Aligning sequences...\n')
422
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
423
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
424
- st.write('aligned_m', aligned_m)
425
  # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
426
  for i in aligned_m.index:
427
  if aligned_m.at[i, 'pdbSequence'] == 'nan':
@@ -463,7 +446,6 @@ def pdb(input_set, mode, impute):
463
  yes_pdb_no_match = after_up_pdb_alignment[
464
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
465
  no_pdb = no_pdb.copy()
466
- st.write('pdb_aligned', pdb_aligned)
467
 
468
 
469
  print('PDB matching is completed...\n')
 
60
  data = clean_data(input_set)
61
  path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode)
62
  out_path = path_to_output_files / 'log.txt'
 
63
  sys.stdout = open(out_path, 'w')
64
  print('Creating directories...')
65
 
 
225
  existing_pdb = [str(i) for i in existing_pdb]
226
  existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
227
  cnt = 0
 
228
  for search in pdbs:
 
229
 
230
  try:
231
  if search.lower() not in existing_pdb:
 
232
  file = pdbl.retrieve_pdb_file(search, pdir=Path(path_to_output_files / 'pdb_structures'), file_format="pdb")
 
233
  else:
234
  print('PDB structure file exists..')
235
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
 
236
  filename_replace_ext = filename.with_suffix(".pdb")
237
  filename.rename(filename_replace_ext)
238
 
239
  file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
 
240
 
241
  base = os.path.splitext(str(file))[0]
242
  base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
 
246
  resolution_method = parser.get_structure(search, file)
247
  for record in SeqIO.parse(file, "pdb-seqres"):
248
  if record.dbxrefs[0].split(':')[0] == 'UNP':
 
249
  pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
250
  pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
251
  pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
 
255
  pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
256
  index += 1
257
  except:
 
258
  IndexError
259
  pdb_info.at[index, 'uniprotID'] = 'nan'
260
  pdb_info.at[index, 'pdbID'] = 'nan'
 
279
  FileNotFoundError
280
 
281
  uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
 
282
  uniprot_matched = uniprot_matched.astype(str)
283
  uniprot_matched = uniprot_matched.drop_duplicates()
 
284
  uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
285
  uniprot_matched = uniprot_matched.astype(str)
 
286
 
287
  with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
288
  (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
 
292
  uniprot_matched.resolution == 'None'))]
293
  no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
294
  no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
 
295
  print(
296
  'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
297
  % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])),
298
  len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint']))))
 
299
 
300
  with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
301
  with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
 
374
  # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences.
375
  with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C')
376
  with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C')
 
377
 
378
  dfM = with_pdb[with_pdb.wt_sequence_match == 'm']
379
  dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
 
400
  existing_pdb = None
401
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
402
  with_pdb = None
 
403
 
404
 
405
  print('Aligning sequences...\n')
406
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
407
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
 
408
  # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them.
409
  for i in aligned_m.index:
410
  if aligned_m.at[i, 'pdbSequence'] == 'nan':
 
446
  yes_pdb_no_match = after_up_pdb_alignment[
447
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
448
  no_pdb = no_pdb.copy()
 
449
 
450
 
451
  print('PDB matching is completed...\n')