fatmacankara committed on
Commit
f4dc3e4
·
1 Parent(s): 9e94583

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +208 -202
code/pdb_featureVector.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  # IMPORT NECESSARY MODULES AND LIBRARIES
3
  from timeit import default_timer as timer
4
  import xml.etree.ElementTree as ET
@@ -26,13 +25,13 @@ from Bio.PDB import PDBList
26
  from Bio import Align
27
  from Bio import SeqIO
28
  from Bio.PDB import *
 
29
  warnings.filterwarnings("ignore")
30
  start = timer()
31
  import streamlit as st
32
  # FUNCTIONS
33
 
34
 
35
-
36
  # FUNCTIONS
37
  from calc_pc_property import *
38
  from add_domains import *
@@ -58,14 +57,16 @@ def pdb(input_set, mode, impute):
58
  Add datapoint identifier and remove non-standard input.
59
  """
60
  data = clean_data(input_set)
61
- path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode)
 
62
  out_path = path_to_output_files / 'log.txt'
63
  sys.stdout = open(out_path, 'w')
64
  print('Creating directories...')
65
 
66
  annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
67
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
68
- 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
 
69
  'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
70
  'transitPeptide', 'glycosylation', 'propeptide']
71
 
@@ -140,12 +141,14 @@ def pdb(input_set, mode, impute):
140
  if wt == can:
141
  data.at[i, 'wt_sequence_match'] = 'm'
142
  elif wt != can:
143
- isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
 
144
  for k in isoList:
145
  if len(k) >= int(data.at[i, 'pos']):
146
  resInIso = k[int(int(data.at[i, 'pos']) - 1)]
147
  if wt == resInIso:
148
- whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
 
149
  data.at[i, 'wt_sequence_match'] = 'i'
150
  data.at[i, 'whichIsoform'] = whichIsoform
151
  break
@@ -190,24 +193,16 @@ def pdb(input_set, mode, impute):
190
  for prot in protein:
191
  pdbs.append(get_pdb_ids(prot))
192
  print('PDBs', pdbs)
193
- if len(pdbs)>=1:
194
  print('pdbs not empty')
195
  pdbs = [item for sublist in pdbs for item in sublist]
196
  print('NEW', pdbs)
197
  else:
198
  print('pdbs empty')
199
- pdbs =[]
200
  print('Processing PDB structures...\n')
201
  if pdbs == []:
202
  print('No PDB structure found for the query. ')
203
- """
204
- try:
205
- pdbs = [j.strip('[').strip(']').strip().strip('\'').strip('\"') for j in
206
- ((',').join([str(item) for item in pdbs])).split(',')]
207
- except IndexError:
208
- pdbs = []
209
- print('No PDB structure found for the query. ')
210
- """
211
  print('Starting PDB structures download...\n')
212
  pdbs = list(filter(None, pdbs))
213
  pdbs = (set(pdbs))
@@ -219,59 +214,70 @@ def pdb(input_set, mode, impute):
219
  try:
220
  shutil.rmtree('obsolete')
221
  except OSError as e:
222
- pass
223
- existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
224
- st.write('existing_pdb')
225
- st.write(existing_pdb)
226
- existing_pdb = [str(i) for i in existing_pdb]
227
- existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
228
  cnt = 0
229
  st.write('this is the pdbs', pdbs)
 
 
 
 
 
 
 
 
 
230
  for search in pdbs:
231
- st.write('searching for pdb:', search)
 
 
232
  try:
233
- if search.lower() not in existing_pdb:
234
- path_pdb = 'out_files/pdb/pdb_structures'
235
- st.write('path for pdb: ',path_pdb)
236
- file = pdbl.retrieve_pdb_file(search, pdir=path_pdb, file_format="pdb")
237
- st.write('file: ',file)
238
- existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
239
- st.write('after download:', existing_pdb)
240
- st.write(Path(path_to_output_files/'pdb_structures') == path_pdb)
241
- existing_pdb = list(path_pdb.glob("*"))
242
- st.write('after download:', existing_pdb)
243
- else:
244
- print('PDB structure file exists..')
245
- for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
246
- filename_replace_ext = filename.with_suffix(".pdb")
247
- filename.rename(filename_replace_ext)
248
-
249
- file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
250
-
251
- base = os.path.splitext(str(file))[0]
252
- base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
253
- os.rename(file, base + ".ent")
254
- file = base + '.ent'
255
-
256
- resolution_method = parser.get_structure(search, file)
257
- for record in SeqIO.parse(file, "pdb-seqres"):
258
- if record.dbxrefs[0].split(':')[0] == 'UNP':
259
- pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
260
- pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
261
- pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
262
- pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1]
263
- pdb_info.at[index, 'pdbID'] = record.id.split(':')[0]
264
- pdb_info.at[index, 'chain'] = record.annotations["chain"]
265
- pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
266
- index += 1
267
- except IndexError as a:
268
- st.write(a)
269
- pdb_info.at[index, 'uniprotID'] = 'nan'
270
- pdb_info.at[index, 'pdbID'] = 'nan'
271
- pdb_info.at[index, 'chain'] = 'nan'
272
- pdb_info.at[index, 'resolution'] = 'nan'
273
- cnt +=1
 
 
274
  print()
 
 
275
  print('PDB file processing finished..')
276
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
277
  try:
@@ -324,13 +330,11 @@ def pdb(input_set, mode, impute):
324
  TypeError
325
  with_pdb.at[i, 'pdbInfo'] = 'nan'
326
 
327
- with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
328
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
329
  'wt_sequence_match',
330
  'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
331
 
332
-
333
-
334
  # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
335
  # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
336
  # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
@@ -344,7 +348,8 @@ def pdb(input_set, mode, impute):
344
  if len(with_pdb) > 0:
345
  with_pdb = add_annotations(with_pdb)
346
  else:
347
- new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
 
348
  'activeSite',
349
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
350
  'crosslink', 'mutagenesis', 'strand',
@@ -363,7 +368,7 @@ def pdb(input_set, mode, impute):
363
  'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
364
  'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
365
  'glycosylationBinary', 'propeptideBinary']
366
- with_pdb = pd.DataFrame(columns = new_cols)
367
  try:
368
  with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
369
  except:
@@ -375,7 +380,7 @@ def pdb(input_set, mode, impute):
375
  with_pdb.replace({'[]': 'nan'}, inplace=True)
376
  with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
377
  with_pdb.replace({'': 'nan'}, inplace=True)
378
-
379
  """
380
  STEP 7
381
  Do alignment for PDB
@@ -407,11 +412,11 @@ def pdb(input_set, mode, impute):
407
  pdb_fasta = None
408
  pdb_info = None
409
  pdbs = None
410
- existing_pdb = None
 
411
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
412
  with_pdb = None
413
-
414
-
415
  print('Aligning sequences...\n')
416
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
417
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
@@ -434,7 +439,6 @@ def pdb(input_set, mode, impute):
434
  aligned_m = aligned_m.astype(str)
435
  aligned_nm = aligned_nm.astype(str)
436
 
437
-
438
  frames = [aligned_m, aligned_nm]
439
  after_up_pdb_alignment = pd.concat(frames, sort=False)
440
  if len(after_up_pdb_alignment) == 0:
@@ -457,7 +461,6 @@ def pdb(input_set, mode, impute):
457
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
458
  no_pdb = no_pdb.copy()
459
 
460
-
461
  print('PDB matching is completed...\n')
462
  print('SUMMARY')
463
  print('-------')
@@ -472,7 +475,6 @@ def pdb(input_set, mode, impute):
472
  print('--%d will be searched in Swiss-Model database.\n' % (
473
  len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
474
 
475
-
476
  dfM = None
477
  dfNM = None
478
  aligned_nm = None
@@ -528,7 +530,8 @@ def pdb(input_set, mode, impute):
528
  swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
529
  dtype=str, header=None, skiprows=1,
530
  names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
531
- 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm','seqid', 'url'])
 
532
 
533
  else:
534
  swiss_model = pd.DataFrame(
@@ -548,13 +551,13 @@ def pdb(input_set, mode, impute):
548
  swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
549
  else:
550
  swiss_model.at[ind, 'whichIsoform'] = 'nan'
551
- # swiss_model.drop(['input'], axis=1, inplace=True)
552
  swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
553
  print('Index File Processed...\n')
554
 
555
-
556
  # Get relevant columns
557
- swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
 
558
  # Sort models by qmean score and sequence identity. Some proteins have more than one model; we will pick one.
559
  swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
560
  swiss_model.reset_index(inplace=True)
@@ -711,7 +714,6 @@ def pdb(input_set, mode, impute):
711
  ascending=[True, False])
712
  swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
713
 
714
-
715
  swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
716
  swiss_models_with_data.reset_index(inplace=True)
717
  swiss_models_with_data.drop(['index'], axis=1, inplace=True)
@@ -728,7 +730,6 @@ def pdb(input_set, mode, impute):
728
 
729
  swiss_models_with_data = swiss_models_with_data1.copy()
730
 
731
-
732
  swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
733
  swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
734
  axis=0, ascending=[True, True, True, False])
@@ -738,7 +739,8 @@ def pdb(input_set, mode, impute):
738
  keep='first')
739
  swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
740
  swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
741
- len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(broken_swiss.drop_duplicates(['datapoint'])) + len(
 
742
  no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
743
  # This printed data here includes all possible models with different qualities,
744
  # because we may get a hit in either of them.
@@ -765,10 +767,10 @@ def pdb(input_set, mode, impute):
765
 
766
  swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
767
  swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
768
- swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, path_to_output_files / 'alignment_files')
 
769
  swiss_models_with_data = None
770
 
771
-
772
  if len(swiss_model_aligned) == 0:
773
  swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
774
  swiss_model_aligned['qmean_norm'] = 'nan'
@@ -861,7 +863,7 @@ def pdb(input_set, mode, impute):
861
  url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
862
  print(url)
863
  req = requests.get(url)
864
- name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
865
  with open(name, 'wb') as f:
866
  f.write(req.content)
867
  else:
@@ -878,7 +880,7 @@ def pdb(input_set, mode, impute):
878
  individual.write(str('UniProt ID: ' + protein))
879
  individual.write('\n')
880
  individual.write(str(pdb.contents[3])[10:-11].strip())
881
- with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt',
882
  encoding="utf8") as f:
883
  fasta = ''
884
  chain = ''
@@ -961,7 +963,6 @@ def pdb(input_set, mode, impute):
961
  existing_modbase_models = None
962
  existing_modbase_models_ind = None
963
 
964
-
965
  model_info_added = model_info_added.drop(['UniprotID'], axis=1)
966
  model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
967
  'PDBCode': 'template', 'PDBChain': 'chain',
@@ -1014,7 +1015,8 @@ def pdb(input_set, mode, impute):
1014
  with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
1015
  axis=0,
1016
  ascending=[True, True, True, True, False, True, False])
1017
- with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], keep='first')
 
1018
 
1019
  with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
1020
  with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
@@ -1028,7 +1030,6 @@ def pdb(input_set, mode, impute):
1028
  with_modbase_info.reset_index(inplace=True)
1029
  with_modbase_info.drop('index', axis=1, inplace=True)
1030
 
1031
-
1032
  align = with_modbase_info[
1033
  with_modbase_info.fasta != 'nan']
1034
  yes_pdb_no_match = with_modbase_info[
@@ -1047,7 +1048,6 @@ def pdb(input_set, mode, impute):
1047
  modbase_aligned = modbase_aligned.astype(str)
1048
  modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
1049
 
1050
-
1051
  # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at this point).
1052
  if len(with_modbase_info) != 0:
1053
  not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
@@ -1055,29 +1055,30 @@ def pdb(input_set, mode, impute):
1055
  ['datapoint'],
1056
  keep=False)
1057
  else:
1058
- not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1059
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1060
- 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
1061
- 'intMet',
1062
- 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1063
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1064
- 'crosslink',
1065
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1066
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1067
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1068
- 'coiledCoil',
1069
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1070
- 'disulfide',
1071
- 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
1072
- 'activeSite',
1073
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1074
- 'crosslink',
1075
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1076
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1077
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1078
- 'coiledCoil',
1079
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1080
- 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
 
1081
  with_modbase_info = None
1082
  if len(not_in_aligned) != 0:
1083
  not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
@@ -1094,7 +1095,8 @@ def pdb(input_set, mode, impute):
1094
  nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
1095
  not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
1096
  not_nan.score = not_nan.score.astype(float)
1097
- not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], inplace=True)
 
1098
 
1099
  not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
1100
  ascending=[True, True, False])
@@ -1106,7 +1108,7 @@ def pdb(input_set, mode, impute):
1106
  which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
1107
  if len(which_ones_are_match) == 0:
1108
  which_ones_are_match = pd.DataFrame(
1109
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1110
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1111
  'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1112
  'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
@@ -1142,7 +1144,6 @@ def pdb(input_set, mode, impute):
1142
  not_nan = None
1143
  nan = None
1144
 
1145
-
1146
  # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
1147
 
1148
  # No model
@@ -1171,9 +1172,10 @@ def pdb(input_set, mode, impute):
1171
  elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
1172
  rest = no_info
1173
  else:
1174
- rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1175
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1176
- 'wt_sequence_match', 'whichIsoform', 'datapoint'])
 
1177
 
1178
  rest = rest[to_swiss_columns]
1179
  rest = rest.drop_duplicates()
@@ -1185,49 +1187,53 @@ def pdb(input_set, mode, impute):
1185
 
1186
  else:
1187
 
1188
- modbase_match = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1189
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1190
- 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1191
- 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1192
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1193
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1194
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1195
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1196
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1197
- 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1198
- 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1199
- 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1200
- 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1201
- 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1202
- 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1203
- 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1204
- 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1205
- 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1206
- 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
1207
- 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
1208
- 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
1209
- not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1210
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1211
- 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1212
- 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1213
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1214
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1215
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1216
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1217
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
1218
- 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1219
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1220
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1221
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1222
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1223
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1224
- 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
1225
- no_info = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1226
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1227
- 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1228
- rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1229
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1230
- 'wt_sequence_match', 'whichIsoform', 'datapoint'])
 
 
 
 
1231
 
1232
  rest = rest[to_swiss_columns]
1233
  rest = rest.drop_duplicates()
@@ -1263,7 +1269,6 @@ def pdb(input_set, mode, impute):
1263
  not_models = None
1264
  modbase_not_match = None
1265
 
1266
-
1267
  # Final corrections
1268
 
1269
  # Now 3D alignment.
@@ -1285,7 +1290,6 @@ def pdb(input_set, mode, impute):
1285
 
1286
  # Fix the axes and merge all data.
1287
 
1288
-
1289
  pdb.drop(['pdbInfo'], axis=1, inplace=True)
1290
  pdb.rename(columns={'resolution': 'score'}, inplace=True)
1291
  swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
@@ -1298,7 +1302,6 @@ def pdb(input_set, mode, impute):
1298
  modbase['source'] = 'MODBASE'
1299
  data = pd.concat([swiss, modbase, pdb])
1300
 
1301
-
1302
  data.reset_index(inplace=True)
1303
  data.drop(['index'], axis=1, inplace=True)
1304
  data = data.astype('str')
@@ -1322,10 +1325,10 @@ def pdb(input_set, mode, impute):
1322
  for pdbID in pdb_only.pdbID.to_list():
1323
  if pdbID not in existing_free_sasa:
1324
  (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
1325
- Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
 
1326
  outdir=None, force_rerun=False, file_type='pdb'))
1327
 
1328
-
1329
  print('Calculation RSA for SwissModel Files...\n')
1330
  swiss_only = data[data.source == 'SWISSMODEL']
1331
  swiss_dp = []
@@ -1343,7 +1346,8 @@ def pdb(input_set, mode, impute):
1343
  for pdbID in modbase_only.pdbID.to_list():
1344
  if pdbID not in existing_free_sasa:
1345
  (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
1346
- Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
 
1347
  outdir=None, force_rerun=False, file_type='pdb'))
1348
 
1349
  # This annotation list is different from the previous one; keep it.
@@ -1381,16 +1385,18 @@ def pdb(input_set, mode, impute):
1381
  chain = data.at[i, 'chain']
1382
  uniprotID = data.at[i, 'uniprotID']
1383
  pdbID = data.at[i, 'pdbID']
1384
- alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
 
1385
  mutPos = data.at[i, 'mutationPositionOnPDB']
1386
  try:
1387
- coordMut = get_coords(mutPos, alignments , 'nan', 'nan', mode)[0]
1388
  except:
1389
  ValueError
1390
  coordMut = 'nan'
1391
  try:
1392
  sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
1393
- data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos, data.at[i, 'wt'], mode, path_to_output_files,file_type = 'pdb')
 
1394
  except:
1395
  ValueError
1396
  data.at[i, 'sasa'] = 'nan' # mutation position is nan
@@ -1438,11 +1444,9 @@ def pdb(input_set, mode, impute):
1438
  data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
1439
  float(data.at[i, 'domainEndonPDB']))
1440
 
1441
-
1442
  data = data.astype(str)
1443
  data.replace({'NaN': 'nan'}, inplace=True)
1444
 
1445
-
1446
  # Now unify all 3 separate data sets. We have with_pdb: the ones that have PDB structures, swiss, modbase, the ones that didn't match with any, and the ones that didn't have a wt sequence match.
1447
 
1448
  # Get interface positions from ECLAIR. Download HQ human
@@ -1463,28 +1467,29 @@ def pdb(input_set, mode, impute):
1463
  interface_dataframe.columns = ['uniprotID', 'positions']
1464
 
1465
  if len(data) == 0:
1466
- data = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1467
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1468
- 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
1469
- 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
1470
- 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
1471
- 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
1472
- 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1473
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1474
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1475
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1476
- 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1477
- 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1478
- 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1479
- 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1480
- 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1481
- 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1482
- 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1483
- 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1484
- 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1485
- 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
1486
- 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
1487
- 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
 
1488
  else:
1489
  data.sasa = data.sasa.astype('str')
1490
 
@@ -1523,7 +1528,6 @@ def pdb(input_set, mode, impute):
1523
 
1524
  data.drop(['positions'], axis=1, inplace=True)
1525
 
1526
-
1527
  # OPTIONAL
1528
  # DOMAIN SELECTION
1529
  # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
@@ -1542,7 +1546,8 @@ def pdb(input_set, mode, impute):
1542
  # nan--> 0, 0 -->1 and 1 -->2
1543
 
1544
  print('Final adjustments are being done...\n')
1545
- binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary',
 
1546
  'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1547
  'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1548
  'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
@@ -1644,7 +1649,8 @@ def pdb(input_set, mode, impute):
1644
  ready = data.copy()
1645
  # Imputation
1646
  if (impute == 'True') or (impute == 'true') or (impute == True):
1647
- filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, 16.82,
 
1648
  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
1649
  col_index = 0
1650
  for col_ in ready.columns[-30:]:
@@ -1659,7 +1665,8 @@ def pdb(input_set, mode, impute):
1659
  ready = ready.replace({'nan': np.NaN})
1660
  ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
1661
  if len(ready) == 0:
1662
- print('No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
 
1663
  print(ready)
1664
  print('Feature vector successfully created...')
1665
  return ready
@@ -1669,5 +1676,4 @@ def pdb(input_set, mode, impute):
1669
  minutes, seconds = divmod(rem, 60)
1670
  print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
1671
  sys.stdout.close()
1672
- return ready
1673
-
 
 
1
  # IMPORT NECESSARY MODULES AND LIBRARIES
2
  from timeit import default_timer as timer
3
  import xml.etree.ElementTree as ET
 
25
  from Bio import Align
26
  from Bio import SeqIO
27
  from Bio.PDB import *
28
+
29
  warnings.filterwarnings("ignore")
30
  start = timer()
31
  import streamlit as st
32
  # FUNCTIONS
33
 
34
 
 
35
  # FUNCTIONS
36
  from calc_pc_property import *
37
  from add_domains import *
 
57
  Add datapoint identifier and remove non-standard input.
58
  """
59
  data = clean_data(input_set)
60
+ path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(
61
+ mode)
62
  out_path = path_to_output_files / 'log.txt'
63
  sys.stdout = open(out_path, 'w')
64
  print('Creating directories...')
65
 
66
  annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
67
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
68
+ 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
69
+ 'region',
70
  'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
71
  'transitPeptide', 'glycosylation', 'propeptide']
72
 
 
141
  if wt == can:
142
  data.at[i, 'wt_sequence_match'] = 'm'
143
  elif wt != can:
144
+ isoList = isoform_fasta[
145
+ isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
146
  for k in isoList:
147
  if len(k) >= int(data.at[i, 'pos']):
148
  resInIso = k[int(int(data.at[i, 'pos']) - 1)]
149
  if wt == resInIso:
150
+ whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[
151
+ 0]
152
  data.at[i, 'wt_sequence_match'] = 'i'
153
  data.at[i, 'whichIsoform'] = whichIsoform
154
  break
 
193
  for prot in protein:
194
  pdbs.append(get_pdb_ids(prot))
195
  print('PDBs', pdbs)
196
+ if len(pdbs) >= 1:
197
  print('pdbs not empty')
198
  pdbs = [item for sublist in pdbs for item in sublist]
199
  print('NEW', pdbs)
200
  else:
201
  print('pdbs empty')
202
+ pdbs = []
203
  print('Processing PDB structures...\n')
204
  if pdbs == []:
205
  print('No PDB structure found for the query. ')
 
 
 
 
 
 
 
 
206
  print('Starting PDB structures download...\n')
207
  pdbs = list(filter(None, pdbs))
208
  pdbs = (set(pdbs))
 
214
  try:
215
  shutil.rmtree('obsolete')
216
  except OSError as e:
217
+ pass
218
+
 
 
 
 
219
  cnt = 0
220
  st.write('this is the pdbs', pdbs)
221
def fetch_uniprot_ids(pdb_code):
    """Look up the identifiers that the PDBe mappings API returns for a PDB entry.

    Queries ``https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/<pdb_code>`` and
    returns the keys of the innermost dictionary of the JSON response
    (first top-level value -> first value below it -> its keys).
    NOTE(review): these keys are presumably UniProt accessions -- confirm
    against the PDBe API response schema.

    Parameters
    ----------
    pdb_code : str
        PDB identifier that is interpolated into the request URL.

    Returns
    -------
    list
        The identifiers found, or an empty list when the request fails, times
        out, returns a non-success status, or the response does not have the
        expected nested-dictionary shape.
    """
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}"
    try:
        # A timeout keeps one stalled connection from blocking the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Check for a successful response
        payload = response.json()
        first_entry = list(payload.values())[0]
        first_mapping = list(first_entry.values())[0]
        uniprot_ids = list(first_mapping.keys())
    except (requests.RequestException, ValueError, AttributeError, IndexError, TypeError):
        # Best effort: network/HTTP errors, invalid JSON or an unexpected
        # response shape all mean "no mapping available" for this entry.
        return []
    st.write(uniprot_ids)
    return uniprot_ids
230
  for search in pdbs:
231
+ # Step 1: Fetch the PDB file
232
+ pdb_url = f"https://files.rcsb.org/download/{search}.pdb"
233
+ st.write(pdb_url)
234
  try:
235
+ response = requests.get(pdb_url)
236
+ st.write('response', response)
237
+ response.raise_for_status() # Check for a successful response
238
+ except :
239
+ continue # Skip to the next PDB code if fetching fails
240
+ st.write('response2', response)
241
+ # Step 2: Parse the PDB file from memory
242
+ pdb_data = response.text
243
+ pdb_parser = PDBParser(QUIET=True) # QUIET=True suppresses warnings
244
+ pdb_file_content = StringIO(pdb_data)
245
+ structure = pdb_parser.get_structure(pdb_code, pdb_file_content)
246
+ st.write(structure)
247
+ ppb = PPBuilder()
248
+ for model in structure:
249
+ st.write(model)
250
+ for pp in ppb.build_peptides(model):
251
+ sequence = pp.get_sequence()
252
+ st.write(sequence)
253
+ for chain in model:
254
+ chain_id = chain.get_id()
255
+ # Extract UniProt ID if available in the chain's annotations
256
+ uniprot_ids = fetch_uniprot_ids(search)
257
+ # Get the resolution from the PDB header
258
+ header = structure.header
259
+ resolution = header.get('resolution', 'N/A')
260
+ # Print UniProt IDs, chain ID, and resolution for the current model
261
+ for i, chain in enumerate(model, start=1):
262
+ chain_id = chain.get_id()
263
+ st.write(f"---- Information for Chain {chain_id} in Model {i} ----")
264
+ st.write(f"UniProt IDs: {', '.join(uniprot_ids)}")
265
+ st.write(f"Chain ID: {chain_id}")
266
+ st.write(f"PDB ID: {search.upper()}")
267
+ st.write(f"Resolution: {resolution}")
268
+ st.write(f"Sequence: {sequence}")
269
+ pdb_fasta.at[index, 'pdbID'] = search
270
+ pdb_fasta.at[index, 'chain'] = chain_id
271
+ pdb_fasta.at[index, 'pdbSequence'] = str(sequence)
272
+ pdb_info.at[index, 'uniprotID'] = ', '.join(uniprot_ids)
273
+ pdb_info.at[index, 'pdbID'] = search
274
+ pdb_info.at[index, 'chain'] = chain_id
275
+ pdb_info.at[index, 'resolution'] = resolution
276
+ index += 1
277
+
278
  print()
279
+ st.write()
280
+ st.write(pdb_info)
281
  print('PDB file processing finished..')
282
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
283
  try:
 
330
  TypeError
331
  with_pdb.at[i, 'pdbInfo'] = 'nan'
332
 
333
+ with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
334
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
335
  'wt_sequence_match',
336
  'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
337
 
 
 
338
  # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
339
  # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
340
  # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
 
348
  if len(with_pdb) > 0:
349
  with_pdb = add_annotations(with_pdb)
350
  else:
351
+ new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
352
+ 'dnaBinding',
353
  'activeSite',
354
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
355
  'crosslink', 'mutagenesis', 'strand',
 
368
  'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
369
  'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
370
  'glycosylationBinary', 'propeptideBinary']
371
+ with_pdb = pd.DataFrame(columns=new_cols)
372
  try:
373
  with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
374
  except:
 
380
  with_pdb.replace({'[]': 'nan'}, inplace=True)
381
  with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
382
  with_pdb.replace({'': 'nan'}, inplace=True)
383
+
384
  """
385
  STEP 7
386
  Do alignment for PDB
 
412
  pdb_fasta = None
413
  pdb_info = None
414
  pdbs = None
415
+
416
+ g_pdb = None
417
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
418
  with_pdb = None
419
+
 
420
  print('Aligning sequences...\n')
421
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
422
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
 
439
  aligned_m = aligned_m.astype(str)
440
  aligned_nm = aligned_nm.astype(str)
441
 
 
442
  frames = [aligned_m, aligned_nm]
443
  after_up_pdb_alignment = pd.concat(frames, sort=False)
444
  if len(after_up_pdb_alignment) == 0:
 
461
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
462
  no_pdb = no_pdb.copy()
463
 
 
464
  print('PDB matching is completed...\n')
465
  print('SUMMARY')
466
  print('-------')
 
475
  print('--%d will be searched in Swiss-Model database.\n' % (
476
  len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
477
 
 
478
  dfM = None
479
  dfNM = None
480
  aligned_nm = None
 
530
  swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
531
  dtype=str, header=None, skiprows=1,
532
  names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
533
+ 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
534
+ 'qmean_norm', 'seqid', 'url'])
535
 
536
  else:
537
  swiss_model = pd.DataFrame(
 
551
  swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
552
  else:
553
  swiss_model.at[ind, 'whichIsoform'] = 'nan'
554
+ # swiss_model.drop(['input'], axis=1, inplace=True)
555
  swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
556
  print('Index File Processed...\n')
557
 
 
558
  # Get relevant columns
559
+ swiss_model = swiss_model[
560
+ ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
561
  # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
562
  swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
563
  swiss_model.reset_index(inplace=True)
 
714
  ascending=[True, False])
715
  swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
716
 
 
717
  swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
718
  swiss_models_with_data.reset_index(inplace=True)
719
  swiss_models_with_data.drop(['index'], axis=1, inplace=True)
 
730
 
731
  swiss_models_with_data = swiss_models_with_data1.copy()
732
 
 
733
  swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
734
  swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
735
  axis=0, ascending=[True, True, True, False])
 
739
  keep='first')
740
  swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
741
  swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
742
+ len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
743
+ broken_swiss.drop_duplicates(['datapoint'])) + len(
744
  no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
745
  # This printed data here includes all possible models with different qualities,
746
  # because we may get a hit in either of them.
 
767
 
768
  swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
769
  swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
770
+ swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
771
+ path_to_output_files / 'alignment_files')
772
  swiss_models_with_data = None
773
 
 
774
  if len(swiss_model_aligned) == 0:
775
  swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
776
  swiss_model_aligned['qmean_norm'] = 'nan'
 
863
  url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
864
  print(url)
865
  req = requests.get(url)
866
+ name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
867
  with open(name, 'wb') as f:
868
  f.write(req.content)
869
  else:
 
880
  individual.write(str('UniProt ID: ' + protein))
881
  individual.write('\n')
882
  individual.write(str(pdb.contents[3])[10:-11].strip())
883
+ with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt',
884
  encoding="utf8") as f:
885
  fasta = ''
886
  chain = ''
 
963
  existing_modbase_models = None
964
  existing_modbase_models_ind = None
965
 
 
966
  model_info_added = model_info_added.drop(['UniprotID'], axis=1)
967
  model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
968
  'PDBCode': 'template', 'PDBChain': 'chain',
 
1015
  with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
1016
  axis=0,
1017
  ascending=[True, True, True, True, False, True, False])
1018
+ with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
1019
+ keep='first')
1020
 
1021
  with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
1022
  with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
 
1030
  with_modbase_info.reset_index(inplace=True)
1031
  with_modbase_info.drop('index', axis=1, inplace=True)
1032
 
 
1033
  align = with_modbase_info[
1034
  with_modbase_info.fasta != 'nan']
1035
  yes_pdb_no_match = with_modbase_info[
 
1048
  modbase_aligned = modbase_aligned.astype(str)
1049
  modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
1050
 
 
1051
  # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at this point).
1052
  if len(with_modbase_info) != 0:
1053
  not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
 
1055
  ['datapoint'],
1056
  keep=False)
1057
  else:
1058
+ not_in_aligned = pd.DataFrame(
1059
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1060
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1061
+ 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
1062
+ 'intMet',
1063
+ 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1064
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1065
+ 'crosslink',
1066
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1067
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1068
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1069
+ 'coiledCoil',
1070
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1071
+ 'disulfide',
1072
+ 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
1073
+ 'activeSite',
1074
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1075
+ 'crosslink',
1076
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1077
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1078
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1079
+ 'coiledCoil',
1080
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1081
+ 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
1082
  with_modbase_info = None
1083
  if len(not_in_aligned) != 0:
1084
  not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
 
1095
  nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
1096
  not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
1097
  not_nan.score = not_nan.score.astype(float)
1098
+ not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
1099
+ inplace=True)
1100
 
1101
  not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
1102
  ascending=[True, True, False])
 
1108
  which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
1109
  if len(which_ones_are_match) == 0:
1110
  which_ones_are_match = pd.DataFrame(
1111
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1112
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1113
  'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1114
  'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
 
1144
  not_nan = None
1145
  nan = None
1146
 
 
1147
  # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
1148
 
1149
  # No model
 
1172
  elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
1173
  rest = no_info
1174
  else:
1175
+ rest = pd.DataFrame(
1176
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1177
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1178
+ 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1179
 
1180
  rest = rest[to_swiss_columns]
1181
  rest = rest.drop_duplicates()
 
1187
 
1188
  else:
1189
 
1190
+ modbase_match = pd.DataFrame(
1191
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1192
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1193
+ 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1194
+ 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1195
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1196
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1197
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1198
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1199
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1200
+ 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1201
+ 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1202
+ 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1203
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1204
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1205
+ 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1206
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1207
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1208
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1209
+ 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
1210
+ 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
1211
+ 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
1212
+ not_in_aligned = pd.DataFrame(
1213
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1214
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1215
+ 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1216
+ 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1217
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1218
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1219
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1220
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1221
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
1222
+ 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1223
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1224
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1225
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1226
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1227
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1228
+ 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
1229
+ no_info = pd.DataFrame(
1230
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1231
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1232
+ 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1233
+ rest = pd.DataFrame(
1234
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1235
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1236
+ 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1237
 
1238
  rest = rest[to_swiss_columns]
1239
  rest = rest.drop_duplicates()
 
1269
  not_models = None
1270
  modbase_not_match = None
1271
 
 
1272
  # Final corrections
1273
 
1274
  # Now 3D alignment.
 
1290
 
1291
  # Fix the axes and merge all data.
1292
 
 
1293
  pdb.drop(['pdbInfo'], axis=1, inplace=True)
1294
  pdb.rename(columns={'resolution': 'score'}, inplace=True)
1295
  swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
 
1302
  modbase['source'] = 'MODBASE'
1303
  data = pd.concat([swiss, modbase, pdb])
1304
 
 
1305
  data.reset_index(inplace=True)
1306
  data.drop(['index'], axis=1, inplace=True)
1307
  data = data.astype('str')
 
1325
  for pdbID in pdb_only.pdbID.to_list():
1326
  if pdbID not in existing_free_sasa:
1327
  (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
1328
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
1329
+ include_hetatms=True,
1330
  outdir=None, force_rerun=False, file_type='pdb'))
1331
 
 
1332
  print('Calculation RSA for SwissModel Files...\n')
1333
  swiss_only = data[data.source == 'SWISSMODEL']
1334
  swiss_dp = []
 
1346
  for pdbID in modbase_only.pdbID.to_list():
1347
  if pdbID not in existing_free_sasa:
1348
  (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
1349
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
1350
+ include_hetatms=True,
1351
  outdir=None, force_rerun=False, file_type='pdb'))
1352
 
1353
  # This annotation list is different than the prev one, keep it.
 
1385
  chain = data.at[i, 'chain']
1386
  uniprotID = data.at[i, 'uniprotID']
1387
  pdbID = data.at[i, 'pdbID']
1388
+ alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
1389
+ Path(path_to_output_files / '3D_alignment'), file_format='gzip')
1390
  mutPos = data.at[i, 'mutationPositionOnPDB']
1391
  try:
1392
+ coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
1393
  except:
1394
  ValueError
1395
  coordMut = 'nan'
1396
  try:
1397
  sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
1398
+ data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
1399
+ data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb')
1400
  except:
1401
  ValueError
1402
  data.at[i, 'sasa'] = 'nan' # mutation position is nan
 
1444
  data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
1445
  float(data.at[i, 'domainEndonPDB']))
1446
 
 
1447
  data = data.astype(str)
1448
  data.replace({'NaN': 'nan'}, inplace=True)
1449
 
 
1450
  # Now unify all 3 separate datasets. We have with_pdb (the ones that have PDB structures), swiss, modbase, the ones that didn't match with any, and the ones that didn't have a wt seq match.
1451
 
1452
  # Get interface positions from ECLAIR. Download HQ human
 
1467
  interface_dataframe.columns = ['uniprotID', 'positions']
1468
 
1469
  if len(data) == 0:
1470
+ data = pd.DataFrame(
1471
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1472
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1473
+ 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
1474
+ 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
1475
+ 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
1476
+ 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
1477
+ 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1478
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1479
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1480
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1481
+ 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1482
+ 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1483
+ 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1484
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1485
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1486
+ 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1487
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1488
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1489
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1490
+ 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
1491
+ 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
1492
+ 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
1493
  else:
1494
  data.sasa = data.sasa.astype('str')
1495
 
 
1528
 
1529
  data.drop(['positions'], axis=1, inplace=True)
1530
 
 
1531
  # OPTIONAL
1532
  # DOMAIN SELECTION
1533
  # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
 
1546
  # nan--> 0, 0 -->1 and 1 -->2
1547
 
1548
  print('Final adjustments are being done...\n')
1549
+ binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
1550
+ 'dnaBindingBinary',
1551
  'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1552
  'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1553
  'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
 
1649
  ready = data.copy()
1650
  # Imputation
1651
  if (impute == 'True') or (impute == 'true') or (impute == True):
1652
+ filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
1653
+ 16.82,
1654
  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
1655
  col_index = 0
1656
  for col_ in ready.columns[-30:]:
 
1665
  ready = ready.replace({'nan': np.NaN})
1666
  ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
1667
  if len(ready) == 0:
1668
+ print(
1669
+ 'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
1670
  print(ready)
1671
  print('Feature vector successfully created...')
1672
  return ready
 
1676
  minutes, seconds = divmod(rem, 60)
1677
  print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
1678
  sys.stdout.close()
1679
+ return ready