fatmacankara committed on
Commit
9e94583
·
1 Parent(s): 42b9925

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +200 -206
code/pdb_featureVector.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # IMPORT NECESSARY MODULES AND LIBRARIES
2
  from timeit import default_timer as timer
3
  import xml.etree.ElementTree as ET
@@ -25,13 +26,13 @@ from Bio.PDB import PDBList
25
  from Bio import Align
26
  from Bio import SeqIO
27
  from Bio.PDB import *
28
-
29
  warnings.filterwarnings("ignore")
30
  start = timer()
31
  import streamlit as st
32
  # FUNCTIONS
33
 
34
 
 
35
  # FUNCTIONS
36
  from calc_pc_property import *
37
  from add_domains import *
@@ -57,16 +58,14 @@ def pdb(input_set, mode, impute):
57
  Add datapoint identifier and remove non-standard input.
58
  """
59
  data = clean_data(input_set)
60
- path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(
61
- mode)
62
  out_path = path_to_output_files / 'log.txt'
63
  sys.stdout = open(out_path, 'w')
64
  print('Creating directories...')
65
 
66
  annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
67
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
68
- 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
69
- 'region',
70
  'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
71
  'transitPeptide', 'glycosylation', 'propeptide']
72
 
@@ -141,14 +140,12 @@ def pdb(input_set, mode, impute):
141
  if wt == can:
142
  data.at[i, 'wt_sequence_match'] = 'm'
143
  elif wt != can:
144
- isoList = isoform_fasta[
145
- isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
146
  for k in isoList:
147
  if len(k) >= int(data.at[i, 'pos']):
148
  resInIso = k[int(int(data.at[i, 'pos']) - 1)]
149
  if wt == resInIso:
150
- whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[
151
- 0]
152
  data.at[i, 'wt_sequence_match'] = 'i'
153
  data.at[i, 'whichIsoform'] = whichIsoform
154
  break
@@ -193,16 +190,24 @@ def pdb(input_set, mode, impute):
193
  for prot in protein:
194
  pdbs.append(get_pdb_ids(prot))
195
  print('PDBs', pdbs)
196
- if len(pdbs) >= 1:
197
  print('pdbs not empty')
198
  pdbs = [item for sublist in pdbs for item in sublist]
199
  print('NEW', pdbs)
200
  else:
201
  print('pdbs empty')
202
- pdbs = []
203
  print('Processing PDB structures...\n')
204
  if pdbs == []:
205
  print('No PDB structure found for the query. ')
 
 
 
 
 
 
 
 
206
  print('Starting PDB structures download...\n')
207
  pdbs = list(filter(None, pdbs))
208
  pdbs = (set(pdbs))
@@ -214,69 +219,59 @@ def pdb(input_set, mode, impute):
214
  try:
215
  shutil.rmtree('obsolete')
216
  except OSError as e:
217
- pass
218
-
 
 
 
 
219
  cnt = 0
220
  st.write('this is the pdbs', pdbs)
221
- def fetch_uniprot_ids(pdb_code):
222
- try:
223
- response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}")
224
- response.raise_for_status() # Check for a successful response
225
- data = response.json()
226
- st.write(list(list(list(data.values())[0].values())[0].keys()))
227
- return list(list(list(data.values())[0].values())[0].keys())
228
- except :
229
- return []
230
  for search in pdbs:
231
- # Step 1: Fetch the PDB file
232
- pdb_url = f"https://files.rcsb.org/download/{search}.pdb"
233
- st.write(pdb_url)
234
  try:
235
- response = requests.get(pdb_url)
236
- st.write('response', response)
237
- response.raise_for_status() # Check for a successful response
238
- except :
239
- continue # Skip to the next PDB code if fetching fails
240
- st.write('response2', response)
241
- # Step 2: Parse the PDB file from memory
242
- pdb_data = response.text
243
- pdb_parser = PDBParser(QUIET=True) # QUIET=True suppresses warnings
244
- pdb_file_content = StringIO(pdb_data)
245
- structure = pdb_parser.get_structure(pdb_code, pdb_file_content)
246
- ppb = PPBuilder()
247
- for model in structure:
248
- st.write(model)
249
- for pp in ppb.build_peptides(model):
250
- sequence = pp.get_sequence()
251
- st.write(sequence)
252
- for chain in model:
253
- chain_id = chain.get_id()
254
- # Extract UniProt ID if available in the chain's annotations
255
- uniprot_ids = fetch_uniprot_ids(search)
256
- # Get the resolution from the PDB header
257
- header = structure.header
258
- resolution = header.get('resolution', 'N/A')
259
- # Print UniProt IDs, chain ID, and resolution for the current model
260
- for i, chain in enumerate(model, start=1):
261
- chain_id = chain.get_id()
262
- st.write(f"---- Information for Chain {chain_id} in Model {i} ----")
263
- st.write(f"UniProt IDs: {', '.join(uniprot_ids)}")
264
- st.write(f"Chain ID: {chain_id}")
265
- st.write(f"PDB ID: {search.upper()}")
266
- st.write(f"Resolution: {resolution}")
267
- st.write(f"Sequence: {sequence}")
268
- pdb_fasta.at[index, 'pdbID'] = search
269
- pdb_fasta.at[index, 'chain'] = chain_id
270
- pdb_fasta.at[index, 'pdbSequence'] = str(sequence)
271
- pdb_info.at[index, 'uniprotID'] = ', '.join(uniprot_ids)
272
- pdb_info.at[index, 'pdbID'] = search
273
- pdb_info.at[index, 'chain'] = chain_id
274
- pdb_info.at[index, 'resolution'] = resolution
275
- index += 1
276
-
277
  print()
278
- st.write()
279
- st.write(pdb_info)
280
  print('PDB file processing finished..')
281
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
282
  try:
@@ -329,11 +324,13 @@ def pdb(input_set, mode, impute):
329
  TypeError
330
  with_pdb.at[i, 'pdbInfo'] = 'nan'
331
 
332
- with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
333
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
334
  'wt_sequence_match',
335
  'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
336
 
 
 
337
  # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
338
  # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
339
  # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
@@ -347,8 +344,7 @@ def pdb(input_set, mode, impute):
347
  if len(with_pdb) > 0:
348
  with_pdb = add_annotations(with_pdb)
349
  else:
350
- new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
351
- 'dnaBinding',
352
  'activeSite',
353
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
354
  'crosslink', 'mutagenesis', 'strand',
@@ -367,7 +363,7 @@ def pdb(input_set, mode, impute):
367
  'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
368
  'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
369
  'glycosylationBinary', 'propeptideBinary']
370
- with_pdb = pd.DataFrame(columns=new_cols)
371
  try:
372
  with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
373
  except:
@@ -379,7 +375,7 @@ def pdb(input_set, mode, impute):
379
  with_pdb.replace({'[]': 'nan'}, inplace=True)
380
  with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
381
  with_pdb.replace({'': 'nan'}, inplace=True)
382
-
383
  """
384
  STEP 7
385
  Do alignment for PDB
@@ -411,11 +407,11 @@ def pdb(input_set, mode, impute):
411
  pdb_fasta = None
412
  pdb_info = None
413
  pdbs = None
414
-
415
- g_pdb = None
416
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
417
  with_pdb = None
418
-
 
419
  print('Aligning sequences...\n')
420
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
421
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
@@ -438,6 +434,7 @@ def pdb(input_set, mode, impute):
438
  aligned_m = aligned_m.astype(str)
439
  aligned_nm = aligned_nm.astype(str)
440
 
 
441
  frames = [aligned_m, aligned_nm]
442
  after_up_pdb_alignment = pd.concat(frames, sort=False)
443
  if len(after_up_pdb_alignment) == 0:
@@ -460,6 +457,7 @@ def pdb(input_set, mode, impute):
460
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
461
  no_pdb = no_pdb.copy()
462
 
 
463
  print('PDB matching is completed...\n')
464
  print('SUMMARY')
465
  print('-------')
@@ -474,6 +472,7 @@ def pdb(input_set, mode, impute):
474
  print('--%d will be searched in Swiss-Model database.\n' % (
475
  len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
476
 
 
477
  dfM = None
478
  dfNM = None
479
  aligned_nm = None
@@ -529,8 +528,7 @@ def pdb(input_set, mode, impute):
529
  swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
530
  dtype=str, header=None, skiprows=1,
531
  names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
532
- 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
533
- 'qmean_norm', 'seqid', 'url'])
534
 
535
  else:
536
  swiss_model = pd.DataFrame(
@@ -550,13 +548,13 @@ def pdb(input_set, mode, impute):
550
  swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
551
  else:
552
  swiss_model.at[ind, 'whichIsoform'] = 'nan'
553
- # swiss_model.drop(['input'], axis=1, inplace=True)
554
  swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
555
  print('Index File Processed...\n')
556
 
 
557
  # Get relevant columns
558
- swiss_model = swiss_model[
559
- ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
560
  # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
561
  swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
562
  swiss_model.reset_index(inplace=True)
@@ -713,6 +711,7 @@ def pdb(input_set, mode, impute):
713
  ascending=[True, False])
714
  swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
715
 
 
716
  swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
717
  swiss_models_with_data.reset_index(inplace=True)
718
  swiss_models_with_data.drop(['index'], axis=1, inplace=True)
@@ -729,6 +728,7 @@ def pdb(input_set, mode, impute):
729
 
730
  swiss_models_with_data = swiss_models_with_data1.copy()
731
 
 
732
  swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
733
  swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
734
  axis=0, ascending=[True, True, True, False])
@@ -738,8 +738,7 @@ def pdb(input_set, mode, impute):
738
  keep='first')
739
  swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
740
  swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
741
- len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
742
- broken_swiss.drop_duplicates(['datapoint'])) + len(
743
  no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
744
  # This printed data here includes all possible models with different qualities,
745
  # because we may get a hit in either of them.
@@ -766,10 +765,10 @@ def pdb(input_set, mode, impute):
766
 
767
  swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
768
  swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
769
- swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
770
- path_to_output_files / 'alignment_files')
771
  swiss_models_with_data = None
772
 
 
773
  if len(swiss_model_aligned) == 0:
774
  swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
775
  swiss_model_aligned['qmean_norm'] = 'nan'
@@ -862,7 +861,7 @@ def pdb(input_set, mode, impute):
862
  url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
863
  print(url)
864
  req = requests.get(url)
865
- name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
866
  with open(name, 'wb') as f:
867
  f.write(req.content)
868
  else:
@@ -879,7 +878,7 @@ def pdb(input_set, mode, impute):
879
  individual.write(str('UniProt ID: ' + protein))
880
  individual.write('\n')
881
  individual.write(str(pdb.contents[3])[10:-11].strip())
882
- with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt',
883
  encoding="utf8") as f:
884
  fasta = ''
885
  chain = ''
@@ -962,6 +961,7 @@ def pdb(input_set, mode, impute):
962
  existing_modbase_models = None
963
  existing_modbase_models_ind = None
964
 
 
965
  model_info_added = model_info_added.drop(['UniprotID'], axis=1)
966
  model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
967
  'PDBCode': 'template', 'PDBChain': 'chain',
@@ -1014,8 +1014,7 @@ def pdb(input_set, mode, impute):
1014
  with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
1015
  axis=0,
1016
  ascending=[True, True, True, True, False, True, False])
1017
- with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
1018
- keep='first')
1019
 
1020
  with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
1021
  with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
@@ -1029,6 +1028,7 @@ def pdb(input_set, mode, impute):
1029
  with_modbase_info.reset_index(inplace=True)
1030
  with_modbase_info.drop('index', axis=1, inplace=True)
1031
 
 
1032
  align = with_modbase_info[
1033
  with_modbase_info.fasta != 'nan']
1034
  yes_pdb_no_match = with_modbase_info[
@@ -1047,6 +1047,7 @@ def pdb(input_set, mode, impute):
1047
  modbase_aligned = modbase_aligned.astype(str)
1048
  modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
1049
 
 
1050
  # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at all by this point).
1051
  if len(with_modbase_info) != 0:
1052
  not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
@@ -1054,30 +1055,29 @@ def pdb(input_set, mode, impute):
1054
  ['datapoint'],
1055
  keep=False)
1056
  else:
1057
- not_in_aligned = pd.DataFrame(
1058
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1059
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1060
- 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
1061
- 'intMet',
1062
- 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1063
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1064
- 'crosslink',
1065
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1066
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1067
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1068
- 'coiledCoil',
1069
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1070
- 'disulfide',
1071
- 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
1072
- 'activeSite',
1073
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1074
- 'crosslink',
1075
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1076
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1077
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1078
- 'coiledCoil',
1079
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1080
- 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
1081
  with_modbase_info = None
1082
  if len(not_in_aligned) != 0:
1083
  not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
@@ -1094,8 +1094,7 @@ def pdb(input_set, mode, impute):
1094
  nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
1095
  not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
1096
  not_nan.score = not_nan.score.astype(float)
1097
- not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
1098
- inplace=True)
1099
 
1100
  not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
1101
  ascending=[True, True, False])
@@ -1107,7 +1106,7 @@ def pdb(input_set, mode, impute):
1107
  which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
1108
  if len(which_ones_are_match) == 0:
1109
  which_ones_are_match = pd.DataFrame(
1110
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1111
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1112
  'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1113
  'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
@@ -1143,6 +1142,7 @@ def pdb(input_set, mode, impute):
1143
  not_nan = None
1144
  nan = None
1145
 
 
1146
  # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
1147
 
1148
  # No model
@@ -1171,10 +1171,9 @@ def pdb(input_set, mode, impute):
1171
  elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
1172
  rest = no_info
1173
  else:
1174
- rest = pd.DataFrame(
1175
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1176
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1177
- 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1178
 
1179
  rest = rest[to_swiss_columns]
1180
  rest = rest.drop_duplicates()
@@ -1186,53 +1185,49 @@ def pdb(input_set, mode, impute):
1186
 
1187
  else:
1188
 
1189
- modbase_match = pd.DataFrame(
1190
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1191
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1192
- 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1193
- 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1194
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1195
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1196
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1197
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1198
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1199
- 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1200
- 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1201
- 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1202
- 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1203
- 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1204
- 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1205
- 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1206
- 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1207
- 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1208
- 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
1209
- 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
1210
- 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
1211
- not_in_aligned = pd.DataFrame(
1212
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1213
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1214
- 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1215
- 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1216
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1217
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1218
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1219
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1220
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
1221
- 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1222
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1223
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1224
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1225
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1226
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1227
- 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
1228
- no_info = pd.DataFrame(
1229
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1230
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1231
- 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1232
- rest = pd.DataFrame(
1233
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1234
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1235
- 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1236
 
1237
  rest = rest[to_swiss_columns]
1238
  rest = rest.drop_duplicates()
@@ -1268,6 +1263,7 @@ def pdb(input_set, mode, impute):
1268
  not_models = None
1269
  modbase_not_match = None
1270
 
 
1271
  # Final corrections
1272
 
1273
  # Now 3D alignment.
@@ -1289,6 +1285,7 @@ def pdb(input_set, mode, impute):
1289
 
1290
  # Fix the axes and merge all data.
1291
 
 
1292
  pdb.drop(['pdbInfo'], axis=1, inplace=True)
1293
  pdb.rename(columns={'resolution': 'score'}, inplace=True)
1294
  swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
@@ -1301,6 +1298,7 @@ def pdb(input_set, mode, impute):
1301
  modbase['source'] = 'MODBASE'
1302
  data = pd.concat([swiss, modbase, pdb])
1303
 
 
1304
  data.reset_index(inplace=True)
1305
  data.drop(['index'], axis=1, inplace=True)
1306
  data = data.astype('str')
@@ -1324,10 +1322,10 @@ def pdb(input_set, mode, impute):
1324
  for pdbID in pdb_only.pdbID.to_list():
1325
  if pdbID not in existing_free_sasa:
1326
  (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
1327
- Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
1328
- include_hetatms=True,
1329
  outdir=None, force_rerun=False, file_type='pdb'))
1330
 
 
1331
  print('Calculation RSA for SwissModel Files...\n')
1332
  swiss_only = data[data.source == 'SWISSMODEL']
1333
  swiss_dp = []
@@ -1345,8 +1343,7 @@ def pdb(input_set, mode, impute):
1345
  for pdbID in modbase_only.pdbID.to_list():
1346
  if pdbID not in existing_free_sasa:
1347
  (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
1348
- Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
1349
- include_hetatms=True,
1350
  outdir=None, force_rerun=False, file_type='pdb'))
1351
 
1352
  # This annotation list is different than the prev one, keep it.
@@ -1384,18 +1381,16 @@ def pdb(input_set, mode, impute):
1384
  chain = data.at[i, 'chain']
1385
  uniprotID = data.at[i, 'uniprotID']
1386
  pdbID = data.at[i, 'pdbID']
1387
- alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
1388
- Path(path_to_output_files / '3D_alignment'), file_format='gzip')
1389
  mutPos = data.at[i, 'mutationPositionOnPDB']
1390
  try:
1391
- coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
1392
  except:
1393
  ValueError
1394
  coordMut = 'nan'
1395
  try:
1396
  sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
1397
- data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
1398
- data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb')
1399
  except:
1400
  ValueError
1401
  data.at[i, 'sasa'] = 'nan' # mutation position is nan
@@ -1443,9 +1438,11 @@ def pdb(input_set, mode, impute):
1443
  data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
1444
  float(data.at[i, 'domainEndonPDB']))
1445
 
 
1446
  data = data.astype(str)
1447
  data.replace({'NaN': 'nan'}, inplace=True)
1448
 
 
1449
  # Now unify all 3 separate data sets. We have with_pdb: the ones that have PDB structures, swiss, modbase, the ones that didn't match with any, and the ones that didn't have a wt seq match.
1450
 
1451
  # Get interface positions from ECLAIR. Download HQ human
@@ -1466,29 +1463,28 @@ def pdb(input_set, mode, impute):
1466
  interface_dataframe.columns = ['uniprotID', 'positions']
1467
 
1468
  if len(data) == 0:
1469
- data = pd.DataFrame(
1470
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1471
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1472
- 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
1473
- 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
1474
- 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
1475
- 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
1476
- 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1477
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1478
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1479
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1480
- 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1481
- 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1482
- 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1483
- 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1484
- 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1485
- 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1486
- 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1487
- 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1488
- 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1489
- 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
1490
- 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
1491
- 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
1492
  else:
1493
  data.sasa = data.sasa.astype('str')
1494
 
@@ -1527,6 +1523,7 @@ def pdb(input_set, mode, impute):
1527
 
1528
  data.drop(['positions'], axis=1, inplace=True)
1529
 
 
1530
  # OPTIONAL
1531
  # DOMAIN SELECTION
1532
  # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
@@ -1545,8 +1542,7 @@ def pdb(input_set, mode, impute):
1545
  # nan--> 0, 0 -->1 and 1 -->2
1546
 
1547
  print('Final adjustments are being done...\n')
1548
- binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
1549
- 'dnaBindingBinary',
1550
  'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1551
  'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1552
  'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
@@ -1648,8 +1644,7 @@ def pdb(input_set, mode, impute):
1648
  ready = data.copy()
1649
  # Imputation
1650
  if (impute == 'True') or (impute == 'true') or (impute == True):
1651
- filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
1652
- 16.82,
1653
  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
1654
  col_index = 0
1655
  for col_ in ready.columns[-30:]:
@@ -1664,8 +1659,7 @@ def pdb(input_set, mode, impute):
1664
  ready = ready.replace({'nan': np.NaN})
1665
  ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
1666
  if len(ready) == 0:
1667
- print(
1668
- 'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
1669
  print(ready)
1670
  print('Feature vector successfully created...')
1671
  return ready
 
1
+
2
  # IMPORT NECESSARY MODULES AND LIBRARIES
3
  from timeit import default_timer as timer
4
  import xml.etree.ElementTree as ET
 
26
  from Bio import Align
27
  from Bio import SeqIO
28
  from Bio.PDB import *
 
29
  warnings.filterwarnings("ignore")
30
  start = timer()
31
  import streamlit as st
32
  # FUNCTIONS
33
 
34
 
35
+
36
  # FUNCTIONS
37
  from calc_pc_property import *
38
  from add_domains import *
 
58
  Add datapoint identifier and remove non-standard input.
59
  """
60
  data = clean_data(input_set)
61
+ path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode)
 
62
  out_path = path_to_output_files / 'log.txt'
63
  sys.stdout = open(out_path, 'w')
64
  print('Creating directories...')
65
 
66
  annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
67
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
68
+ 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
 
69
  'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
70
  'transitPeptide', 'glycosylation', 'propeptide']
71
 
 
140
  if wt == can:
141
  data.at[i, 'wt_sequence_match'] = 'm'
142
  elif wt != can:
143
+ isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
 
144
  for k in isoList:
145
  if len(k) >= int(data.at[i, 'pos']):
146
  resInIso = k[int(int(data.at[i, 'pos']) - 1)]
147
  if wt == resInIso:
148
+ whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
 
149
  data.at[i, 'wt_sequence_match'] = 'i'
150
  data.at[i, 'whichIsoform'] = whichIsoform
151
  break
 
190
  for prot in protein:
191
  pdbs.append(get_pdb_ids(prot))
192
  print('PDBs', pdbs)
193
+ if len(pdbs)>=1:
194
  print('pdbs not empty')
195
  pdbs = [item for sublist in pdbs for item in sublist]
196
  print('NEW', pdbs)
197
  else:
198
  print('pdbs empty')
199
+ pdbs =[]
200
  print('Processing PDB structures...\n')
201
  if pdbs == []:
202
  print('No PDB structure found for the query. ')
203
+ """
204
+ try:
205
+ pdbs = [j.strip('[').strip(']').strip().strip('\'').strip('\"') for j in
206
+ ((',').join([str(item) for item in pdbs])).split(',')]
207
+ except IndexError:
208
+ pdbs = []
209
+ print('No PDB structure found for the query. ')
210
+ """
211
  print('Starting PDB structures download...\n')
212
  pdbs = list(filter(None, pdbs))
213
  pdbs = (set(pdbs))
 
219
  try:
220
  shutil.rmtree('obsolete')
221
  except OSError as e:
222
+ pass
223
+ existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
224
+ st.write('existing_pdb')
225
+ st.write(existing_pdb)
226
+ existing_pdb = [str(i) for i in existing_pdb]
227
+ existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
228
  cnt = 0
229
  st.write('this is the pdbs', pdbs)
 
 
 
 
 
 
 
 
 
230
  for search in pdbs:
231
+ st.write('searching for pdb:', search)
 
 
232
  try:
233
+ if search.lower() not in existing_pdb:
234
+ path_pdb = 'out_files/pdb/pdb_structures'
235
+ st.write('path for pdb: ',path_pdb)
236
+ file = pdbl.retrieve_pdb_file(search, pdir=path_pdb, file_format="pdb")
237
+ st.write('file: ',file)
238
+ existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
239
+ st.write('after download:', existing_pdb)
240
+ st.write(Path(path_to_output_files/'pdb_structures') == path_pdb)
241
+ existing_pdb = list(path_pdb.glob("*"))
242
+ st.write('after download:', existing_pdb)
243
+ else:
244
+ print('PDB structure file exists..')
245
+ for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
246
+ filename_replace_ext = filename.with_suffix(".pdb")
247
+ filename.rename(filename_replace_ext)
248
+
249
+ file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
250
+
251
+ base = os.path.splitext(str(file))[0]
252
+ base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
253
+ os.rename(file, base + ".ent")
254
+ file = base + '.ent'
255
+
256
+ resolution_method = parser.get_structure(search, file)
257
+ for record in SeqIO.parse(file, "pdb-seqres"):
258
+ if record.dbxrefs[0].split(':')[0] == 'UNP':
259
+ pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
260
+ pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
261
+ pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
262
+ pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1]
263
+ pdb_info.at[index, 'pdbID'] = record.id.split(':')[0]
264
+ pdb_info.at[index, 'chain'] = record.annotations["chain"]
265
+ pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
266
+ index += 1
267
+ except IndexError as a:
268
+ st.write(a)
269
+ pdb_info.at[index, 'uniprotID'] = 'nan'
270
+ pdb_info.at[index, 'pdbID'] = 'nan'
271
+ pdb_info.at[index, 'chain'] = 'nan'
272
+ pdb_info.at[index, 'resolution'] = 'nan'
273
+ cnt +=1
 
274
  print()
 
 
275
  print('PDB file processing finished..')
276
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
277
  try:
 
324
  TypeError
325
  with_pdb.at[i, 'pdbInfo'] = 'nan'
326
 
327
+ with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
328
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
329
  'wt_sequence_match',
330
  'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
331
 
332
+
333
+
334
  # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
335
  # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
336
  # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
 
344
  if len(with_pdb) > 0:
345
  with_pdb = add_annotations(with_pdb)
346
  else:
347
+ new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
 
348
  'activeSite',
349
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
350
  'crosslink', 'mutagenesis', 'strand',
 
363
  'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
364
  'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
365
  'glycosylationBinary', 'propeptideBinary']
366
+ with_pdb = pd.DataFrame(columns = new_cols)
367
  try:
368
  with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
369
  except:
 
375
  with_pdb.replace({'[]': 'nan'}, inplace=True)
376
  with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
377
  with_pdb.replace({'': 'nan'}, inplace=True)
378
+
379
  """
380
  STEP 7
381
  Do alignment for PDB
 
407
  pdb_fasta = None
408
  pdb_info = None
409
  pdbs = None
410
+ existing_pdb = None
 
411
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
412
  with_pdb = None
413
+
414
+
415
  print('Aligning sequences...\n')
416
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
417
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
 
434
  aligned_m = aligned_m.astype(str)
435
  aligned_nm = aligned_nm.astype(str)
436
 
437
+
438
  frames = [aligned_m, aligned_nm]
439
  after_up_pdb_alignment = pd.concat(frames, sort=False)
440
  if len(after_up_pdb_alignment) == 0:
 
457
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
458
  no_pdb = no_pdb.copy()
459
 
460
+
461
  print('PDB matching is completed...\n')
462
  print('SUMMARY')
463
  print('-------')
 
472
  print('--%d will be searched in Swiss-Model database.\n' % (
473
  len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
474
 
475
+
476
  dfM = None
477
  dfNM = None
478
  aligned_nm = None
 
528
  swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
529
  dtype=str, header=None, skiprows=1,
530
  names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
531
+ 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm','seqid', 'url'])
 
532
 
533
  else:
534
  swiss_model = pd.DataFrame(
 
548
  swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
549
  else:
550
  swiss_model.at[ind, 'whichIsoform'] = 'nan'
551
+ # swiss_model.drop(['input'], axis=1, inplace=True)
552
  swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
553
  print('Index File Processed...\n')
554
 
555
+
556
  # Get relevant columns
557
+ swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
 
558
  # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
559
  swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
560
  swiss_model.reset_index(inplace=True)
 
711
  ascending=[True, False])
712
  swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
713
 
714
+
715
  swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
716
  swiss_models_with_data.reset_index(inplace=True)
717
  swiss_models_with_data.drop(['index'], axis=1, inplace=True)
 
728
 
729
  swiss_models_with_data = swiss_models_with_data1.copy()
730
 
731
+
732
  swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
733
  swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
734
  axis=0, ascending=[True, True, True, False])
 
738
  keep='first')
739
  swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
740
  swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
741
+ len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(broken_swiss.drop_duplicates(['datapoint'])) + len(
 
742
  no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
743
  # This printed data here includes all possible models with different qualities,
744
  # because we may get a hit in either of them.
 
765
 
766
  swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
767
  swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
768
+ swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, path_to_output_files / 'alignment_files')
 
769
  swiss_models_with_data = None
770
 
771
+
772
  if len(swiss_model_aligned) == 0:
773
  swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
774
  swiss_model_aligned['qmean_norm'] = 'nan'
 
861
  url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
862
  print(url)
863
  req = requests.get(url)
864
+ name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
865
  with open(name, 'wb') as f:
866
  f.write(req.content)
867
  else:
 
878
  individual.write(str('UniProt ID: ' + protein))
879
  individual.write('\n')
880
  individual.write(str(pdb.contents[3])[10:-11].strip())
881
+ with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt',
882
  encoding="utf8") as f:
883
  fasta = ''
884
  chain = ''
 
961
  existing_modbase_models = None
962
  existing_modbase_models_ind = None
963
 
964
+
965
  model_info_added = model_info_added.drop(['UniprotID'], axis=1)
966
  model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
967
  'PDBCode': 'template', 'PDBChain': 'chain',
 
1014
  with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
1015
  axis=0,
1016
  ascending=[True, True, True, True, False, True, False])
1017
+ with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], keep='first')
 
1018
 
1019
  with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
1020
  with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
 
1028
  with_modbase_info.reset_index(inplace=True)
1029
  with_modbase_info.drop('index', axis=1, inplace=True)
1030
 
1031
+
1032
  align = with_modbase_info[
1033
  with_modbase_info.fasta != 'nan']
1034
  yes_pdb_no_match = with_modbase_info[
 
1047
  modbase_aligned = modbase_aligned.astype(str)
1048
  modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
1049
 
1050
+
1051
  # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at all at this point).
1052
  if len(with_modbase_info) != 0:
1053
  not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
 
1055
  ['datapoint'],
1056
  keep=False)
1057
  else:
1058
+ not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1059
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1060
+ 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
1061
+ 'intMet',
1062
+ 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1063
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1064
+ 'crosslink',
1065
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1066
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1067
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1068
+ 'coiledCoil',
1069
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1070
+ 'disulfide',
1071
+ 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
1072
+ 'activeSite',
1073
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1074
+ 'crosslink',
1075
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1076
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1077
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1078
+ 'coiledCoil',
1079
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1080
+ 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
 
1081
  with_modbase_info = None
1082
  if len(not_in_aligned) != 0:
1083
  not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
 
1094
  nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
1095
  not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
1096
  not_nan.score = not_nan.score.astype(float)
1097
+ not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], inplace=True)
 
1098
 
1099
  not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
1100
  ascending=[True, True, False])
 
1106
  which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
1107
  if len(which_ones_are_match) == 0:
1108
  which_ones_are_match = pd.DataFrame(
1109
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1110
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1111
  'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1112
  'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
 
1142
  not_nan = None
1143
  nan = None
1144
 
1145
+
1146
  # Merge not_in_aligned and modbase_not_match, as they were both excluded from the modbase match.
1147
 
1148
  # No model
 
1171
  elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
1172
  rest = no_info
1173
  else:
1174
+ rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1175
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1176
+ 'wt_sequence_match', 'whichIsoform', 'datapoint'])
 
1177
 
1178
  rest = rest[to_swiss_columns]
1179
  rest = rest.drop_duplicates()
 
1185
 
1186
  else:
1187
 
1188
+ modbase_match = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1189
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1190
+ 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1191
+ 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1192
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1193
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1194
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1195
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1196
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1197
+ 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1198
+ 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1199
+ 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1200
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1201
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1202
+ 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1203
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1204
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1205
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1206
+ 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
1207
+ 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
1208
+ 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
1209
+ not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1210
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1211
+ 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1212
+ 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1213
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1214
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1215
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1216
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1217
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
1218
+ 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1219
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1220
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1221
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1222
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1223
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1224
+ 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
1225
+ no_info = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1226
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1227
+ 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1228
+ rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1229
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1230
+ 'wt_sequence_match', 'whichIsoform', 'datapoint'])
 
 
 
 
1231
 
1232
  rest = rest[to_swiss_columns]
1233
  rest = rest.drop_duplicates()
 
1263
  not_models = None
1264
  modbase_not_match = None
1265
 
1266
+
1267
  # Final corrections
1268
 
1269
  # Now 3D alignment.
 
1285
 
1286
  # Fix the axes and merge all data.
1287
 
1288
+
1289
  pdb.drop(['pdbInfo'], axis=1, inplace=True)
1290
  pdb.rename(columns={'resolution': 'score'}, inplace=True)
1291
  swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
 
1298
  modbase['source'] = 'MODBASE'
1299
  data = pd.concat([swiss, modbase, pdb])
1300
 
1301
+
1302
  data.reset_index(inplace=True)
1303
  data.drop(['index'], axis=1, inplace=True)
1304
  data = data.astype('str')
 
1322
  for pdbID in pdb_only.pdbID.to_list():
1323
  if pdbID not in existing_free_sasa:
1324
  (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
1325
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
 
1326
  outdir=None, force_rerun=False, file_type='pdb'))
1327
 
1328
+
1329
  print('Calculation RSA for SwissModel Files...\n')
1330
  swiss_only = data[data.source == 'SWISSMODEL']
1331
  swiss_dp = []
 
1343
  for pdbID in modbase_only.pdbID.to_list():
1344
  if pdbID not in existing_free_sasa:
1345
  (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
1346
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
 
1347
  outdir=None, force_rerun=False, file_type='pdb'))
1348
 
1349
  # This annotation list is different than the prev one, keep it.
 
1381
  chain = data.at[i, 'chain']
1382
  uniprotID = data.at[i, 'uniprotID']
1383
  pdbID = data.at[i, 'pdbID']
1384
+ alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
 
1385
  mutPos = data.at[i, 'mutationPositionOnPDB']
1386
  try:
1387
+ coordMut = get_coords(mutPos, alignments , 'nan', 'nan', mode)[0]
1388
  except:
1389
  ValueError
1390
  coordMut = 'nan'
1391
  try:
1392
  sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
1393
+ data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos, data.at[i, 'wt'], mode, path_to_output_files,file_type = 'pdb')
 
1394
  except:
1395
  ValueError
1396
  data.at[i, 'sasa'] = 'nan' # mutation position is nan
 
1438
  data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
1439
  float(data.at[i, 'domainEndonPDB']))
1440
 
1441
+
1442
  data = data.astype(str)
1443
  data.replace({'NaN': 'nan'}, inplace=True)
1444
 
1445
+
1446
  # Now unify all the separate data: the ones that have PDB structures (with_pdb), swiss, modbase, the ones that didn't match with any, and the ones that didn't have a wt sequence match.
1447
 
1448
  # Get interface positions from ECLAIR. Download HQ human
 
1463
  interface_dataframe.columns = ['uniprotID', 'positions']
1464
 
1465
  if len(data) == 0:
1466
+ data = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1467
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1468
+ 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
1469
+ 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
1470
+ 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
1471
+ 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
1472
+ 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1473
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1474
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1475
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1476
+ 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1477
+ 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1478
+ 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1479
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1480
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1481
+ 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1482
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1483
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1484
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1485
+ 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
1486
+ 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
1487
+ 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
 
1488
  else:
1489
  data.sasa = data.sasa.astype('str')
1490
 
 
1523
 
1524
  data.drop(['positions'], axis=1, inplace=True)
1525
 
1526
+
1527
  # OPTIONAL
1528
  # DOMAIN SELECTION
1529
  # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
 
1542
  # nan--> 0, 0 -->1 and 1 -->2
1543
 
1544
  print('Final adjustments are being done...\n')
1545
+ binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary',
 
1546
  'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1547
  'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1548
  'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
 
1644
  ready = data.copy()
1645
  # Imputation
1646
  if (impute == 'True') or (impute == 'true') or (impute == True):
1647
+ filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, 16.82,
 
1648
  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
1649
  col_index = 0
1650
  for col_ in ready.columns[-30:]:
 
1659
  ready = ready.replace({'nan': np.NaN})
1660
  ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
1661
  if len(ready) == 0:
1662
+ print('No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
 
1663
  print(ready)
1664
  print('Feature vector successfully created...')
1665
  return ready