fatmacankara committed on
Commit e426677
1 Parent(s): 3ed55f0

Update code/alphafold_featureVector.py

Files changed (1)
  1. code/alphafold_featureVector.py +273 -36
code/alphafold_featureVector.py CHANGED
@@ -30,6 +30,217 @@ from Bio import Align
  from Bio import SeqIO
  from Bio.PDB import *
  import numpy as np
+ ###
+ import math
+ import glob
+ import numpy as np
+ from Bio import Align
+ import gzip
+ from pathlib import Path
+ from Bio.Align import substitution_matrices
+ from Bio.PDB.Polypeptide import *
+ aligner = Align.PairwiseAligner()
+ import requests
+ from Bio.PDB import PDBParser, PPBuilder
+ from io import StringIO
+
+
+ def convert_non_standard_amino_acids(sequence):
+ """
+ Convert non-standard or ambiguous amino acid codes to their closest relatives.
+ """
+
+ # Define a dictionary to map non-standard codes to standard amino acids
+ conversion_dict = {
+ 'B': 'D', # Aspartic Acid (D) is often used for B (Asx)
+ 'Z': 'E', # Glutamic Acid (E) is often used for Z (Glx)
+ 'X': 'A', # Alanine (A) is a common placeholder for unknown/ambiguous
+ 'U': 'C', # Cysteine (C) is often used for Selenocysteine (U)
+ 'J': 'L', # Leucine (L) is often used for J (Leu/Ile)
+ 'O': 'K', # Lysine (K) is often used for O (Pyrrolysine)
+ # '*' or 'Stop' represents a stop codon; you may replace with '' to remove
+ '*': '',
+ }
+
+ # Replace non-standard codes with their closest relatives
+ converted_sequence = ''.join([conversion_dict.get(aa, aa) for aa in sequence])
+
+ return converted_sequence
+
+ def distance(x1, y1, z1, x2, y2, z2):
+ d = math.sqrt(math.pow(x2 - x1, 2) +
+ math.pow(y2 - y1, 2) +
+ math.pow(z2 - z1, 2) * 1.0)
+ return d
+
+
+ def find_distance(coordMut, coordAnnot):
+ if coordMut != np.NaN:
+ try:
+ dist = distance(float(coordMut[0]), float(coordMut[1]), float(coordMut[2]), float(coordAnnot[0]),
+ float(coordAnnot[1]), float(coordAnnot[2]))
+ return "%.2f" % dist
+ except:
+ ValueError
+ dist = 'nan'
+ return dist
+ else:
+ return np.NaN
+
+
+ def threeToOne(variant):
+ if variant == "ALA":
+ variant = "A"
+ elif variant == "ARG":
+ variant = "R"
+ elif variant == "VAL":
+ variant = "V"
+ elif variant == "GLU":
+ variant = "E"
+ elif variant == "PRO":
+ variant = "P"
+ elif variant == "LEU":
+ variant = "L"
+ elif variant == "GLY":
+ variant = "G"
+ elif variant == "ASN":
+ variant = "N"
+ elif variant == "SER":
+ variant = "S"
+ elif variant == "GLN":
+ variant = "Q"
+ elif variant == "THR":
+ variant = "T"
+ elif variant == "MET":
+ variant = "M"
+ elif variant == "LYS":
+ variant = "K"
+ elif variant == "ASP":
+ variant = "D"
+ elif variant == "ILE":
+ variant = "I"
+ elif variant == "PHE":
+ variant = "F"
+ elif variant == "TRP":
+ variant = "W"
+ elif variant == "TYR":
+ variant = "Y"
+ elif variant == "HIS":
+ variant = "H"
+ elif variant == "CYS":
+ variant = "C"
+ elif variant == 'UNK':
+ variant = 'X'
+ elif variant == 'ASX':
+ variant = 'O'
+ return (variant)
+
+
+ def get_coords(annot, alignments, coords, resnums_for_sasa, mode):
+ if mode == 1:
+ for alignment in alignments[0]:
+ alignment = (str(alignment).strip().split('\n'))
+ startGap = 0
+ if alignment[0].startswith('.'):
+ for k in alignment[0]:
+ if k == '.' or k == '-':
+ startGap += 1
+ else:
+ break
+ countGap = startGap
+ countResidue = 0
+ for j in alignment[0][startGap:]:
+ if j == '.' or j == '-':
+ countGap += 1
+ else:
+ countResidue += 1
+ if countResidue == float(annot):
+ break
+ countGap_pdb = 0
+ countResidue_pdb = 0
+ for m in alignment[2][0:countResidue + countGap - 1]:
+ if m == '.' or m == '-':
+ countGap_pdb += 1
+ posAtom = countResidue + countGap - countGap_pdb
+
+ realpdbStart = 0
+ for j in alignment[2]:
+ if j == '.' or j == '-':
+ realpdbStart += 1
+ else:
+ break
+
+ if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
+ try:
+ coordinates = alignments[1]
+ residue_numbers = alignments[2]
+ coordWeWant = coordinates[posAtom - 1]
+ residue_number_we_want = residue_numbers[posAtom - 1]
+
+ except:
+ IndexError
+ coordWeWant = 'nan'
+ else:
+ coordWeWant = 'nan'
+ return coordWeWant, posAtom, residue_number_we_want
+ if mode == 2:
+ if annot != 'nan':
+ if int(annot) <= 1400:
+ alignment = (str(alignments).strip().split('\n'))
+ startGap = 0
+ if alignment[0].startswith('.'):
+ for k in alignment[0]:
+ if k == '.' or k == '-':
+ startGap += 1
+ else:
+ break
+ countGap = startGap
+ countResidue = 0
+ for j in alignment[0][startGap:]:
+ if j == '.' or j == '-':
+ countGap += 1
+ else:
+ countResidue += 1
+ if countResidue == float(annot):
+ break
+ countGap_pdb = 0
+ countResidue_pdb = 0
+ for m in alignment[2][0:countResidue + countGap - 1]:
+ if m == '.' or m == '-':
+ countGap_pdb += 1
+ posAtom = countResidue + countGap - countGap_pdb
+ realpdbStart = 0
+ for j in alignment[2]:
+ if j == '.' or j == '-':
+ realpdbStart += 1
+ else:
+ break
+ if len(alignment[2]) > (countResidue + countGap - 1):
+ if (alignment[2][countResidue + countGap - 1] != '-') and (float(annot) >= float(realpdbStart) + 1):
+ try:
+ coordinates = coords
+ residue_numbers = resnums_for_sasa
+ coordWeWant = coordinates[posAtom - 1]
+ residue_number_we_want = residue_numbers[posAtom - 1]
+ except:
+ IndexError
+ coordWeWant = 'nan'
+ residue_number_we_want = 'nan'
+ else:
+ coordWeWant = 'nan'
+ residue_number_we_want = 'nan'
+ return coordWeWant, posAtom, residue_number_we_want
+ else:
+ coordWeWant = 'nan'
+ residue_number_we_want = 'nan'
+ return coordWeWant, posAtom, residue_number_we_want
+ else:
+ return np.NaN, np.NaN, np.NaN
+ else:
+ return np.NaN, np.NaN, np.NaN
+
+ ###
+

  from huggingface_hub import hf_hub_download

@@ -371,50 +582,76 @@ def alphafold(input_set, mode, impute):
  KeyError
  info_per_model[mod][annot] = annotation_pos_on_pdb_

- st.write('-----')
- st.write('')
- mod = 1
- name = 'A0A075B6H7'
- pdb_path = hf_hub_download(repo_id="HuBioDataLab/AlphafoldStructures", filename=f"AF-{name}-F{mod}-model_v4.pdb.gz",repo_type = 'dataset')
- st.write('PATH')
- st.write(pdb_path)
- st.write('HER')
- #with gzip.open(pdb_path, mode="rt") as f:
- # file_content = f.read()
- # st.write(file_content)
- st.write('REH')
- st.write('-----')
- st.write('')
-

- st.write('HERE1')
- st.write('uniprot', uniprotID)
- #pdb_path = hf_hub_download(repo_id="HuBioDataLab/AlphafoldStructures", filename=f"AF-{uniprotID}-F{mod}-model_v4.pdb.gz",repo_type = 'dataset')
- pdb_path = hf_hub_download(repo_id="HuBioDataLab/AlphafoldStructures", filename=f"AF-A0A075B6H7-F1-model_v4.pdb.gz",repo_type = 'dataset')
- st.write('NP')
+ st.write('Downloading the model from ASCARIS dataset.')
+ pdb_path = hf_hub_download(repo_id="HuBioDataLab/AlphafoldStructures", filename=f"AF-{uniprotID}-F{mod}-model_v4.pdb.gz",repo_type = 'dataset')
+

  with gzip.open(pdb_path, mode="rt") as f:
  file_content = f.read()
  st.write(file_content)

- st.write('HERE2')
- st.write('uniprotID',uniprotID)
- st.write('model_num', mod)
- st.write('pdb_path', pdb_path)
- st.write('pdbSequence', pdbSequence)
- st.write('mode',mode)
- st.write(Path(path_to_output_files / '3D_alignment'))
+ st.write('Download complete.')


- st.write(get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan', 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
- 'gzip'))
- get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan', 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
- 'gzip')
- if get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan', 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
- 'gzip') != None:
- alignments, coords, resnums_for_sasa = get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan',
- 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
- 'gzip')
+ #st.write(get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan', 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
+ # 'gzip'))
+
+
+
+
+
+ pdbSequence = convert_non_standard_amino_acids(pdbSequence)
+ st.write(pdbSequence)
+
+ st.write('Hello I am in 3Dalignment')
+
+ atomSequence = ''
+ coords = []
+ resnums_for_sasa = []
+ if file_format == 'txt':
+ st.write('Hello I am in 3Dalignment TXT')
+ with open(name, encoding="utf8") as f:
+ for line in f.readlines():
+ if line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA':
+ atomSequence += threeToOne(line[17:20].strip())
+ coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+ resnums_for_sasa.append(line[22:26].strip())
+ elif line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21] == ' ':
+ atomSequence += threeToOne(line[17:20].strip())
+ coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+ resnums_for_sasa.append(line[22:26].strip())
+ elif file_format == 'gzip':
+ st.write('Hello I am in 3Dalignment GZIP')
+ with gzip.open(pdb_path, mode='rb') as f:
+ for line in f:
+ line = line.decode()
+ if line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA':
+ atomSequence += threeToOne(line[17:20].strip())
+ coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+ resnums_for_sasa.append(line[22:26].strip())
+ elif line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and line[21] == ' ':
+ atomSequence += threeToOne(line[17:20].strip())
+ coords.append([line[31:38].strip(), line[39:46].strip(), line[47:54].strip()])
+ resnums_for_sasa.append(line[22:26].strip())
+ #f = open(Path(path_3D_alignment / f'{identifier}_{str(model_num)}_3Dalignment.txt'),"w")
+ aligner.mode = 'local'
+ aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
+ aligner.open_gap_score = -11
+ aligner.extend_gap_score = -1
+ atomSequence = convert_non_standard_amino_acids(atomSequence)
+ alignments = aligner.align(pdbSequence, atomSequence)
+ alignments = (list(alignments))
+
+ #if get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan', 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
+ # 'gzip') != None:
+
+ if alignments != None:
+ #alignments, coords, resnums_for_sasa = get_alignments_3D(uniprotID, mod, pdb_path, pdbSequence, 'nan',
+ # 'nan', 'nan', mode, Path(path_to_output_files / '3D_alignment'),
+ # 'gzip')
+
+
  alignments = alignments[0]

  calculate_freesasa(uniprotID, mod, existing_free_sasa, alphafold_path, path_to_output_files)
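
The second hunk drops the per-model call to get_alignments_3D and instead aligns the UniProt-derived sequence (pdbSequence) against the CA-atom sequence parsed from the downloaded AlphaFold model, using Biopython's PairwiseAligner in local mode with BLOSUM62 and gap penalties of -11 (open) and -1 (extend). A minimal standalone sketch of that aligner configuration follows; the two toy sequences are made up, and both are assumed to have already been passed through convert_non_standard_amino_acids, since BLOSUM62 has no scores for letters such as U, O, or J.

from Bio import Align
from Bio.Align import substitution_matrices

# Same aligner settings as the inlined code in the commit.
aligner = Align.PairwiseAligner()
aligner.mode = 'local'
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
aligner.open_gap_score = -11
aligner.extend_gap_score = -1

# Toy stand-ins for pdbSequence (UniProt side) and atomSequence (CA atoms from the PDB file).
pdbSequence = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"
atomSequence = "TAYIAKQRQISFVKSHFSRQ"

alignments = list(aligner.align(pdbSequence, atomSequence))
print(alignments[0])        # aligned rows; gaps appear as '-' in the printed rows
print(alignments[0].score)  # BLOSUM62 local alignment score

Downstream, the new get_coords() splits str(alignment) into its printed rows and counts '.' and '-' characters to map a sequence position onto the corresponding CA-atom index, which is why the alignment is consumed through this text representation rather than through the alignment object's coordinates.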