fatmacankara committed on
Commit
83a7ed8
·
1 Parent(s): 001c319

Update code/add_alignment.py

Browse files
Files changed (1) hide show
  1. code/add_alignment.py +10 -100
code/add_alignment.py CHANGED
@@ -1,114 +1,36 @@
1
  from Bio import Align
2
  from Bio.Align import substitution_matrices
3
  from pathlib import Path
4
- import streamlit as st
 
5
  from Bio.pairwise2 import format_alignment
6
- from Bio import pairwise2
7
- from Bio import pairwise2
8
- from Bio.SubsMat import MatrixInfo as matlist
9
 
10
 
11
-
12
- """
13
  def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
14
- aligner = Align.PairwiseAligner()
15
  #print(f'Aligning Datapoint: {identifier}')
16
  if len(pdbSequence) >= 1:
17
- f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
 
18
  aligner.mode = 'local'
19
  aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
20
  aligner.open_gap_score = -11
21
  aligner.extend_gap_score = -1
22
  alignments = aligner.align(uniprotSequence, pdbSequence)
23
  alignments = (list(alignments))
24
-
25
- merge_in_threes = str(alignments[0]).split('\n')
26
- K = 3
27
- res = ["".join(str(alignments[0]).split('\n')[idx: idx + K]) for idx in range(len(str(alignments[0]).split('\n')) - K + 1)]
28
- slice_val = slice(0,len(res),4)
29
- writtenlist = res[slice_val]
30
-
31
- new_alignment = []
32
- for i in writtenlist:
33
- cont1 = list(filter(None, i.split('target')))
34
- cont2 = cont1[0].split('query')
35
- target_pos = (list(filter(None,cont2[0].split(' '))))[0]
36
- target = (list(filter(None,cont2[0].split(' '))))[1]
37
- alg_pos = (list(filter(None,cont2[0].split(' '))))[2]
38
- alg = (list(filter(None,cont2[0].split(' '))))[3]
39
- query_pos = (list(filter(None,cont2[1].split(' '))))[0]
40
- query = (list(filter(None,cont2[1].split(' '))))[1]
41
- if int(target_pos)>0:
42
- new_target = int(target_pos) * 'X' + target
43
- else:
44
- new_target = int(target_pos) * ' ' + target
45
-
46
- if int(alg_pos)>0:
47
- new_alg = int(target_pos) * 'X' + target
48
- else:
49
- new_alg = int(target_pos) * ' ' + alg
50
-
51
- if int(query_pos)>0:
52
- new_query = int(target_pos) * 'X' + target
53
- else:
54
- new_query = int(target_pos) * ' ' + target
55
-
56
- new_alignment.append(new_target+'\n' +new_alg +'\n' +new_query)
57
  alignment_list = []
58
- k = 0
59
- for alignment in new_alignment:
60
- k += 1
61
- st.write('COUNT', k)
62
- st.write('alignment')
63
- st.write(alignment)
64
- f.write(str(alignment))
65
- f.write('\n')
66
- f.write('\n')
67
- alignment = (str(alignment).strip().split('\n'))
68
- alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
69
- st.write('alignment_updated')
70
- st.write(alignment)
71
- alignment_list.append(alignment)
72
- return alignment_list
73
-
74
- """
75
- def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
76
- aligner = Align.PairwiseAligner()
77
- #print(f'Aligning Datapoint: {identifier}')
78
- if len(pdbSequence) >= 1:
79
- f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
80
- aligner.mode = 'local'
81
- aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
82
- aligner.open_gap_score = -11
83
- aligner.extend_gap_score = -1
84
- alignments = aligner.align(uniprotSequence, pdbSequence)
85
-
86
- sub_matrix = matlist.blosum62
87
- alignments2 = pairwise2.align.localds(uniprotSequence, pdbSequence, sub_matrix, -11, -1)
88
-
89
- alignment_list = []
90
- k = 0
91
  for alignment in alignments:
92
-
93
  f.write(str(alignment))
94
  f.write('\n')
95
  f.write('\n')
96
  alignment = (str(alignment).strip().split('\n'))
97
  alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
98
-
99
  alignment_list.append(alignment)
100
  return alignment_list
101
 
 
102
  def mutation_position_on_pdb(alignment_list, pos):
103
  which_alignment_to_go = 0
104
  for alignment in alignment_list:
105
-
106
- #char_list = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
107
- #for char in alignment[1]:
108
- # if char in char_list:
109
- # alignment[1] = alignment[1].replace(char, '.')
110
-
111
-
112
  which_alignment_to_go += 1
113
  alignment_uniprot = alignment[0]
114
  alignment_pdb = alignment[2]
@@ -119,7 +41,6 @@ def mutation_position_on_pdb(alignment_list, pos):
119
  startGap += 1
120
  else:
121
  break
122
-
123
  countGap = startGap
124
  countResidue = 0
125
  canonicalRes = ' '
@@ -129,7 +50,6 @@ def mutation_position_on_pdb(alignment_list, pos):
129
  countGap += 1
130
  else:
131
  countResidue += 1
132
-
133
  if int(countResidue) == int(pos):
134
  canonicalRes = alignment_uniprot[countResidue + countGap - 1]
135
  try:
@@ -138,7 +58,6 @@ def mutation_position_on_pdb(alignment_list, pos):
138
  IndexError
139
  pdbRes = 'nan'
140
  break
141
-
142
  if (alignment[1][countResidue + countGap - 1] == '|') or (alignment[1][countResidue + countGap - 1] == 'X'):
143
  if canonicalRes == pdbRes:
144
  pdb_alignStatus = 'aligned'
@@ -154,16 +73,12 @@ def mutation_position_on_pdb(alignment_list, pos):
154
  countResidue + countGap - 1] == '-':
155
  mutationPositionOnPDB = 'nan'
156
  posPDB = 'nan'
157
-
158
-
159
  else:
160
  posPDB = countResidue + countGap - countGap_pdb
161
-
162
  mutationPositionOnPDB = str(posPDB)
163
-
164
  break
165
  elif (canonicalRes == pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
166
- alignment[1][poscountResidue+ countGap - 1] == '-')):
167
  pdb_alignStatus = 'not_aligned'
168
  mutationPositionOnPDB = 'nan'
169
  elif (canonicalRes != pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
@@ -174,7 +89,10 @@ def mutation_position_on_pdb(alignment_list, pos):
174
  countResidue + countGap - 1] == '-':
175
  mutationPositionOnPDB = 'nan'
176
  posPDB = 'nan'
177
-
 
 
 
178
  return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
179
 
180
 
@@ -388,13 +306,9 @@ def annotation_pos_on_pdb(annot_positions, startGap, alignment_to_use, identifie
388
 
389
  def final_stage(df, annotation_list, alignment_path):
390
  for i in df.index:
391
-
392
-
393
  identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
394
  alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)
395
  df.at[i, 'pdb_alignStatus'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[0]
396
-
397
- print()
398
  df.at[i, 'mutationPositionOnPDB'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[1]
399
  startGap = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[2]
400
  alignment_to_use = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[3]
@@ -411,13 +325,9 @@ def final_stage(df, annotation_list, alignment_path):
411
  str(df.at[i, 'domStart']) != '-1.0' or str(df.at[i, 'domEnd']) != '-1.0':
412
  df.at[i, 'domainStartonPDB'] = 'nan'
413
  df.at[i, 'domainEndonPDB'] = 'nan'
414
-
415
-
416
- df = df.astype(str)
417
  return df
418
 
419
  def alignment(dataframe_to_align, annotation_list, alignment_path):
420
  domainList = ['domStart', 'domEnd']
421
  result = final_stage(dataframe_to_align, annotation_list, alignment_path)
422
  return result
423
- #
 
1
  from Bio import Align
2
  from Bio.Align import substitution_matrices
3
  from pathlib import Path
4
+
5
+ aligner = Align.PairwiseAligner()
6
  from Bio.pairwise2 import format_alignment
 
 
 
7
 
8
 
 
 
9
  def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
 
10
  #print(f'Aligning Datapoint: {identifier}')
11
  if len(pdbSequence) >= 1:
12
+ f = open(Path(alignment_path / f'{identifier}_alignment.txt'),
13
+ "w")
14
  aligner.mode = 'local'
15
  aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
16
  aligner.open_gap_score = -11
17
  aligner.extend_gap_score = -1
18
  alignments = aligner.align(uniprotSequence, pdbSequence)
19
  alignments = (list(alignments))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  alignment_list = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  for alignment in alignments:
 
22
  f.write(str(alignment))
23
  f.write('\n')
24
  f.write('\n')
25
  alignment = (str(alignment).strip().split('\n'))
26
  alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
 
27
  alignment_list.append(alignment)
28
  return alignment_list
29
 
30
+
31
  def mutation_position_on_pdb(alignment_list, pos):
32
  which_alignment_to_go = 0
33
  for alignment in alignment_list:
 
 
 
 
 
 
 
34
  which_alignment_to_go += 1
35
  alignment_uniprot = alignment[0]
36
  alignment_pdb = alignment[2]
 
41
  startGap += 1
42
  else:
43
  break
 
44
  countGap = startGap
45
  countResidue = 0
46
  canonicalRes = ' '
 
50
  countGap += 1
51
  else:
52
  countResidue += 1
 
53
  if int(countResidue) == int(pos):
54
  canonicalRes = alignment_uniprot[countResidue + countGap - 1]
55
  try:
 
58
  IndexError
59
  pdbRes = 'nan'
60
  break
 
61
  if (alignment[1][countResidue + countGap - 1] == '|') or (alignment[1][countResidue + countGap - 1] == 'X'):
62
  if canonicalRes == pdbRes:
63
  pdb_alignStatus = 'aligned'
 
73
  countResidue + countGap - 1] == '-':
74
  mutationPositionOnPDB = 'nan'
75
  posPDB = 'nan'
 
 
76
  else:
77
  posPDB = countResidue + countGap - countGap_pdb
 
78
  mutationPositionOnPDB = str(posPDB)
 
79
  break
80
  elif (canonicalRes == pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
81
+ alignment[1][countResidue + countGap - 1] == '-')):
82
  pdb_alignStatus = 'not_aligned'
83
  mutationPositionOnPDB = 'nan'
84
  elif (canonicalRes != pdbRes) and ((alignment[1][countResidue + countGap - 1] == '.') or (
 
89
  countResidue + countGap - 1] == '-':
90
  mutationPositionOnPDB = 'nan'
91
  posPDB = 'nan'
92
+ else:
93
+ pdb_alignStatus = 'not_aligned'
94
+ mutationPositionOnPDB = 'nan'
95
+ print(pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
96
  return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])
97
 
98
 
 
306
 
307
  def final_stage(df, annotation_list, alignment_path):
308
  for i in df.index:
 
 
309
  identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
310
  alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)
311
  df.at[i, 'pdb_alignStatus'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[0]
 
 
312
  df.at[i, 'mutationPositionOnPDB'] = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[1]
313
  startGap = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[2]
314
  alignment_to_use = mutation_position_on_pdb(alignment_list, df.at[i, 'pos'])[3]
 
325
  str(df.at[i, 'domStart']) != '-1.0' or str(df.at[i, 'domEnd']) != '-1.0':
326
  df.at[i, 'domainStartonPDB'] = 'nan'
327
  df.at[i, 'domainEndonPDB'] = 'nan'
 
 
 
328
  return df
329
 
330
  def alignment(dataframe_to_align, annotation_list, alignment_path):
331
  domainList = ['domStart', 'domEnd']
332
  result = final_stage(dataframe_to_align, annotation_list, alignment_path)
333
  return result