fatmacankara committed on
Commit
2a5322f
·
1 Parent(s): 162f5ed

Update code/add_alignment.py

Browse files
Files changed (1) hide show
  1. code/add_alignment.py +24 -2
code/add_alignment.py CHANGED
@@ -5,7 +5,27 @@ from pathlib import Path
5
  aligner = Align.PairwiseAligner()
6
  from Bio.pairwise2 import format_alignment
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
9
  def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
10
  print(f'Aligning Datapoint: {identifier}')
11
  print(pdbSequence)
@@ -13,6 +33,8 @@ def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
13
  print(uniprotSequence)
14
  #if len(pdbSequence) >= 1:
15
  #f = open(Path(alignment_path / f'{identifier}_alignment.txt'),"w")
 
 
16
  aligner.mode = 'local'
17
  print('1')
18
  aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
@@ -21,8 +43,8 @@ def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
21
  print('3')
22
  aligner.extend_gap_score = -1
23
  print('4')
24
- print(aligner.align(uniprotSequence, pdbSequence)[0])
25
- alignments = aligner.align(uniprotSequence, pdbSequence)[0]
26
  print('Alignments')
27
  print(alignments)
28
  """
 
5
  aligner = Align.PairwiseAligner()
6
  from Bio.pairwise2 import format_alignment
7
 
8
# One-letter amino-acid codes that are ambiguous or non-standard, mapped to the
# standard residue substituted for them. Defined once at module level so the
# table is not rebuilt on every call.
_NON_STANDARD_AMINO_ACIDS = {
    'B': 'D',  # Asx (Asp/Asn)      -> aspartic acid
    'Z': 'E',  # Glx (Glu/Gln)      -> glutamic acid
    'X': 'A',  # unknown/ambiguous  -> alanine as a placeholder
    'U': 'C',  # selenocysteine     -> cysteine
    'J': 'L',  # Leu/Ile            -> leucine
    'O': 'K',  # pyrrolysine        -> lysine
    '*': '',   # stop symbol is dropped, so the result may be shorter
}


def convert_non_standard_amino_acids(sequence):
    """
    Convert non-standard or ambiguous amino acid codes to their closest relatives.

    Each character of ``sequence`` that appears in the replacement table
    (B, Z, X, U, J, O, ``*``) is swapped for its substitute; every other
    character is passed through unchanged. Matching is case-sensitive, so
    lowercase letters are never replaced. ``*`` is removed entirely, which
    means the returned string can be shorter than the input.

    Args:
        sequence: An iterable of one-character strings, typically a ``str``
            holding a protein sequence.

    Returns:
        A new ``str`` with the substitutions applied. An empty input yields
        an empty string.
    """
    # Iterate element by element (rather than calling a str-only method) so
    # any iterable of single-character strings keeps working as before.
    return ''.join(_NON_STANDARD_AMINO_ACIDS.get(aa, aa) for aa in sequence)
29
  def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
30
  print(f'Aligning Datapoint: {identifier}')
31
  print(pdbSequence)
 
33
  print(uniprotSequence)
34
  #if len(pdbSequence) >= 1:
35
  #f = open(Path(alignment_path / f'{identifier}_alignment.txt'),"w")
36
+ uniprotSequence = convert_non_standard_amino_acids(uniprotSequence)
37
+ pdbSequence = convert_non_standard_amino_acids(pdbSequence)
38
  aligner.mode = 'local'
39
  print('1')
40
  aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
 
43
  print('3')
44
  aligner.extend_gap_score = -1
45
  print('4')
46
+ print(aligner.align(uniprotSequence, pdbSequence))
47
+ alignments = aligner.align(uniprotSequence, pdbSequence)
48
  print('Alignments')
49
  print(alignments)
50
  """