luost26 committed on
Commit
6d34920
·
1 Parent(s): ae34d04
abnumber/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from abnumber.__version__ import __version__
2
+ from abnumber.chain import Chain
3
+ from abnumber.position import Position, sort_positions
4
+ from abnumber.alignment import Alignment
5
+ from abnumber.common import SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS
6
+ from abnumber.exceptions import ChainParseError
abnumber/__version__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = '0.3.0'
abnumber/alignment.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ from abnumber.common import is_similar_residue, is_integer
4
+ from abnumber.position import Position
5
+
6
+
7
class Alignment:
    """Antibody chain alignment of two or more chains

    >>> from abnumber import Chain
    >>>
    >>> seq1 = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSAKTTAP'
    >>> chain1 = Chain(seq1, scheme='imgt')
    >>>
    >>> seq2 = 'QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYDDYLDRWGQGTTLTVSSAKTTAP'
    >>> chain2 = Chain(seq2, scheme='imgt')
    >>> alignment = chain1.align(chain2)

    Alignment can be sliced and iterated:

    >>> for pos, (aa, bb) in alignment[:'5']:
    ...     print(pos, aa, bb)
    H1 Q Q
    H2 V V
    H3 Q Q
    H4 L L
    H5 Q V
    ...

    """
    def __init__(self, positions, residues, scheme, chain_type):
        """Store parallel lists of positions and per-position residue entries.

        :param positions: list of position objects (one per alignment column)
        :param residues: sequence of the same length; each entry holds one residue per aligned chain
        :param scheme: numbering scheme, used when parsing string positions
        :param chain_type: chain type, used when parsing string positions
        """
        assert isinstance(positions, list), 'Expected list of positions and residues. ' \
                                            'Use chain.align(other) to create an alignment.'
        assert len(positions) == len(residues)
        unique_cdr_definitions = set(pos.cdr_definition for pos in positions)
        assert len(unique_cdr_definitions) <= 1, f'Aligned chains should use the same CDR definitions, got: {unique_cdr_definitions}'
        self.positions = positions
        self.residues = residues
        self.scheme = scheme
        self.chain_type = chain_type
        # Snapshot of (position, residues) pairs used for iteration
        self._zipped = list(zip(self.positions, self.residues))

    def __repr__(self):
        return self.format()

    def __iter__(self):
        yield from self._zipped.__iter__()

    def __len__(self):
        return len(self.positions)

    def __getitem__(self, item):
        """Return residues at a position, or a new sliced Alignment when given a slice."""
        if isinstance(item, slice):
            if item.step is not None and item.step != 1:
                raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
            return self.slice(start=item.start, stop=item.stop)
        pos = self._parse_position(item)
        raw_pos = self.positions.index(pos)
        return self.residues[raw_pos]

    def slice(self, start: Union[str, int, 'Position'] = None, stop: Union[str, int, 'Position'] = None,
              stop_inclusive: bool = True, allow_raw: bool = False):
        """Create a slice of this alignment

        You can also slice directly using ``alignment['111':'112A']`` or ``alignment.raw[10:20]``.

        :param start: Slice start position (inclusive), :class:`Position` or string (e.g. '111A')
        :param stop: Slice stop position (inclusive unless ``stop_inclusive=False``), :class:`Position` or string (e.g. '112A')
        :param stop_inclusive: Include stop position in slice
        :param allow_raw: Allow unaligned numeric indexing from 0 to length of sequence - 1
        :return: new sliced Alignment object
        """
        if start is not None:
            start = self._parse_position(start, allow_raw=allow_raw)
            if start is None:
                # _parse_position returns None only for a raw index past the end;
                # as a start bound that selects nothing (Python slice semantics),
                # rather than silently disabling the lower bound.
                return Alignment(positions=[], residues=[], scheme=self.scheme, chain_type=self.chain_type)
        # A raw stop index past the end parses to None, which correctly means "no upper bound"
        stop = self._parse_position(stop, allow_raw=allow_raw) if stop is not None else None

        new_positions = []
        new_residues = []
        for pos, residues in zip(self.positions, self.residues):
            if start is not None and pos < start:
                continue
            if stop is not None and (pos > stop or (not stop_inclusive and pos >= stop)):
                break
            new_positions.append(pos)
            new_residues.append(residues)

        return Alignment(positions=new_positions, residues=new_residues, scheme=self.scheme, chain_type=self.chain_type)

    def _parse_position(self, position: Union[int, str, 'Position'], allow_raw=False):
        """Create :class:`Position` key object from string or int.

        Note: The position should only be used for indexing, CDR definition is not preserved!

        :param position: Numeric or string position representation
        :param allow_raw: Also allow unaligned numeric (int) indexing from 0 to length of sequence - 1
        :return: Position object (should only be used for indexing), or ``None`` when a raw
                 index is greater than or equal to the number of positions
        :raises IndexError: when the key cannot be interpreted, or an int is used without ``allow_raw``
        """
        if isinstance(position, str):
            return Position.from_string(position, chain_type=self.chain_type, scheme=self.scheme)
        if isinstance(position, Position):
            return position
        try:
            position = int(position)
        except (TypeError, ValueError):
            # ValueError covers values such as float('nan') that int() rejects
            raise IndexError(f'Invalid position key, expected Position, string or integer, got {type(position)}: "{position}"')
        if not allow_raw:
            raise IndexError("Use chain.raw[i] for raw numeric indexing or pass allow_raw=True. "
                             "For named position indexing, use string (e.g. chain['111A'] or chain['H111A'])")
        if position >= len(self.positions):
            return None
        return self.positions[position]

    def format(self, mark_identity=True, mark_cdrs=True):
        """Format alignment to string

        :param mark_identity: Add BLAST style middle line showing identity (``|``), similar residue (``+``) or different residue (``.``)
        :param mark_cdrs: Add line highlighting CDR regions using ``^``
        :return: formatted string (empty string for an alignment without positions)
        """
        if not self.positions:
            # Nothing to render; avoids indexing residues[0] / positions[0] below
            return ''

        def _identity_symbol(a, b):
            return '|' if a == b else ('+' if is_similar_residue(a, b) else '.')

        lines = []
        for i in range(len(self.residues[0])):
            if mark_identity and i != 0:
                # Compare each chain with the chain printed directly above it
                lines.append(''.join(_identity_symbol(aas[i], aas[i-1]) for pos, aas in self))
            lines.append(''.join(aas[i] for pos, aas in self))
        if mark_cdrs:
            if self.positions[0].cdr_definition == 'kabat':
                lines.append(''.join('^' if pos.is_in_cdr() else ("°" if pos.is_in_vernier() else ' ') for pos in self.positions))
            else:
                lines.append(''.join('^' if pos.is_in_cdr() else ' ' for pos in self.positions))
        return '\n'.join(lines)

    def print(self, mark_identity=True, mark_cdrs=True):
        """Print string representation of alignment created using :meth:`Alignment.format`

        >>> alignment.print()
        QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPS-RGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
        ||||.||||||.||||+|||||||||||.||||||||||||||||+||||||||.|.||||||||||||||||||||||||||.+|||||||||||||||||....||.|||||||||||
        QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYD--DYLDRWGQGTTLTVSS
        ^^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^
        >>> alignment.print(mark_identity=False, mark_cdrs=False)
        QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPS-RGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
        QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYD--DYLDRWGQGTTLTVSS

        :param mark_identity: Add BLAST style middle line showing identity (``|``), similar residue (``+``) or different residue (``.``)
        :param mark_cdrs: Add line highlighting CDR regions using ``^``
        """
        print(self.format(mark_identity=mark_identity, mark_cdrs=mark_cdrs))

    def has_mutation(self):
        """Check if there is a mutation in the alignment or not"""
        return any(len(set(aas)) != 1 for aas in self.residues)

    def num_mutations(self):
        """Get number of mutations (positions with more than one type of residue)"""
        return sum(len(set(aas)) != 1 for aas in self.residues)

    @property
    def raw(self):
        """Access raw representation of this alignment to allow unaligned numeric indexing and slicing

        >>> # Numbering of ``alignment.raw`` starts at 0; indexing returns the residues stored at that column
        >>> alignment.raw[0]
        ('Q', 'Q')
        >>> # Slicing with strings is based on scheme numbering, the end is inclusive
        >>> first_ten = alignment['1':'10']
        >>> # Slicing with ``alignment.raw`` starts at 0, the end is exclusive (Python style)
        >>> first_ten = alignment.raw[0:10]

        :return: Raw alignment accessor that can be sliced or indexed to produce a new :class:`Alignment` object
        """
        return RawAlignmentAccessor(self)
177
+
178
+
179
class RawAlignmentAccessor:
    """Helper that indexes and slices an alignment by 0-based column index.

    Integer indexing looks up the position stored at that index and returns
    ``alignment[position]``; slicing delegates to ``alignment.slice`` with an
    exclusive stop and raw indexing enabled.
    """

    def __init__(self, alignment: 'Alignment'):
        self.alignment = alignment

    def __getitem__(self, item):
        if not isinstance(item, slice):
            # Single column: integer index into the list of positions
            if not is_integer(item):
                raise IndexError(f'Expected int indexing for alignment.raw, got {type(item)}: {item}')
            position = self.alignment.positions[item]
            return self.alignment[position]

        # Slice: validate step and bounds before delegating
        if item.step is not None and item.step != 1:
            raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
        first, last = item.start, item.stop
        if first is not None and not is_integer(first):
            raise IndexError(f'Expected int start index for alignment.raw, got {type(item.start)}: {item.start}')
        if last is not None and not is_integer(last):
            raise IndexError(f'Expected int end index for alignment.raw, got {type(item.stop)}: {item.stop}')
        return self.alignment.slice(start=first, stop=last, stop_inclusive=False, allow_raw=True)
abnumber/chain.py ADDED
@@ -0,0 +1,781 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ from typing import Union, List, Generator, Tuple
3
+ from Bio import SeqIO
4
+ from Bio.SeqRecord import SeqRecord
5
+ import pandas as pd
6
+
7
+ from abnumber.alignment import Alignment
8
+ from abnumber.common import _anarci_align, _validate_chain_type, SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS, \
9
+ is_integer, SCHEME_BORDERS, _get_unique_chains
10
+ from abnumber.exceptions import ChainParseError
11
+ import numpy as np
12
+ from Bio.Seq import Seq
13
+
14
+ from abnumber.position import Position
15
+
16
+
17
+ class Chain:
18
+ """
19
+ Antibody chain aligned to a chosen antibody numbering scheme
20
+
21
+ :example:
22
+
23
+ >>> from abnumber import Chain
24
+ >>>
25
+ >>> seq = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSAKTTAPSVYPLA'
26
+ >>> chain = Chain(seq, scheme='imgt')
27
+ >>> chain
28
+ QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
29
+ ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^
30
+
31
+ Chain can be iterated:
32
+
33
+ >>> for pos, aa in chain:
34
+ >>> print(pos, aa)
35
+ H1 Q
36
+ H2 V
37
+ H3 Q
38
+ H4 L
39
+ H5 Q
40
+ ...
41
+
42
+ Chain can also be indexed and sliced using scheme numbering:
43
+
44
+ >>> chain['5']
45
+ 'Q'
46
+ >>> for pos, aa in chain['H2':'H5']:
47
+ >>> print(pos, aa)
48
+ H2 V
49
+ H3 Q
50
+ H4 L
51
+ H5 Q
52
+
53
+ :param sequence: Unaligned string sequence
54
+ :param name: Optional sequence identifier
55
+ :param scheme: Numbering scheme: One of ``imgt``, ``chothia``, ``kabat``, ``aho``
56
+ :param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default.
57
+ One of ``imgt``, ``chothia``, ``kabat``, ``north``. Required for ``aho``.
58
+ :param assign_germline: Assign germline name using ANARCI based on best sequence identity
59
+ :param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
60
+ :param aa_dict: (Internal use only) Create Chain object directly from dictionary of region objects (internal use)
61
+ :param tail: (Internal use only) Constant region sequence
62
+ :param species: (Internal use only) Species as identified by ANARCI
63
+ :param germline: (Internal use only) Germline as identified by ANARCI
64
+ """
65
+
66
def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germline=False, allowed_species=None, **kwargs):
    """Create a chain either from an unaligned sequence or from a prepared position dictionary.

    :param sequence: unaligned sequence (``str`` or ``Seq``); must be ``None`` when ``aa_dict`` is given
    :param scheme: numbering scheme
    :param cdr_definition: CDR definition, falls back to ``scheme`` when not provided
    :param name: optional sequence identifier
    :param assign_germline: forwarded to ``_anarci_align``
    :param allowed_species: a single species string or a list of species, forwarded to ``_anarci_align``
    :param kwargs: internal arguments ``aa_dict``, ``chain_type``, ``tail``, ``species``, ``v_gene``, ``j_gene``
    :raises TypeError: on unrecognized keyword arguments
    :raises ChainParseError: on invalid or conflicting input
    """
    internal_keys = ('aa_dict', 'chain_type', 'tail', 'species', 'v_gene', 'j_gene')
    aa_dict, chain_type, tail, species, v_gene, j_gene = [kwargs.pop(key, None) for key in internal_keys]
    if len(kwargs):
        raise TypeError(f'Argument not recognized: {", ".join(kwargs)}')
    if isinstance(allowed_species, str):
        # Accept a single species name as shorthand for a one-item list
        allowed_species = [allowed_species]

    if aa_dict is None:
        # Public path: number the raw sequence using ANARCI
        if sequence is None:
            raise ChainParseError('Expected sequence, got None')
        if not isinstance(sequence, (str, Seq)):
            raise ChainParseError(f'Expected string or Seq, got {type(sequence)}: {sequence}')
        if '-' in sequence:
            raise ChainParseError(f'Please provide an unaligned sequence, got: {sequence}')
        if chain_type is not None:
            raise ChainParseError('Do not use chain_type= when providing sequence=, it will be inferred automatically')
        if tail is not None:
            raise ChainParseError('Do not use tail= when providing sequence=, it will be inferred automatically')
        if isinstance(sequence, Seq):
            sequence = str(sequence)
        results = _anarci_align(sequence, scheme=scheme, allowed_species=allowed_species, assign_germline=assign_germline)
        if len(results) > 1:
            raise ChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"')
        aa_dict, chain_type, tail, species, v_gene, j_gene = results[0]
    else:
        # Internal path: positions are already numbered
        if sequence is not None:
            raise ChainParseError('Only one of aa_dict= and sequence= can be provided')
        assert isinstance(aa_dict, dict), f'Expected dict, got: {type(aa_dict)}'
        assert tail is not None
        assert chain_type is not None

    _validate_chain_type(chain_type)

    # User-provided sequence identifier
    self.name: str = name
    # Chain type as identified by ANARCI: ``H`` (heavy), ``K`` (kappa light) or ``L`` (lambda light)
    self.chain_type: str = chain_type
    # Numbering scheme used to align the sequence
    self.scheme: str = scheme
    # Numbering scheme to be used for definition of CDR regions (same as ``scheme`` by default)
    self.cdr_definition: str = cdr_definition or scheme
    # Constant region sequence
    self.tail: str = tail
    # Species as identified by ANARCI
    self.species: str = species
    # V gene germline as identified by ANARCI (if assign_germline is True)
    self.v_gene: str = v_gene
    # J gene germline as identified by ANARCI (if assign_germline is True)
    self.j_gene: str = j_gene

    # One ordered position -> residue mapping per region, filled by _init_from_dict
    for region in ('fr1', 'cdr1', 'fr2', 'cdr2', 'fr3', 'cdr3', 'fr4'):
        setattr(self, f'{region}_dict', OrderedDict())

    self._init_from_dict(aa_dict, allowed_species=allowed_species)
132
+
133
def _init_from_dict(self, aa_dict, allowed_species):
    """Validate scheme settings and distribute residues into the FR/CDR region dictionaries.

    When the provided positions do not already carry this chain's CDR definition,
    the sequence is re-aligned with ``_anarci_align`` using the CDR definition's
    scheme (``chothia`` for ``north``) and each position is copied with the
    resulting CDR-definition position attached.

    :param aa_dict: mapping of Position -> residue string
    :param allowed_species: forwarded to ``_anarci_align`` when renumbering is needed
    :raises NotImplementedError: unsupported scheme or CDR definition
    :raises ValueError: when ``aho`` is used as CDR definition
    """
    if self.scheme not in SUPPORTED_SCHEMES:
        raise NotImplementedError(f'Scheme "{self.scheme}" is not supported. Available schemes: {", ".join(SUPPORTED_SCHEMES)}')
    if self.cdr_definition in ['aho']:
        raise ValueError('CDR regions are not defined for AHo, '
                         'you need to specify cdr_definition="chothia" or another scheme for CDR extraction.')
    if self.cdr_definition not in SUPPORTED_CDR_DEFINITIONS:
        # Report the offending CDR definition and the list it was checked against
        raise NotImplementedError(f'CDR definition "{self.cdr_definition}" is not supported. Available definitions: {", ".join(SUPPORTED_CDR_DEFINITIONS)}')
    # list of region start positions
    borders = SCHEME_BORDERS[self.cdr_definition] if self.cdr_definition in SCHEME_BORDERS else SCHEME_BORDERS[f'{self.cdr_definition}_{self.chain_type}']

    regions_list = [self.fr1_dict, self.cdr1_dict, self.fr2_dict, self.cdr2_dict, self.fr3_dict, self.cdr3_dict, self.fr4_dict]
    region_idx = 0

    sorted_positions = sorted(aa_dict.keys())

    cdr_definition_ready = True
    for pos in sorted_positions:
        assert pos.scheme == self.scheme, f'Schemes of provided position ({pos.scheme}) does not match Chain scheme ({self.scheme})'
        if pos.cdr_definition != self.cdr_definition:
            cdr_definition_ready = False

    if cdr_definition_ready:
        combined_aa_dict = aa_dict
    else:
        seq = ''.join(aa_dict[pos] for pos in sorted_positions)
        renumbered_aa_dict = _anarci_align(
            seq,
            scheme=self.cdr_definition if self.cdr_definition != 'north' else 'chothia',
            allowed_species=allowed_species
        )[0][0]
        cdr_definition_positions = [pos.number for pos in sorted(renumbered_aa_dict.keys())]
        combined_aa_dict = {}
        # Pair original positions with renumbered ones in sorted order
        for orig_pos, cdr_definition_position in zip(sorted_positions, cdr_definition_positions):
            aa = aa_dict[orig_pos]
            pos = orig_pos.copy()
            pos.set_cdr_definition(self.cdr_definition, cdr_definition_position)
            combined_aa_dict[pos] = aa

    for pos in sorted(combined_aa_dict.keys()):
        assert isinstance(pos, Position), f'Expected Position object, got {type(pos)}: {pos}'
        aa = combined_aa_dict[pos]
        if aa is None:
            # Treat a missing residue like a gap instead of failing on .upper()
            continue
        aa = aa.upper().strip()
        if aa in ['*', '-', '', '.']:
            continue
        # Advance to the region whose range contains this position
        while pos.cdr_definition_position >= borders[region_idx]:
            region_idx += 1
        regions_list[region_idx][pos] = aa
180
+
181
def __repr__(self):
    """Return the default formatted representation produced by ``self.format()``."""
    formatted = self.format()
    return formatted
183
+
184
def __str__(self):
    """Return the value of ``self.seq``."""
    sequence = self.seq
    return sequence
186
+
187
def __iter__(self):
    """Yield ``(position, residue)`` pairs from ``self.positions``."""
    pairs = self.positions.items()
    yield from iter(pairs)
189
+
190
def __getitem__(self, item):
    """Return the residue at a position, or a sliced chain when given a slice.

    :raises IndexError: when a slice step other than 1 is requested
    """
    if not isinstance(item, slice):
        # Single position lookup
        key = self._parse_position(item)
        return self.positions[key]
    step = item.step
    if step is not None and step != 1:
        raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
    return self.slice(start=item.start, stop=item.stop)
197
+
198
def __len__(self):
    """Return the number of entries in ``self.positions``."""
    positions = self.positions
    return len(positions)
200
+
201
def __hash__(self):
    """Hash the same fields that ``__eq__`` compares (positions and tail).

    ``self.positions`` is used as a mapping elsewhere in this class and mappings
    such as ``dict``/``OrderedDict`` are not hashable, so the hash is computed
    from its items instead. A frozenset keeps the hash independent of item order,
    so objects that compare equal always hash equal.
    """
    return hash((frozenset(self.positions.items()), self.tail))
203
+
204
def __eq__(self, other):
    """Compare two chains by their ``positions`` and ``tail`` only.

    Name, metadata and CDR definitions are not compared here. Comparing with
    anything other than a Chain fails the assertion below.
    """
    assert isinstance(other, Chain), f'Can only compare Chain to another Chain, got {type(other)}: {other}'
    same_positions = self.positions == other.positions
    return same_positions and self.tail == other.tail
208
+
209
@classmethod
def to_fasta(cls, chains, path_or_fd, keep_tail=False, description=''):
    """Save one chain or an iterable of chains to FASTA (``fasta-2line`` format).

    :param chains: a single Chain or an iterable of Chain objects
    :param path_or_fd: target passed to ``SeqIO.write``
    :param keep_tail: forwarded to :meth:`Chain.to_seq_record`
    :param description: forwarded to :meth:`Chain.to_seq_record`
    :return: result of ``SeqIO.write``
    """
    record_kwargs = dict(keep_tail=keep_tail, description=description)
    if not isinstance(chains, Chain):
        # Lazily convert each chain while SeqIO consumes the generator
        records = (chain.to_seq_record(**record_kwargs) for chain in chains)
    else:
        records = chains.to_seq_record(**record_kwargs)
    return SeqIO.write(records, path_or_fd, 'fasta-2line')
217
+
218
@classmethod
def from_fasta(cls, path_or_handle, scheme, cdr_definition=None, as_series=False, as_generator=False, **kwargs) -> Union[List['Chain'], pd.Series, Generator['Chain', None, None]]:
    """Read multiple chains from FASTA

    :param path_or_handle: source passed to ``SeqIO.parse``
    :param scheme: numbering scheme for every created chain
    :param cdr_definition: CDR definition for every created chain
    :param as_series: return a ``pd.Series`` indexed by chain name
    :param as_generator: return a lazy generator (takes precedence over ``as_series``)
    :param kwargs: additional arguments passed to the constructor
    :return: list, Series or generator of chains
    """
    records = SeqIO.parse(path_or_handle, 'fasta')
    generator = (
        cls(record.seq, name=record.name, scheme=scheme, cdr_definition=cdr_definition, **kwargs)
        for record in records
    )
    if as_generator:
        return generator
    chains = list(generator)
    if not as_series:
        return chains
    names = [c.name for c in chains]
    return pd.Series(chains, index=names)
229
+
230
def to_seq_record(self, keep_tail=False, description=''):
    """Create BioPython SeqRecord object from this Chain

    :param keep_tail: append ``self.tail`` to the sequence
    :param description: record description
    :raises ValueError: when the chain has no name (used as record id)
    """
    if not self.name:
        raise ValueError('Name needs to be present to convert to a SeqRecord')
    if keep_tail:
        residues = self.seq + self.tail
    else:
        residues = self.seq
    return SeqRecord(Seq(residues), id=self.name, description=description)
236
+
237
@classmethod
def to_anarci_csv(cls, chains: List['Chain'], path):
    """Save multiple chains to ANARCI-like CSV

    Builds the table with :meth:`Chain.to_dataframe` and writes it using ``DataFrame.to_csv``.
    """
    table = cls.to_dataframe(chains)
    table.to_csv(path)
242
+
243
@classmethod
def to_dataframe(cls, chains: List['Chain']):
    """Produce a Pandas dataframe with aligned chain sequences in the columns

    Note: Contains only positions (columns) that are present in the provided chains,
    so number of columns can differ based on the input.

    Property columns (non-position entries of :meth:`Chain.to_series`) come first,
    in the order they are first encountered, followed by position columns in
    sorted Position order. Missing values are filled with ``'-'``.

    :param chains: chains to convert, one row per chain
    :return: DataFrame with index named ``Id``
    """
    series_list = [chain.to_series() for chain in chains]

    # Collect unique columns in first-seen order (a plain set would make the
    # order of the property columns depend on string hashing)
    columns = list(dict.fromkeys(c for series in series_list for c in series.index))
    prop_columns = [c for c in columns if not isinstance(c, Position)]
    # Each chain can have a different set of positions, so we need to sort
    # the columns using the correct Position sorting
    position_columns = sorted(c for c in columns if isinstance(c, Position))
    # Columns can come from K and L chain, so we need to convert them to string and remove duplicates here
    position_columns_str = list(dict.fromkeys(pos.format(chain_type=False) for pos in position_columns))

    # Get full list of string columns
    columns_str = prop_columns + position_columns_str

    # Reindex each series using ordered list of string columns
    series_list_ordered = []
    for series in series_list:
        # Only Position labels are reformatted; property labels are kept as they are
        series.index = [
            label.format(chain_type=False) if isinstance(label, Position) else label
            for label in series.index
        ]
        series_list_ordered.append(series.reindex(columns_str))

    df = pd.DataFrame(series_list_ordered)[columns_str].fillna('-')
    df.index.name = 'Id'

    return df
276
+
277
def to_series(self):
    """Return a ``pd.Series`` with ``chain_type``, ``species`` and one entry per position.

    The series is named after the chain.
    """
    values = {'chain_type': self.chain_type, 'species': self.species}
    # Position entries follow the property entries
    values.update(self.positions)
    return pd.Series(values, name=self.name)
283
+
284
@classmethod
def from_series(cls, series, scheme, cdr_definition=None) -> 'Chain':
    """Create a chain from a series as produced by :meth:`Chain.to_series`.

    Index labels starting with a digit are treated as positions; values equal
    to ``'-'`` or missing (NaN) are skipped. The chain is created with an empty tail.

    :param series: series with a ``chain_type`` entry, optional ``species`` entry and position entries
    :param scheme: numbering scheme of the position labels
    :param cdr_definition: CDR definition for the created chain
    """
    chain_type = series['chain_type']
    species = series.get('species')
    position_labels = [label for label in series.index if label[:1].isnumeric()]
    aa_dict = {}
    for label, aa in series[position_labels].items():
        if aa == '-' or pd.isna(aa):
            continue
        key = Position.from_string(label, chain_type=chain_type, scheme=scheme)
        aa_dict[key] = aa
    return cls(sequence=None, aa_dict=aa_dict, name=series.name, scheme=scheme, cdr_definition=cdr_definition,
               chain_type=chain_type, species=species, tail='')
293
+
294
@classmethod
def from_anarci_csv(cls, path, scheme, cdr_definition=None, as_series=False) -> Union[List['Chain'], pd.Series]:
    """Read chains from a CSV file whose first column is the row index.

    The table is loaded with ``pd.read_csv`` and converted by :meth:`Chain.from_dataframe`.
    """
    table = pd.read_csv(path, index_col=0)
    return cls.from_dataframe(table, scheme=scheme, cdr_definition=cdr_definition, as_series=as_series)
298
+
299
@classmethod
def from_dataframe(cls, df, scheme, cdr_definition=None, as_series=False) -> Union[List['Chain'], pd.Series]:
    """Create one chain per dataframe row using :meth:`Chain.from_series`.

    :param as_series: return a ``pd.Series`` indexed by chain name instead of a list
    """
    chains = []
    for _, row in df.iterrows():
        chains.append(cls.from_series(row, scheme=scheme, cdr_definition=cdr_definition))
    if not as_series:
        return chains
    return pd.Series(chains, index=[c.name for c in chains])
305
+
306
def format(self, method='wide', **kwargs):
    """Format sequence to string

    :param method: use ``"wide"`` for :meth:`Chain.format_wide` or ``"tall"`` for :meth:`Chain.format_tall()`
    :param kwargs: passed to the selected formatting method
    :return: formatted string
    :raises ValueError: for an unknown method
    """
    if method == 'wide':
        return self.format_wide(**kwargs)
    if method != 'tall':
        raise ValueError(f'Use method="wide" or method="tall", unknown method: "{method}"')
    return self.format_tall(**kwargs)
317
+
318
def print(self, method='wide', **kwargs):
    """Print string representation using :meth:`Chain.format`

    By default, produces "wide" format with sequence on first line and CDR regions highlighted with ``^`` on second line:

    >>> chain.print()
    QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
    ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^

    :param method: use ``"wide"`` for :meth:`Chain.format_wide` or ``"tall"`` for :meth:`Chain.format_tall()`
    :param kwargs: passed to :meth:`Chain.format`
    """
    text = self.format(method=method, **kwargs)
    print(text)
330
+
331
def format_tall(self, columns=5):
    """Create string with one position per line, showing position numbers and amino acids

    Positions are laid out column by column; each column is 15 characters wide.
    CDR positions are prefixed with their region name.

    :param columns: number of columns used to compute the height of the layout
    :return: formatted string (empty string for a chain without positions)
    """
    total = len(self)
    if total == 0:
        # height would be 0 and range() rejects a zero step
        return ''
    height = int(np.ceil(total / columns))
    rows = [''] * height
    for column, start in enumerate(range(0, total, height)):
        chain_slice = self.raw[start:start+height]
        for row, (pos, aa) in enumerate(chain_slice):
            # Pad the row so that this column starts at a fixed offset
            rows[row] = rows[row].ljust(column * 15)
            pos_format = (pos.get_region() + ' ' if pos.is_in_cdr() else '') + pos.format()
            rows[row] += f'{pos_format.rjust(9)} {aa}'

    return '\n'.join(rows)
346
+
347
def print_tall(self, columns=5):
    """Print string representation using :meth:`Chain.format_tall`

    >>> chain.print_tall()
    FR1 H1 Q
    FR1 H2 V
    FR1 H3 Q
    FR1 H4 L
    FR1 H5 Q
    FR1 H6 Q
    FR1 H7 S
    ...

    :param columns: passed to :meth:`Chain.format_tall`
    """
    text = self.format_tall(columns=columns)
    print(text)
361
+
362
def format_wide(self, numbering=False):
    """Create string with sequence on first line and CDR regions highlighted with `^` on second line

    With ``numbering=True`` up to three header lines are added above the sequence:
    the leading digits of each position number (``number // 10``), the last digit
    (``number % 10``) and, only when at least one position has a letter, the
    position letters.

    :param numbering: Add position numbers on top
    :return: formatted string
    """
    lines = []
    if numbering:

        # Header line with the leading digits, written once where they change
        first_order = ''
        prev_number = None
        after_double_digit = False
        for pos in self.positions:
            number = str(pos.number // 10)
            if number != prev_number:
                if after_double_digit:
                    # Special case: when double digits follow another double digits, do not print the first digit
                    number = number[1:]
                first_order += number
                if len(number) > 1:
                    after_double_digit = True
            else:
                if after_double_digit:
                    # Special case: After 10, 11, etc, skip adding the space
                    after_double_digit = False
                else:
                    first_order += ' '
            # NOTE(review): after the truncation above this stores the shortened
            # string, so the next position compares against it - confirm intended
            prev_number = number

        lines.append(first_order)
        # Header line with the last digit of every position number
        lines.append(''.join(str(pos.number % 10) for pos in self.positions))
        letters = ''.join(pos.letter or ' ' for pos in self.positions)
        if letters.strip():
            lines.append(letters)
    lines.append(self.seq)
    if self.cdr_definition == 'kabat':
        # For kabat, positions outside CDRs for which is_in_vernier() is true are marked with "°"
        lines.append(''.join('^' if pos.is_in_cdr() else ("°" if pos.is_in_vernier() else ' ') for pos in self.positions))
    else:
        lines.append(''.join('^' if pos.is_in_cdr() else ' ' for pos in self.positions))
    return '\n'.join(lines)
402
+
403
def print_wide(self, numbering=False):
    """Print string representation using :meth:`Chain.format_wide`

    >>> chain.print_wide()
    QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
    ^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^

    :param numbering: passed to :meth:`Chain.format_wide`
    """
    text = self.format_wide(numbering=numbering)
    print(text)
411
+
412
def is_heavy_chain(self):
    """Check if this chain is heavy chain (``chain_type=="H"``)"""
    chain_type = self.chain_type
    return chain_type == 'H'
415
+
416
def is_light_chain(self):
    """Check if this chain is light chain (``chain_type=="K" or chain_type=="L"``)"""
    is_lambda = self.is_lambda_light_chain()
    # Kappa is only checked when the chain is not lambda
    return is_lambda if is_lambda else self.is_kappa_light_chain()
419
+
420
def is_lambda_light_chain(self):
    """Check if this chain is lambda light chain (``chain_type=="L"``)"""
    chain_type = self.chain_type
    return chain_type == 'L'
423
+
424
def is_kappa_light_chain(self):
    """Check if this chain is kappa light chain (``chain_type=="K"``)"""
    chain_type = self.chain_type
    return chain_type == 'K'
427
+
428
def align(self, *other) -> 'Alignment':
    """Align this chain to other chains by using their existing numbering

    >>> from abnumber import Chain
    >>>
    >>> seq1 = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSAKTTAP'
    >>> chain1 = Chain(seq1, scheme='imgt')
    >>>
    >>> seq2 = 'QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYDDYLDRWGQGTTLTVSSAKTTAP'
    >>> chain2 = Chain(seq2, scheme='imgt')
    >>>
    >>> alignment = chain1.align(chain2)
    >>> print(alignment.format())
    QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPS-RGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
    ||||.||||||.||||+|||||||||||.||||||||||||||||+||||||||.|.||||||||||||||||||||||||||.+|||||||||||||||||....||.|||||||||||
    QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYD--DYLDRWGQGTTLTVSS
    ^^^^^^^^ ^^^^^^^^^ ^^^^^^^^^^^^

    :param other: The :class:`Chain` object to align, can be repeated to create a multiple sequence alignment
    :return: :class:`Alignment` object
    """
    # Position -> residue mapping of every chain, this chain first
    mappings = [self.positions]
    for candidate in other:
        assert isinstance(candidate, Chain), f'Expected Chain object, got {type(candidate)}: {candidate}'
        mappings.append(candidate.positions)

    unique_cdr_definitions = {pos.cdr_definition for mapping in mappings for pos in mapping.keys()}
    assert len(unique_cdr_definitions) <= 1, f'Aligned chains should use the same CDR definitions, got: {unique_cdr_definitions}'

    # Union of all positions in sorted order; chains lacking a position get a gap
    shared_pos = sorted({pos for mapping in mappings for pos in mapping.keys()})
    residues = []
    for pos in shared_pos:
        residues.append(tuple(mapping.get(pos, '-') for mapping in mappings))
    return Alignment(shared_pos, residues, chain_type=self.chain_type, scheme=self.scheme)
460
+
461
def clone(self, replace_seq: str = None):
    """Create a copy of this chain, optionally with a replacement sequence

    Implemented as an unbounded :meth:`Chain.slice`.

    :param replace_seq: Optional replacement sequence, needs to be the same length
    :return: new Chain object
    """
    copied = self.slice(replace_seq=replace_seq)
    return copied
468
+
469
def slice(self, replace_seq: str = None, start: Union[str, int, 'Position'] = None,
          stop: Union[str, int, 'Position'] = None, stop_inclusive: bool = True, allow_raw: bool = False):
    """Create a slice of this chain, optionally with a replacement sequence that is placed into the same numbering

    You can also slice directly using ``chain['111':'112A']`` or ``chain.raw[10:20]``.

    :param replace_seq: Optional replacement sequence, needs to be the same length as the whole (unsliced) chain
    :param start: Optional slice start position (inclusive), :class:`Position` or string (e.g. '111A')
    :param stop: Optional slice stop position, :class:`Position` or string (e.g. '112A');
                 inclusive unless ``stop_inclusive`` is False
    :param stop_inclusive: Include stop position in slice
    :param allow_raw: Allow unaligned numeric indexing from 0 to length of sequence - 1
    :return: new Chain object
    """
    positions = self.positions
    if replace_seq is not None:
        assert len(replace_seq) == len(positions), 'Sequence needs to be the same length'

    start_pos = None
    start_past_end = False
    if start is not None:
        start_pos = self._parse_position(start, allow_raw=allow_raw)
        # _parse_position returns None for a raw index beyond the last residue. For a start bound
        # that means "nothing left to select" - it must not be confused with "no start bound",
        # otherwise chain.raw[999:] would return the whole chain.
        start_past_end = start_pos is None
    # A stop bound beyond the last residue (None) correctly degrades to "no upper bound"
    stop_pos = self._parse_position(stop, allow_raw=allow_raw) if stop is not None else None

    aa_dict = {}
    if not start_past_end:
        for i, (pos, aa) in enumerate(positions.items()):
            if start_pos is not None and pos < start_pos:
                continue
            if stop_pos is not None and (pos > stop_pos or (not stop_inclusive and pos >= stop_pos)):
                break
            # i indexes the full chain, so the replacement residue stays aligned with its position
            aa_dict[pos] = replace_seq[i] if replace_seq is not None else aa

    return Chain(
        sequence=None,
        aa_dict=aa_dict,
        name=self.name,
        scheme=self.scheme,
        chain_type=self.chain_type,
        cdr_definition=self.cdr_definition,
        tail=self.tail,
        species=self.species,
        v_gene=self.v_gene,
        j_gene=self.j_gene
    )
509
+
510
def renumber(self, scheme=None, cdr_definition=None, allowed_species=None):
    """Return a copy of this chain re-aligned with a different numbering scheme or CDR definition

    The new chain is built from the variable sequence plus the tail of this chain.
    Germline assignment is requested only when this chain already has a V gene assigned.

    :param scheme: Change numbering scheme: One of ``imgt``, ``chothia``, ``kabat``, ``aho``.
    :param cdr_definition: Change CDR definition scheme: One of ``imgt``, ``chothia``, ``kabat``, ``north``.
    :param allowed_species: ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
    """
    full_sequence = self.seq + self.tail
    target_scheme = scheme or self.scheme
    # An explicit CDR definition wins; otherwise follow the requested scheme; otherwise keep the current one
    target_cdr_definition = cdr_definition or scheme or self.cdr_definition
    had_germline = self.v_gene is not None

    return Chain(
        full_sequence,
        name=self.name,
        allowed_species=allowed_species,
        scheme=target_scheme,
        cdr_definition=target_cdr_definition,
        assign_germline=had_germline
    )
526
+
527
def graft_cdrs_onto(self, other: 'Chain', backmutate_vernier=False, backmutations: List[Union['Position',str]] = None, name: str = None) -> 'Chain':
    """Graft CDRs from this Chain onto another chain

    :param other: Chain to graft CDRs into (source of frameworks and tail sequence)
    :param backmutate_vernier: Also graft all Kabat Vernier positions from this chain (perform backmutations)
    :param backmutations: List of positions that should additionally be grafted from this chain (str or :class:`Position`),
                          ``None`` (default) means no additional positions
    :param name: Name of new Chain. If not provided, use name of this chain.
    :return: Chain with CDRs grafted from this chain and frameworks from the given chain
    """
    assert self.scheme == other.scheme, \
        f'Sequences need to have the same numbering scheme, got {self.scheme} and {other.scheme}'
    assert self.cdr_definition == other.cdr_definition, \
        f'Sequences need to have the same CDR definition, got {self.cdr_definition} and {other.cdr_definition}'
    assert self.chain_type == other.chain_type, \
        f'Sequences need to have the same chain type, got {self.chain_type} and {other.chain_type}'

    # None replaces the former shared mutable default ([]); normalize every entry to a Position key
    backmutations = [self._parse_position(pos) for pos in (backmutations or [])]

    # Start from the framework residues of the acceptor chain...
    grafted_dict = {pos: aa for pos, aa in other if not pos.is_in_cdr()}
    # ...then overlay CDRs (and any requested backmutations) from this chain
    for pos, aa in self:
        if pos.is_in_cdr() or (backmutate_vernier and pos.is_in_vernier()) or pos in backmutations:
            grafted_dict[pos] = aa

    return Chain(sequence=None, aa_dict=grafted_dict, name=name or self.name, chain_type=self.chain_type,
                 scheme=self.scheme, cdr_definition=self.cdr_definition, tail=other.tail,
                 v_gene=other.v_gene, j_gene=other.j_gene)
553
+
554
def graft_cdrs_onto_human_germline(self, v_gene=None, j_gene=None,
                                   backmutate_vernier=False, backmutations: List[Union['Position',str]] = None):
    """Graft CDRs from this Chain onto the nearest human germline sequence

    :param v_gene: Use defined V germline allele (e.g. IGHV1-18*01), gene (e.g. IGHV1-18) or family (e.g. IGHV1)
    :param j_gene: Use defined J germline allele (e.g. IGHJ1*01) or gene (e.g. IGHJ1)
    :param backmutate_vernier: Also graft all Kabat Vernier positions from this chain (perform backmutations)
    :param backmutations: List of positions that should additionally be grafted from this chain (str or :class:`Position`),
                          ``None`` (default) means no additional positions
    :return: Chain with CDRs grafted from this chain and frameworks from the merged V/J human germline
             returned by :meth:`find_merged_human_germline`
    """
    # None replaces the former shared mutable default ([]); always hand a real list downstream
    backmutations = [] if backmutations is None else backmutations

    germline_chain = self.find_merged_human_germline(v_gene=v_gene, j_gene=j_gene)

    # The germline is skipped from renumbering only when this chain is IMGT/IMGT;
    # otherwise bring it into this chain's scheme and CDR definition before grafting
    if self.scheme != 'imgt' or self.cdr_definition != 'imgt':
        germline_chain = germline_chain.renumber(self.scheme, self.cdr_definition)

    return self.graft_cdrs_onto(germline_chain, backmutate_vernier=backmutate_vernier, backmutations=backmutations)
570
+
571
def _parse_position(self, position: Union[int, str, 'Position'], allow_raw=False):
    """Create :class:`Position` key object from string or int.

    Note: The position should only be used for indexing, CDR definition is not preserved!

    :param position: Numeric or string position representation
    :param allow_raw: Also allow unaligned numeric (int) indexing from 0 to length of sequence - 1
    :return: new Position object, should only be used for indexing, CDR definition is not preserved!
             ``None`` when a raw index lies beyond the last position of this chain
    :raises IndexError: if the key cannot be interpreted, or is numeric while ``allow_raw`` is False
    """
    if isinstance(position, str):
        return Position.from_string(position, chain_type=self.chain_type, scheme=self.scheme)
    if isinstance(position, Position):
        return position
    try:
        position = int(position)
    except (TypeError, ValueError):
        # int() raises ValueError (not TypeError) for values such as NaN or non-numeric bytes;
        # report both the same way so callers only ever see IndexError for a bad key
        raise IndexError(f'Invalid position key, expected Position, string or integer, got {type(position)}: "{position}"')
    if not allow_raw:
        raise IndexError("Use chain.raw[i] for raw numeric indexing or pass allow_raw=True. "
                         "For named position indexing, use string (e.g. chain['111A'] or chain['H111A'])")
    if position >= len(self.positions):
        # Past the end of the chain: signalled with None instead of an exception
        return None
    return self.get_position_by_raw_index(position)
594
+
595
def get_position_by_raw_index(self, index):
    """Return the Position key stored at the given raw numeric index of ``self.positions``"""
    ordered_positions = list(self.positions)
    return ordered_positions[index]
598
+
599
def find_human_germlines(self, limit=10, v_gene=None, j_gene=None, unique=True) -> Tuple[List['Chain'], List['Chain']]:
    """Find the V and J germline sequences closest to this chain, compared in IMGT numbering

    Germlines are ranked by the number of mutations in their alignment with this chain (fewest first).

    :param limit: Number of best matching germlines to return
    :param v_gene: Filter germlines to specific V gene name (matched as a name prefix)
    :param j_gene: Filter germlines to specific J gene name (matched as a name prefix)
    :param unique: Skip germlines with duplicate amino acid sequence
    :return: list of top V chains, list of top J chains
    :raises NotImplementedError: when a kappa V gene is requested for a lambda chain or vice versa
    :raises ValueError: when the V or J gene filter matches no germline
    """
    from abnumber.germlines import get_imgt_v_chains, get_imgt_j_chains

    # Germline chains are IMGT-numbered, so compare against an IMGT version of this chain
    if self.scheme == 'imgt' and self.cdr_definition == 'imgt':
        chain = self
    else:
        chain = self.renumber('imgt')

    v_chains = list(get_imgt_v_chains(chain.chain_type).values())
    j_chains = list(get_imgt_j_chains(chain.chain_type).values())

    if v_gene:
        if v_gene.startswith('IGKV') and self.chain_type == 'L':
            raise NotImplementedError('Cannot graft lambda chain into kappa chain')
        if v_gene.startswith('IGLV') and self.chain_type == 'K':
            raise NotImplementedError('Cannot graft kappa chain into lambda chain')
        v_chains = [candidate for candidate in v_chains if candidate.name.startswith(v_gene)]
        if not v_chains:
            print('Available V genes:', get_imgt_v_chains(chain.chain_type).keys())
            raise ValueError(f'No V genes found for "{chain.chain_type}" chain gene name "{v_gene}"')

    if j_gene:
        j_chains = [candidate for candidate in j_chains if candidate.name.startswith(j_gene)]
        if not j_chains:
            print('Available J genes:', get_imgt_j_chains(chain.chain_type).keys())
            raise ValueError(f'No J genes found for "{chain.chain_type}" chain gene name "{j_gene}"')

    if unique:
        v_chains = _get_unique_chains(v_chains)
        j_chains = _get_unique_chains(j_chains)

    def _closest(germlines):
        # Stable sort keeps the original germline order among equally distant candidates
        mutation_counts = np.array([chain.align(germline).num_mutations() for germline in germlines])
        best_indices = mutation_counts.argsort(kind='stable')[:limit]
        return [germlines[r] for r in best_indices]

    top_v_chains = _closest(v_chains)
    top_j_chains = _closest(j_chains)
    return top_v_chains, top_j_chains
643
+
644
def find_merged_human_germline(self, top=0, v_gene=None, j_gene=None) -> 'Chain':
    """Find n-th closest V and J germline sequence based on IMGT alignment and merge them into one Chain

    :param top: Rank of the germline to return (0-indexed, 0 is the closest one); used for both V and J
    :param v_gene: Filter germlines to specific V gene name
    :param j_gene: Filter germlines to specific J gene name
    :return: merged germline sequence Chain object (IMGT scheme, empty tail)
    """
    # Only the first top+1 candidates are needed to reach rank `top`
    v_chains, j_chains = self.find_human_germlines(limit=top+1, v_gene=v_gene, j_gene=j_gene)
    v_chain = v_chains[top]
    j_chain = j_chains[top]

    # J residues go in first; V residues are written second, so V wins at any shared position
    merged_dict = {}
    for pos, aa in j_chain:
        merged_dict[pos] = aa
    for pos, aa in v_chain:
        merged_dict[pos] = aa

    return Chain(
        sequence=None,
        aa_dict=merged_dict,
        chain_type=self.chain_type,
        scheme='imgt',
        tail=''
    )
668
+
669
@property
def raw(self):
    """Accessor for unaligned numeric indexing and slicing of this chain

    String keys such as ``chain['1']`` or ``chain['1':'10']`` follow the numbering scheme
    and the slice end is inclusive.

    Integer keys on ``chain.raw`` such as ``chain.raw[0]`` or ``chain.raw[0:10]`` count residues
    from 0 in sequence order and the slice end is exclusive (Python style).

    :return: :class:`RawChainAccessor` wrapping this chain, which can be indexed or sliced with integers
    """
    accessor = RawChainAccessor(self)
    return accessor
689
+
690
@property
def regions(self):
    """Ordered dictionary of the seven region dictionaries of this chain

    Keys are uppercase region names in sequence order:
    ``"FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4"``

    :return: Dictionary of Region name -> Dictionary of (:class:`Position` -> Amino acid)
    """
    named_regions = [
        ('FR1', self.fr1_dict),
        ('CDR1', self.cdr1_dict),
        ('FR2', self.fr2_dict),
        ('CDR2', self.cdr2_dict),
        ('FR3', self.fr3_dict),
        ('CDR3', self.cdr3_dict),
        ('FR4', self.fr4_dict),
    ]
    return OrderedDict(named_regions)
707
+
708
@property
def positions(self):
    """Dictionary of :class:`Position` -> Amino acid, merged from all regions in region order"""
    merged = OrderedDict()
    for region_dict in self.regions.values():
        merged.update(region_dict)
    return merged
716
+
717
@property
def seq(self):
    """Unaligned string representation of the variable chain sequence

    :return: Amino acids of all positions joined in order, without gaps
    """
    residues = self.positions.values()
    return ''.join(residues)
724
+
725
@property
def fr1_seq(self):
    """Framework 1 region residues joined into an unaligned string"""
    residues = self.fr1_dict.values()
    return ''.join(residues)
729
+
730
@property
def cdr1_seq(self):
    """CDR 1 region residues joined into an unaligned string"""
    residues = self.cdr1_dict.values()
    return ''.join(residues)
734
+
735
@property
def fr2_seq(self):
    """Framework 2 region residues joined into an unaligned string"""
    residues = self.fr2_dict.values()
    return ''.join(residues)
739
+
740
@property
def cdr2_seq(self):
    """CDR 2 region residues joined into an unaligned string"""
    residues = self.cdr2_dict.values()
    return ''.join(residues)
744
+
745
@property
def fr3_seq(self):
    """Framework 3 region residues joined into an unaligned string"""
    residues = self.fr3_dict.values()
    return ''.join(residues)
749
+
750
@property
def cdr3_seq(self):
    """CDR 3 region residues joined into an unaligned string"""
    residues = self.cdr3_dict.values()
    return ''.join(residues)
754
+
755
@property
def fr4_seq(self):
    """Framework 4 region residues joined into an unaligned string"""
    residues = self.fr4_dict.values()
    return ''.join(residues)
759
+
760
+
761
class RawChainAccessor:
    """Integer-based view of a Chain: indexes and slices by raw (0-based, unaligned) residue index"""

    def __init__(self, chain: Chain):
        self.chain = chain

    def __getitem__(self, item):
        """Return ``chain[position]`` for an int index, or a sliced Chain for a slice with exclusive stop

        :raises IndexError: for non-integer indices or bounds, or a slice step other than 1
        """
        if not isinstance(item, slice):
            if not is_integer(item):
                raise IndexError(f'Expected int indexing for chain.raw, got {type(item)}: {item}')
            return self.chain[self.chain.get_position_by_raw_index(item)]

        step, start, stop = item.step, item.start, item.stop
        if step is not None and step != 1:
            raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
        if start is not None and not is_integer(start):
            raise IndexError(f'Expected int start index for chain.raw, got {type(start)}: {start}')
        if stop is not None and not is_integer(stop):
            raise IndexError(f'Expected int end index for chain.raw, got {type(stop)}: {stop}')
        # Python-style slicing: the stop index is exclusive
        return self.chain.slice(start=start, stop=stop, stop_inclusive=False, allow_raw=True)
778
+
779
+
780
+
781
+
abnumber/common.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from typing import List, Tuple
3
+ import re
4
+ import numpy as np
5
+ from abnumber.exceptions import ChainParseError
6
+ try:
7
+ from anarci.anarci import anarci
8
+ except ImportError:
9
+ # ANARCI is required: print installation instructions and abort the import with a non-zero exit status
10
+ print('ANARCI module not available. Please install it separately or install AbNumber through Bioconda')
11
+ print('See: https://abnumber.readthedocs.io/')
12
+ sys.exit(1)
13
+
14
+ POS_REGEX = re.compile(r'([HL]?)(\d+)([A-Z]?)')
15
+ WHITESPACE = re.compile(r'\s+')
16
+
17
+
18
def _validate_chain_type(chain_type):
    """Check that ``chain_type`` is one of the supported single-letter chain types

    :param chain_type: ``"H"`` (heavy), ``"L"`` (lambda light chain) or ``"K"`` (kappa light chain)
    :raises AssertionError: for any other value
    """
    assert chain_type in ['H', 'L', 'K'], \
        f'Invalid chain type "{chain_type}", it should be "H" (heavy), "L" (lambda light chain) or "K" (kappa light chain)'
21
+
22
+
23
def _anarci_align(sequence, scheme, allowed_species, assign_germline=False) -> List[Tuple]:
    """Number a sequence with ANARCI and return one result tuple per ANARCI result entry for the sequence

    :param sequence: Amino acid sequence, whitespace is removed before alignment
    :param scheme: Numbering scheme passed to ANARCI
    :param allowed_species: Species restriction passed to ANARCI
    :param assign_germline: Also ask ANARCI for germline genes; V/J genes are ``None`` otherwise
    :return: list of ``(aa_dict, chain_type, tail, species, v_gene, j_gene)`` tuples, where ``aa_dict``
             maps :class:`Position` -> amino acid (gaps skipped) and ``tail`` is the sequence after the numbered part
    :raises ChainParseError: when ANARCI returns no numbering for the sequence
    """
    from abnumber.position import Position

    sequence = WHITESPACE.sub('', sequence)
    all_numbered, all_ali, _all_hits = anarci(
        [('id', sequence)],
        scheme=scheme,
        allowed_species=allowed_species,
        assign_germline=assign_germline
    )
    # A single sequence was submitted, so only the first entry of each output is relevant
    seq_numbered, seq_ali = all_numbered[0], all_ali[0]
    if seq_numbered is None:
        raise ChainParseError(f'Variable chain sequence not recognized: "{sequence}"')
    assert len(seq_numbered) == len(seq_ali), 'Unexpected ANARCI output'

    results = []
    for (numbering, _start, end), ali in zip(seq_numbered, seq_ali):
        chain_type = ali['chain_type']
        species = ali['species']
        if assign_germline:
            v_gene = ali['germlines']['v_gene'][0][1]
            j_gene = ali['germlines']['j_gene'][0][1]
        else:
            v_gene = None
            j_gene = None
        aa_dict = {}
        for (num, letter), aa in numbering:
            if aa == '-':
                # Gap in the numbering, no residue at this position
                continue
            aa_dict[Position(chain_type=chain_type, number=num, letter=letter, scheme=scheme)] = aa
        tail = sequence[end+1:]
        results.append((aa_dict, chain_type, tail, species, v_gene, j_gene))
    return results
48
+
49
+
50
def _get_unique_chains(chains):
    """Return chains with distinct ``seq`` values, keeping the first chain seen for each sequence"""
    seen_seqs = set()
    unique_chains = []
    for candidate in chains:
        if candidate.seq not in seen_seqs:
            seen_seqs.add(candidate.seq)
            unique_chains.append(candidate)
    return unique_chains
59
+
60
+
61
+ # Based on positive score in Blosum62
62
+ SIMILAR_PAIRS = {'AA', 'AS', 'CC', 'DD', 'DE', 'DN', 'ED', 'EE', 'EK', 'EQ', 'FF', 'FW', 'FY', 'GG', 'HH', 'HN', 'HY',
63
+ 'II', 'IL', 'IM', 'IV', 'KE', 'KK', 'KQ', 'KR', 'LI', 'LL', 'LM', 'LV', 'MI', 'ML', 'MM', 'MV', 'ND',
64
+ 'NH', 'NN', 'NS', 'PP', 'QE', 'QK', 'QQ', 'QR', 'RK', 'RQ', 'RR', 'SA', 'SN', 'SS', 'ST', 'TS', 'TT',
65
+ 'VI', 'VL', 'VM', 'VV', 'WF', 'WW', 'WY', 'YF', 'YH', 'YW', 'YY'}
66
+
67
+
68
def is_similar_residue(a, b):
    """Tell whether two residues count as similar

    A gap (``'-'``) is similar only to another gap; two amino acids are similar
    when their concatenation is listed in ``SIMILAR_PAIRS``.
    """
    involves_gap = a == '-' or b == '-'
    if involves_gap:
        return a == b
    pair = a + b
    return pair in SIMILAR_PAIRS
72
+
73
+
74
def is_integer(object):
    """Return True for Python ``int`` values and NumPy integer scalars"""
    return isinstance(object, (int, np.integer))
76
+
77
+
78
+ SUPPORTED_SCHEMES = ['imgt', 'aho', 'chothia', 'kabat']
79
+ SUPPORTED_CDR_DEFINITIONS = ['imgt', 'chothia', 'kabat', 'north']
80
+
81
SCHEME_BORDERS = {
    # First position number of: CDR1, FR2, CDR2, FR3, CDR3, FR4, followed by the exclusive end of FR4
    # (FR1 always starts at position 1)
    'imgt':      [27, 39, 56, 66, 105, 118, 129],
    'kabat_H':   [31, 36, 50, 66, 95, 103, 114],
    'kabat_K':   [24, 35, 50, 57, 89, 98, 108],
    'kabat_L':   [24, 35, 50, 57, 89, 98, 108],
    'chothia_H': [26, 33, 52, 57, 95, 103, 114],
    'chothia_K': [24, 35, 50, 57, 89, 98, 108],
    'chothia_L': [24, 35, 50, 57, 89, 98, 108],
    'north_H':   [23, 36, 50, 59, 93, 103, 114],
    'north_K':   [24, 35, 49, 57, 89, 98, 108],
    'north_L':   [24, 35, 49, 57, 89, 98, 108],
}

# { scheme -> { region -> list of position numbers } }
# Each region spans from its own start border up to (not including) the next border
SCHEME_REGIONS = {
    scheme: {
        region: list(range(first, after_last))
        for region, first, after_last in zip(
            ('FR1', 'CDR1', 'FR2', 'CDR2', 'FR3', 'CDR3', 'FR4'),
            [1] + borders[:6],
            borders[:7],
        )
    } for scheme, borders in SCHEME_BORDERS.items()
}

# { scheme -> { position number -> region } }
SCHEME_POSITION_TO_REGION = {
    scheme: dict((pos_num, region) for region, positions in regions.items() for pos_num in positions)
    for scheme, regions in SCHEME_REGIONS.items()
}
114
+
115
+ # { scheme -> set of vernier position numbers }
116
+ SCHEME_VERNIER = {
117
+ # 'imgt_H': frozenset([2, 52, 53, 54, 76, 78, 80, 82, 87, 118]),
118
+ # 'chothia_H': frozenset([2, 47, 48, 49, 67, 69, 71, 73, 78, 93, 94, 103]),
119
+ # 'north_H': frozenset([2, 47, 48, 49, 67, 69, 71, 73, 78, 93, 94, 103]),
120
+ 'kabat_H': frozenset([2, 27, 28, 29, 30, 47, 48, 49, 67, 69, 71, 73, 78, 93, 94, 103]),
121
+
122
+ # 'imgt_K': frozenset([2, 4, 41, 42, 52, 53, 54, 55, 78, 80, 84, 85, 87, 118]),
123
+ # 'imgt_L': frozenset([2, 4, 41, 42, 52, 53, 54, 55, 78, 80, 84, 85, 87, 118]),
124
+ # 'chothia_K': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
125
+ # 'chothia_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
126
+ # 'north_K': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
127
+ # 'north_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
128
+ 'kabat_K': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
129
+ 'kabat_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
130
+ }
131
+
132
+ #'kabat_H': 31-35, 50-65, 95-102
133
+ #'kabat_K': 24-34, 50-56, 89-97
abnumber/exceptions.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
class ChainParseError(Exception):
    """Raised when a sequence is not recognized as an antibody variable chain"""
abnumber/germlines.py ADDED
@@ -0,0 +1,684 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _HUMAN_IMGT_V_CHAINS = None
2
+ _HUMAN_IMGT_J_CHAINS = None
3
+
4
+
5
def get_imgt_chain(gene_name):
    """Look up the germline Chain for a full V or J gene allele name (e.g. ``IGHV1-18*01``)

    :param gene_name: Gene name starting with ``IGH``, ``IGK`` or ``IGL``, followed by ``V`` or ``J``
    :return: Chain stored under exactly that name
    :raises ValueError: for an unknown locus prefix, a segment other than V/J, an incomplete name
                        (the message lists the matching full names) or a name that is not found
    """
    for locus_prefix, locus_chain_type in (('IGH', 'H'), ('IGK', 'K'), ('IGL', 'L')):
        if gene_name.startswith(locus_prefix):
            chain_type = locus_chain_type
            break
    else:
        raise ValueError(f'Gene name should start with IG(H/K/L), got: {gene_name}')

    if gene_name.startswith(f'IG{chain_type}V'):
        chains = get_imgt_v_chains(chain_type)
    elif gene_name.startswith(f'IG{chain_type}J'):
        chains = get_imgt_j_chains(chain_type)
    else:
        raise ValueError(f'Expected V or J gene name, got: {gene_name}')

    if gene_name in chains:
        return chains[gene_name]

    # Not an exact match: help the caller by listing the full names that extend the given prefix
    suffixes = [chain_name for chain_name in chains if chain_name.startswith(gene_name)]
    if suffixes:
        raise ValueError(f'Gene name "{gene_name}" not complete, use one of: {suffixes}')
    print('Available gene names:', chains.keys())
    raise ValueError(f'Gene name "{gene_name}" not found')
30
+
31
+
32
def get_imgt_v_chains(chain_type=None):
    """Return the dictionary of V germline name -> Chain for the given chain type

    The Chain objects are built from ``HUMAN_IMGT_IG_V`` and kept in the module-level
    ``_HUMAN_IMGT_V_CHAINS`` cache; the cache is (re)built whenever it is missing or does not
    contain ``chain_type``.

    :param chain_type: Chain type key of ``HUMAN_IMGT_IG_V``
    :return: Dictionary of germline name -> Chain
    """
    global _HUMAN_IMGT_V_CHAINS
    cache_has_type = _HUMAN_IMGT_V_CHAINS is not None and chain_type in _HUMAN_IMGT_V_CHAINS
    if not cache_has_type:
        _HUMAN_IMGT_V_CHAINS = {}
        for t, germlines in HUMAN_IMGT_IG_V.items():
            positions = germlines['positions']
            seqs = germlines['aligned_sequences']
            chains_by_name = {}
            for name, seq in seqs.items():
                chains_by_name[name] = germline_to_chain(positions, seq, name=name, chain_type=t)
            _HUMAN_IMGT_V_CHAINS[t] = chains_by_name
    return _HUMAN_IMGT_V_CHAINS[chain_type]
41
+
42
+
43
def get_imgt_j_chains(chain_type=None):
    """Return the dictionary of J germline name -> Chain for the given chain type

    The Chain objects are built from ``HUMAN_IMGT_IG_J`` and kept in the module-level
    ``_HUMAN_IMGT_J_CHAINS`` cache; the cache is (re)built whenever it is missing or does not
    contain ``chain_type``.

    :param chain_type: Chain type key of ``HUMAN_IMGT_IG_J``
    :return: Dictionary of germline name -> Chain
    """
    global _HUMAN_IMGT_J_CHAINS
    cache_has_type = _HUMAN_IMGT_J_CHAINS is not None and chain_type in _HUMAN_IMGT_J_CHAINS
    if not cache_has_type:
        _HUMAN_IMGT_J_CHAINS = {}
        for t, germlines in HUMAN_IMGT_IG_J.items():
            positions = germlines['positions']
            seqs = germlines['aligned_sequences']
            chains_by_name = {}
            for name, seq in seqs.items():
                chains_by_name[name] = germline_to_chain(positions, seq, name=name, chain_type=t)
            _HUMAN_IMGT_J_CHAINS[t] = chains_by_name
    return _HUMAN_IMGT_J_CHAINS[chain_type]
52
+
53
+
54
def germline_to_chain(positions, seq, chain_type, **kwargs):
    """Build an IMGT-numbered Chain from parallel position names and an aligned germline sequence

    :param positions: Position strings (e.g. ``"H1"``), paired one-to-one with the characters of ``seq``
    :param seq: Aligned germline sequence; every character is stored as-is under its position
                (NOTE(review): this includes ``'-'`` gap characters - confirm that Chain handles them)
    :param chain_type: Chain type used for the positions and the resulting Chain
    :param kwargs: Extra keyword arguments forwarded to Chain (e.g. ``name``)
    :return: Chain with ``scheme='imgt'`` and an empty tail
    """
    # Imported here rather than at module level (presumably to avoid a circular import - verify)
    from abnumber.chain import Chain, Position

    aa_dict = {}
    for pos, aa in zip(positions, seq):
        aa_dict[Position.from_string(pos, chain_type=chain_type, scheme='imgt')] = aa
    return Chain(sequence=None, scheme='imgt', chain_type=chain_type, tail='', aa_dict=aa_dict, **kwargs)
59
+
60
+
61
def get_germline_v_families(chain_type):
    """Return sorted unique V family names: the part of each germline name before the first ``-`` and ``/``"""
    aligned = HUMAN_IMGT_IG_V[chain_type]['aligned_sequences']
    families = {name.split('-')[0].split('/')[0] for name in aligned.keys()}
    return sorted(families)
64
+
65
+
66
def get_germline_v_genes(chain_type):
    """Return sorted unique V gene names: the part of each germline name before the allele suffix (``*``)"""
    aligned = HUMAN_IMGT_IG_V[chain_type]['aligned_sequences']
    genes = {name.split('*')[0] for name in aligned.keys()}
    return sorted(genes)
69
+
70
+
71
+ HUMAN_IMGT_IG_V = {
72
+ 'H': {
73
+ "positions": [
74
+ "H1", "H2", "H3", "H4", "H5", "H6", "H7", "H8", "H9", "H11", "H12", "H13", "H14", "H15", "H16", "H17", "H18", "H19", "H20", "H21", "H22", "H23", "H24", "H25", "H26", "H27", "H28", "H29", "H30", "H31", "H34", "H35", "H36", "H37", "H38", "H39", "H40", "H41", "H42", "H43", "H44", "H45", "H46", "H47", "H48", "H49", "H50", "H51", "H52", "H53", "H54", "H55", "H56", "H57", "H58", "H59", "H60", "H61", "H62", "H63", "H64", "H65", "H66", "H67", "H68", "H69", "H70", "H71", "H72", "H74", "H75", "H76", "H77", "H78", "H79", "H80", "H81", "H82", "H83", "H84", "H85", "H86", "H87", "H88", "H89", "H90", "H91", "H92", "H93", "H94", "H95", "H96", "H97", "H98", "H99", "H100", "H101", "H102", "H103", "H104", "H105", "H106", "H107"
75
+ ],
76
+ "aligned_sequences": {
77
+ "IGHV1-18*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYGISWVRQAPGQGLEWMGWISAY--NGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCAR-",
78
+ "IGHV1-18*03": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYGISWVRQAPGQGLEWMGWISAY--NGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDMAVYYCAR-",
79
+ "IGHV1-18*04": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYGISWVRQAPGQGLEWMGWISAY--NGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCAR-",
80
+ "IGHV1-2*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGRINPN--SGGTNYAQKFQGRVTSTRDTSISTAYMELSRLRSDDTVVYYCAR-",
81
+ "IGHV1-2*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGWINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
82
+ "IGHV1-2*03": "QVQLVQSGAEVKKLGASVKVSCKASGYTF--TGYYMHWVXQAPGQGLEWMGWINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
83
+ "IGHV1-2*04": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGWINPN--SGGTNYAQKFQGWVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
84
+ "IGHV1-2*05": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTVVYYCAR-",
85
+ "IGHV1-2*06": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
86
+ "IGHV1-2*07": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGWINPN--SGGTNYAHKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
87
+ "IGHV1-24*01": "QVQLVQSGAEVKKPGASVKVSCKVSGYTL--TELSMHWVRQAPGKGLEWMGGFDPE--DGETIYAQKFQGRVTMTEDTSTDTAYMELSSLRSEDTAVYYCAT-",
88
+ "IGHV1-3*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYAMHWVRQAPGQRLEWMGWINAG--NGNTKYSQKFQGRVTITRDTSASTAYMELSSLRSEDTAVYYCAR-",
89
+ "IGHV1-3*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYAMHWVRQAPGQRLEWMGWSNAG--NGNTKYSQEFQGRVTITRDTSASTAYMELSSLRSEDMAVYYCAR-",
90
+ "IGHV1-3*03": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYAMHWVRQAPGQRLEWMGWINAG--NGNTKYSQEFQGRVTITRDTSASTAYMELSSLRSEDMAVYYCAR-",
91
+ "IGHV1-3*05": "QVQLVQSGAEEKKPGASVKVSCKASGYTF--TSYAMHWVRQAPGQRLEWMGWINAG--NGNTKYSQKFQGRVTITRDTSASTAYMELSSLRSEDTAVYYCAR-",
92
+ "IGHV1-38-4*01": "QVQLVQSWAEVRKSGASVKVSCSFSGFTI--TSYGIHWVQQSPGQGLEWMGWINPG--NGSPSYAKKFQGRFTMTRDMSTTTAYTDLSSLTSEDMAVYYYAR-",
93
+ "IGHV1-45*01": "QMQLVQSGAEVKKTGSSVKVSCKASGYTF--TYRYLHWVRQAPGQALEWMGWITPF--NGNTNYAQKFQDRVTITRDRSMSTAYMELSSLRSEDTAMYYCAR-",
94
+ "IGHV1-45*02": "QMQLVQSGAEVKKTGSSVKVSCKASGYTF--TYRYLHWVRQAPGQALEWMGWITPF--NGNTNYAQKFQDRVTITRDRSMSTAYMELSSLRSEDTAMYYCAR-",
95
+ "IGHV1-45*03": "QMQLVQSGAEVKKTGSSVKVSCKASGYTF--TYRYLHWVRQAPRQALEWMGWITPF--NGNTNYAQKFQDRVTITRDRSMSTAYMELSSLRSEDTAMYYCAR-",
96
+ "IGHV1-46*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYYMHWVRQAPGQGLEWMGIINPS--GGSTSYAQKFQGRVTMTRDTSTSTVYMELSSLRSEDTAVYYCAR-",
97
+ "IGHV1-46*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--NSYYMHWVRQAPGQGLEWMGIINPS--GGSTSYAQKFQGRVTMTRDTSTSTVYMELSSLRSEDTAVYYCAR-",
98
+ "IGHV1-46*03": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYYMHWVRQAPGQGLEWMGIINPS--GGSTSYAQKFQGRVTMTRDTSTSTVYMELSSLRSEDTAVYYCAR-",
99
+ "IGHV1-46*04": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYYMHWVRQAPGQGLEWMGIINPS--GGSTSYAQKLQGRVTMTRDTSTSTVYMELSSLRSEDTAVYYCAR-",
100
+ "IGHV1-58*01": "QMQLVQSGPEVKKPGTSVKVSCKASGFTF--TSSAVQWVRQARGQRLEWIGWIVVG--SGNTNYAQKFQERVTITRDMSTSTAYMELSSLRSEDTAVYYCAA-",
101
+ "IGHV1-58*02": "QMQLVQSGPEVKKPGTSVKVSCKASGFTF--TSSAMQWVRQARGQRLEWIGWIVVG--SGNTNYAQKFQERVTITRDMSTSTAYMELSSLRSEDTAVYYCAA-",
102
+ "IGHV1-68*01": "QVQLGQSEAEVKKPGASVKVSCKASGYTF--TCCSLHWLQQAPGQGLERMRWITLY--NGNTNYAKKFQGRVTITRDMSLRTAYIELSSLRSEDSAVYYWAR-",
103
+ "IGHV1-68*02": "QVQLGQSEAEVKKPGASVKVSCKASGYTF--TYCSLHWLQQAPGQGLERMRWITLY--NGNINYAKKFQSRVTITRDMSLRTAYIELSSLRSEDSAVYYWAR-",
104
+ "IGHV1-69*01": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
105
+ "IGHV1-69*02": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYTISWVRQAPGQGLEWMGRIIPI--LGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
106
+ "IGHV1-69*04": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGRIIPI--LGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
107
+ "IGHV1-69*05": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITTDESTSTAYMELSSLRSEDTAVYYCAR-",
108
+ "IGHV1-69*06": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
109
+ "IGHV1-69*08": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYTISWVRQAPGQGLEWMGRIIPI--LGTANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
110
+ "IGHV1-69*09": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGRIIPI--LGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
111
+ "IGHV1-69*10": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--LGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
112
+ "IGHV1-69*11": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGRIIPI--LGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
113
+ "IGHV1-69*12": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
114
+ "IGHV1-69*13": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
115
+ "IGHV1-69*14": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
116
+ "IGHV1-69*15": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGRIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
117
+ "IGHV1-69*16": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYTISWVRQAPGQGLEWMGGIIPI--LGTANYAQKFQGRVTITTDESTSTAYMELSSLRSEDTAVYYCAR-",
118
+ "IGHV1-69*17": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
119
+ "IGHV1-69*19": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
120
+ "IGHV1-69-2*01": "EVQLVQSGAEVKKPGATVKISCKVSGYTF--TDYYMHWVQQAPGKGLEWMGLVDPE--DGETIYAEKFQGRVTITADTSTDTAYMELSSLRSEDTAVYYCAT-",
121
+ "IGHV1-69D*01": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
122
+ "IGHV1-8*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYDINWVRQATGQGLEWMGWMNPN--SGNTGYAQKFQGRVTMTRNTSISTAYMELSSLRSEDTAVYYCAR-",
123
+ "IGHV1-8*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYDINWVRQATGQGLEWMGWMNPN--SGNTGYAQKFQGRVTMTRNTSISTAYMELSSLRSEDTAVYYCAR-",
124
+ "IGHV1-8*03": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYDINWVRQATGQGLEWMGWMNPN--SGNTGYAQKFQGRVTITRNTSISTAYMELSSLRSEDTAVYYCAR-",
125
+ "IGHV1-NL1*01": "QVQLLQPGVQVKKPGSSVKVSC-ASRYTF--TKYFTRWV-QSPGQGHXWMG-INPY--NDNTHYAQTFWGRVTITSDRSMSTAYMELSXLRSEDMVVYYCVR-",
126
+ "IGHV1/OR15-1*01": "QVQLVQSGAEVKKPGASVKVSCKASGYIF--TDYYMHWVRQAPGQELGWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYTELSSLRSEDTATYYCAR-",
127
+ "IGHV1/OR15-1*02": "QVQLVQSGAEVKKPGASVKVSCKASGYIF--TDYYMHWVRQAPGQELGWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTACTELSSLRSEDTATYYCAR-",
128
+ "IGHV1/OR15-1*03": "QVQLVQSGAEVKKPGASVKVSCKASGYIF--TDYYMHWVRQAPGQELGWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYTELSSLRSEDTATYYCAR-",
129
+ "IGHV1/OR15-1*04": "QVQLVQSGAEVKKPGASVKVSCKASGYIF--TDYYMHWVRQAPGQELGWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSSLRSEDTATYYCAR-",
130
+ "IGHV1/OR15-2*01": "QVQLVQSGAEVKKPRASVKVSCKASGYTF--TSYYMHWV-QAPEQGLEWMGWINTY--NGNTNYPQKLQGRVTMTRDTSTSTAYMELSRLRSDDMAVYYCAR-",
131
+ "IGHV1/OR15-2*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYYMHWV-QAPEQGLEWMGWINTY--NGNTNYPQKLQGRVTMTRDTSTSTAYMELSSLRSDDMAVYYCAR-",
132
+ "IGHV1/OR15-2*03": "QVQLVQSGAEVKKPRASVKVSCKASGYTF--TSYYMHWV-QAPEQGLEWMGWINTY--NGNTNYPQKLQGRVTMTRDTSTSTAYMELSSLRSDDMAVYYCAR-",
133
+ "IGHV1/OR15-3*01": "QVQLV-SGAEVKKPGASVKVSCKASGYTF--TDYFMNWMRQAPGQRLEWMGWINAG--NGNTKYSQKLQGRVTITRDTSSSTAYMQLSSLRSEDTAVYYCAR-",
134
+ "IGHV1/OR15-3*02": "QVQLV-SGAEVKKPGASVKVSCKASGYTF--TDYFMNWMRQAPGQRLEWMGWINAG--NGNTKYSQKLQGRVTITRDTSASTAYMQLSSLRSEDTAVYYCAR-",
135
+ "IGHV1/OR15-3*03": "QVQLV-SGAEVKKPGASVKVSCKASGYTF--TSYYMNWMRQAPGQGFEWMGWINAG--NGNTKYSQKLQGRVTITRDTSASTAYMQLSSLRSEDTAVYYCAR-",
136
+ "IGHV1/OR15-4*01": "QDQLVQSGAEVKKPLSSVKVSFKASGYTF--TNNFMHWV-QAPGQGLEWMGWINAG--NGNTTYAQKFQGRVTITRDTSMSTAYTELSSLRSEDTAVYYCAR-",
137
+ "IGHV1/OR15-5*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TNYCMHWVRQVHAQGLEWMGLVCPS--DGSTSYAQKFQARVTITRDTSMSTAYMELSSLRSEDTAMYYCVR-",
138
+ "IGHV1/OR15-9*01": "QVQLMQSGAEVKKPGASVRISCKASGYTF--TSYCMHWVCQAHAQGLEWMGLVCPS--DGSTSYAQKFQGRVTITRDTSMGTAYMELSSLRSEDTAMYYCVR-",
139
+ "IGHV1/OR21-1*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTI--TSYCMHWVHQVHAQGLEWMGLVCPS--DGSTSYAQKFQARVTITRDTSMSTAYMELSSLRSEDTAMYYCVR-",
140
+ "IGHV2-10*01": "QVTLKESGPALVKPTQTLMLTCTFSGFSLSTSGMGVG-ICQPSAKALEWLAHIY-N---DNKYYSPSLKSRLIISKDTSKNEVVLTVINMDIVDTATHYCARR",
141
+ "IGHV2-26*01": "QVTLKESGPVLVKPTETLTLTCTVSGFSLSNARMGVSWIRQPPGKALEWLAHIFSN---DEKSYSTSLKSRLTISKDTSKSQVVLTMTNMDPVDTATYYCARI",
142
+ "IGHV2-26*02": "QVTLKESGPVLVKPTETLTLTCTVSGFSLSNARMGVSWIRQPPGKALEWLAHIFSN---DEKSYSTSLKSRLTISKDTSKSQVVLTMTNMDPVDTATYYCARI",
143
+ "IGHV2-26*03": "QVTLKESGPVLVKPTETLTLTCTISGFSLSNARMGVSWIRQPPGKALEWLAHIFSN---DEKSYSTSLKSRLTISKDTSKSQVVLTMTNMDPVDTATYYCARI",
144
+ "IGHV2-5*01": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWN---DDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
145
+ "IGHV2-5*02": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWD---DDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
146
+ "IGHV2-5*05": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWD---DDKRYGPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
147
+ "IGHV2-5*06": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWD---DDKRYGPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
148
+ "IGHV2-5*08": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWLALIYWD---DDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
149
+ "IGHV2-5*09": "QVTLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWD---DDKRYGPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
150
+ "IGHV2-70*01": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLALIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
151
+ "IGHV2-70*04": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
152
+ "IGHV2-70*10": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWIARIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
153
+ "IGHV2-70*11": "RVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLARIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
154
+ "IGHV2-70*12": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLALIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCAHR",
155
+ "IGHV2-70*13": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLALIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
156
+ "IGHV2-70*15": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLARIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
157
+ "IGHV2-70*16": "QVTLKESGPVLVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
158
+ "IGHV2-70*17": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
159
+ "IGHV2-70*18": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSEMCVSWVRQPPGKALEWLALIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
160
+ "IGHV2-70*19": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWVRQPPGKALEWLALIDWD---DDKHYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
161
+ "IGHV2-70D*04": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
162
+ "IGHV2-70D*14": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
163
+ "IGHV2/OR16-5*01": "QVTLKESGPALVKPTETLTLTCTLSGFSLSTSGMGMSWIRQPPGKALEWLAHIFLN---DKKSYSTSLKNRLIISKDTSKSQVVLTMTNMDPVDTATYYCAWR",
164
+ "IGHV3-11*01": "QVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--GSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
165
+ "IGHV3-11*03": "QVQLLESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--SSYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
166
+ "IGHV3-11*04": "QVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--GSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
167
+ "IGHV3-11*05": "QVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--SSYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
168
+ "IGHV3-11*06": "QVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--SSYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
169
+ "IGHV3-13*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYDMHWVRQATGKGLEWVSAIGTA---GDTYYPGSVKGRFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
170
+ "IGHV3-13*02": "EVHLVESGGGLVQPGGALRLSCAASGFTF--SNYDMHWVRQATGKGLEWVSANGTA---GDTYYPGSVKGRFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
171
+ "IGHV3-13*03": "EVQLVESGGGLVQPGGSLRLSCAACGFTF--SSYDMHWVRQATGKGLEWVSAIGTA---GDTYYPGSVKGQFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
172
+ "IGHV3-13*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYDMHWVRQATGKGLEWVSAIGTA---GDTYYPGSVKGRFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
173
+ "IGHV3-13*05": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYDMHWVRQATGKGLEWVSAIGTA---GDPYYPGSVKGRFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
174
+ "IGHV3-15*01": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
175
+ "IGHV3-15*02": "EVQLVESGGALVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
176
+ "IGHV3-15*03": "EVQLVESAGALVQPGGSLRLSCAASGFTC--SNAWMSWVRQAPGKGLEWVGRIKSKANGGTTDYAAPVKGRFTISRVDSKNTLYLQMNSLKTEDTAVYYCTT-",
177
+ "IGHV3-15*04": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIESKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
178
+ "IGHV3-15*05": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
179
+ "IGHV3-15*06": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTNYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
180
+ "IGHV3-15*07": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMNWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
181
+ "IGHV3-15*08": "EVQLVESAGGLVQPGGSLRLSCAASGFTC--SNAWMSWVRQAPGKGLEWVGCIKSKANGGTTDYAAPVKGRFTISRDDSKNTLYLQMISLKTEDTAVYYCTT-",
182
+ "IGHV3-16*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNSDMNWARKAPGKGLEWVSGVSWN--GSRTHYVDSVKRRFIISRDNSRNSLYLQKNRRRAEDMAVYYCVR-",
183
+ "IGHV3-16*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNSDMNWARKAPGKGLEWVSGVSWN--GSRTHYVDSVKRRFIISRDNSRNSLYLQKNRRRAEDMAVYYCVR-",
184
+ "IGHV3-19*01": "TVQLVESGGGLVEPGGSLRLSCAASGFTF--SNSDMNWVRQAPGKGLEWVSGVSWN--GSRTHYADSVKGRFIISRDNSRNFLYQQMNSLRPEDMAVYYCVR-",
185
+ "IGHV3-20*01": "EVQLVESGGGVVRPGGSLRLSCAASGFTF--DDYGMSWVRQAPGKGLEWVSGINWN--GGSTGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYHCAR-",
186
+ "IGHV3-20*02": "EVQLVESGGGVVRPGGSLRLSFAASGFTF--DDYGMSWVRQAPGKGLEWVSGINWN--GGSTGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYHCAR-",
187
+ "IGHV3-20*03": "EVQLVESGGGVVRPGGSLRLSFAASGFTF--DDYGMSWVRQAPGKGLEWVSGINWN--GGSTGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYCAR-",
188
+ "IGHV3-20*04": "EVQLVESGGGVVRPGGSLRLSCAASGFTF--DDYGMSWVRQAPGKGLEWVSGINWN--GGSTGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYCAR-",
189
+ "IGHV3-21*01": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
190
+ "IGHV3-21*02": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
191
+ "IGHV3-21*03": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
192
+ "IGHV3-21*04": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
193
+ "IGHV3-21*05": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSYISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
194
+ "IGHV3-21*06": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
195
+ "IGHV3-22*01": "EVHLVESGGALVQPGGSLRLSCAASGFTF--SYYYMSGVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMKSLKTEDTAVYYCSR-",
196
+ "IGHV3-22*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SYYYMSGVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMKSLKTEDTAVYYCSR-",
197
+ "IGHV3-23*01": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSAISGS--GGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
198
+ "IGHV3-23*02": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSAISGS--GGSTYYGDSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
199
+ "IGHV3-23*03": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSVIYSG--GSSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
200
+ "IGHV3-23*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSAISGS--GGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
201
+ "IGHV3-23D*01": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSAISGS--GGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
202
+ "IGHV3-25*01": "EMQLVESGGGLQKPAWSPRLSCAASQFTF--SSYYMNCVRQAPGNGLELV-QVNPN--GGSTYLIDSGKDRFNTSRDNAKNTLHLQMNSLKTEDTALY-CTR-",
203
+ "IGHV3-25*02": "EMQLVESGGGLAKPAWSPRLSCAASQFTF--SSYYMNCVRQAPGNGLELV-QVNPN--GGSTYLIDSGKDRFNTSRDNAKNTLHLQMNSLKTEDTALY-CTR-",
204
+ "IGHV3-25*04": "ETQLVESGGGLAKPGRSPRLSCAASQFTF--SSYYMNCVRQAPGNGLELVGQVNPN--GGSTYLIDSGKDRFNTSRDNAKNTLHLQMNSLKTEDTALYYCTR-",
205
+ "IGHV3-25*05": "EMQLVESGGGLAKPAWSPRLSCAASQFTF--SSYYMNCVRQAPGNGLELVGQVNPN--GGSTYLIDSGKDRFNTSRDNAKNTLHLQMNSLKTEDTALY-CTR-",
206
+ "IGHV3-29*01": "EVELIEPTEDLRQPGKFLRLSCVASRFAF--SSF-MSPVHQSAGKGLE-VIDIKDD--GSQIHHADSVKGRFSISKDNAKNSLYLQMNSQRTEDMAVYGCT-G",
207
+ "IGHV3-30*01": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
208
+ "IGHV3-30*02": "QVQLVESGGGVVQPGGSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAFIRYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
209
+ "IGHV3-30*03": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
210
+ "IGHV3-30*04": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
211
+ "IGHV3-30*05": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEGTAVYYCAR-",
212
+ "IGHV3-30*06": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
213
+ "IGHV3-30*07": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
214
+ "IGHV3-30*08": "QVQLVDSGGGVVQPGRSLRLSCAASAFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
215
+ "IGHV3-30*09": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFAISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
216
+ "IGHV3-30*10": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYTDSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
217
+ "IGHV3-30*11": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
218
+ "IGHV3-30*12": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
219
+ "IGHV3-30*13": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNRLYLQMNSLRAEDTAVYYCAR-",
220
+ "IGHV3-30*14": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
221
+ "IGHV3-30*15": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYCAR-",
222
+ "IGHV3-30*16": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
223
+ "IGHV3-30*17": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
224
+ "IGHV3-30*18": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
225
+ "IGHV3-30*19": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
226
+ "IGHV3-30-2*01": "EVQLVESGEDPRQPGGSLRLSCADSGLTF--SSY-RNSVSQAPGKGLE-VVDIQCD--GSQICYA-SLKSKFTISKENAKNSLYLLMNSLRAAGTAVCYCM-G",
227
+ "IGHV3-30-22*01": "EVELIESIEDLRQPGKFLRLSCVASRFAF--SSF-MSRVHQSPGKGLE-VIDIKDD--GSQIHHADSVKGRFSISKDNAKNSLYLQMNSQRAEDMDVYGCT-G",
228
+ "IGHV3-30-3*01": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
229
+ "IGHV3-30-3*02": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
230
+ "IGHV3-30-3*03": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
231
+ "IGHV3-30-33*01": "EVQLVESGEDPRQPGGSLRLSCADSGLTF--SSY-RSSVSQAPGKGLE-VVDIQCD--GSQICYA-SLKSKFTISKENAKNSLYLLMNSLRAEGTAVCYCM--",
232
+ "IGHV3-30-42*01": "EVELIEPTEDLRQPGKFLRLSCVASRFAF--SSF-MSPVHQSAGKGLE-VIDIKDD--GSQIHHADSVKGRFSISKDNAKNSLYLQMNSQRTEDMAVYGCT-G",
233
+ "IGHV3-30-5*01": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
234
+ "IGHV3-30-5*02": "QVQLVESGGGVVQPGGSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAFIRYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
235
+ "IGHV3-30-52*01": "EVQLVESGEDPRQPGGSLRLSCADSGLTF--SSY-RNSVSQAPGKGLE-VVDIQCD--GSQICYA-SLKSKFTISKENAKNSLYLLMNSLRAAGTAVCYCM--",
236
+ "IGHV3-32*01": "EVELIESIEDLRQPGKFLRLSCVASRFAF--SSF-MSRVHQSPGKGLE-VIDIKDD--GSQIHHADSVKGRFSISKDNAKNSLYLQMNTQRAEDVAVYGYT-G",
237
+ "IGHV3-33*01": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
238
+ "IGHV3-33*02": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSAKGRFTISRDNSTNTLFLQMNSLRAEDTAVYYCAR-",
239
+ "IGHV3-33*03": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
240
+ "IGHV3-33*04": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
241
+ "IGHV3-33*05": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
242
+ "IGHV3-33*06": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
243
+ "IGHV3-33*07": "QVQLVESGGRVVQPGRSLRLSCAASGFTF--SRYGMYWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
244
+ "IGHV3-33-2*01": "EVQLVESGEDPRQPGGSLRLSCADSGLTF--SSY-MSSVSQAPGKGLE-VVDIQCD--GSQICYAQSVKSKFTISKENAKNSLYLQMNSLRAEGTAVCYCM-G",
245
+ "IGHV3-35*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNSDMNWVHQAPGKGLEWVSGVSWN--GSRTHYADSVKGRFIISRDNSRNTLYLQTNSLRAEDTAVYYCVR-",
246
+ "IGHV3-38*01": "EVQLVESGGGLVQPRGSLRLSCAASGFTV--SSNEMSWIRQAPGKGLEWVSSISG----GSTYYADSRKGRFTISRDNSKNTLYLQMNNLRAEGTAAYYCARY",
247
+ "IGHV3-38*02": "EVQLVESGGGLVQPRGSLRLSCAASGFTV--SSNEMSWIRQAPGKGLEWVSSISG----GSTYYADSRKGRFTISRDNSKNTLYLQMNNLRAEGTAVYYCARY",
248
+ "IGHV3-38*03": "EVQLVESGGGLVQPRGSLRLSCAASGFTV--SSNEMSWIRQAPGKGLEWVSSISG----GSTYYADSRKGRFTISRDNSKNTLYLQMNNLRAEGTAVYYCARY",
249
+ "IGHV3-38-3*01": "EVQLVESRGVLVQPGGSLRLSCAASGFTV--SSNEMSWVRQAPGKGLEWVSSISG----GSTYYADSRKGRFTISRDNSKNTLHLQMNSLRAEDTAVYYCKK-",
250
+ "IGHV3-41*02": "EVQLVESGGGLVQPGGSLRLSCAASGFSF--SSYGMSWVRQAPGKGLD-VAHIWND--GSQKYYADSVKGRFTISRDNSKSMLYLQMDSLKAKDTAMYYCTR-",
251
+ "IGHV3-43*01": "EVQLVESGGVVVQPGGSLRLSCAASGFTF--DDYTMHWVRQAPGKGLEWVSLISWD--GGSTYYADSVKGRFTISRDNSKNSLYLQMNSLRTEDTALYYCAKD",
252
+ "IGHV3-43*02": "EVQLVESGGGVVQPGGSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSLISGD--GGSTYYADSVKGRFTISRDNSKNSLYLQMNSLRTEDTALYYCAKD",
253
+ "IGHV3-43D*03": "EVQLVESGGVVVQPGGSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSLISWD--GGSTYYADSVKGRFTISRDNSKNSLYLQMNSLRAEDTALYYCAKD",
254
+ "IGHV3-43D*04": "EVQLVESGGVVVQPGGSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSLISWD--GGSTYYADSVKGRFTISRDNSKNSLYLQMNSLRAEDTALYYCAKD",
255
+ "IGHV3-47*01": "EDQLVESGGGLVQPGGSLRPSCAASGFAF--SSYALHWVRRAPGKGLEWVSAIGTG---GDTYYADSVMGRFTISRDNAKKSLYLHMNSLIAEDMAVYYCAR-",
256
+ "IGHV3-47*02": "EDQLVESGGGLVQPGGSLRPSCAASGFAF--SSYVLHWVRRAPGKGPEWVSAIGTG---GDTYYADSVMGRFTISRDNAKKSLYLQMNSLIAEDMAVYYCAR-",
257
+ "IGHV3-48*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSYISSS--SSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
258
+ "IGHV3-48*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSYISSS--SSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRDEDTAVYYCAR-",
259
+ "IGHV3-48*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYEMNWVRQAPGKGLEWVSYISSS--GSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
260
+ "IGHV3-48*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSYISSS--SSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
261
+ "IGHV3-49*01": "EVQLVESGGGLVQPGRSLRLSCTASGFTF--GDYAMSWFRQAPGKGLEWVGFIRSKAYGGTTEYTASVKGRFTISRDGSKSIAYLQMNSLKTEDTAVYYCTR-",
262
+ "IGHV3-49*02": "EVQLVESGGGLVQPGPSLRLSCTASGFTF--GYYPMSWVRQAPGKGLEWVGFIRSKAYGGTTEYAASVKGRFTISRDDSKSIAYLQMNSLKTEDTAVYYCTR-",
263
+ "IGHV3-49*03": "EVQLVESGGGLVQPGRSLRLSCTASGFTF--GDYAMSWFRQAPGKGLEWVGFIRSKAYGGTTEYAASVKGRFTISRDDSKSIAYLQMNSLKTEDTAVYYCTR-",
264
+ "IGHV3-49*04": "EVQLVESGGGLVQPGRSLRLSCTASGFTF--GDYAMSWVRQAPGKGLEWVGFIRSKAYGGTTEYAASVKGRFTISRDDSKSIAYLQMNSLKTEDTAVYYCTR-",
265
+ "IGHV3-49*05": "EVQLVESGGGLVKPGRSLRLSCTASGFTF--GDYAMSWFRQAPGKGLEWVGFIRSKAYGGTTEYAASVKGRFTISRDDSKSIAYLQMNSLKTEDTAVYYCTR-",
266
+ "IGHV3-52*01": "EVQLVESG-GLVQPGGSLRLSCAASGFTF--SSSWMHWVCQAPEKGLEWVADIKCD--GSEKYYVDSVKGRLTISRDNAKNSLYLQVNSLRAEDMTVYYCVR-",
267
+ "IGHV3-52*03": "EVQLVESG-GLVQPGGSLRLSCAASGFTF--SSSWMHWVCQAPEKGLEWVADIKCD--GSEKYYVDSVKGRLTISRDNAKNSLYLQVNSLRAEDMTVYYCVR-",
268
+ "IGHV3-53*01": "EVQLVESGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
269
+ "IGHV3-53*02": "EVQLVETGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
270
+ "IGHV3-53*03": "EVQLVESGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQPPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
271
+ "IGHV3-53*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRHNSKNTLYLQMNSLRAEDTAVYYCAR-",
272
+ "IGHV3-53*05": "EVQLVETGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
273
+ "IGHV3-54*01": "EVQLVESEENQRQLGGSLRLSCADSGLTF--SSY-MSSDSQAPGKGLE-VVDI--D--RSQLCYAQSVKSRFTISKENAKNSLCLQMNSLRAEGTAVYYCM--",
274
+ "IGHV3-54*02": "EVQLVESEENQRQLGGSLRLSCADSGLTF--SSY-MSSDSQAPGKGLE-VVDI-YD--RSQICYAQSVKSRFTISKENAKNSLRLQMNSLRAEGTAVYYCM--",
275
+ "IGHV3-54*04": "EVQLVESEENQRQLGGSLRLSCADSGLTF--SSY-MSSDSQAPGKGLE-VVDI--D--RSQLCYAQSVKSRFTISKENAKNSLCLQMNSLRAEGTAVYYCM--",
276
+ "IGHV3-62*01": "EVQLVESGEGLVQPGGSLRLSCAASGFTF--SSSAMHWVRQAPRKGL-WVSVISTS--GDTVLYTDSVKGRFTISRDNAQNSLSLQMNSLRAEGTVVYYCVK-",
277
+ "IGHV3-62*03": "EVQLVESGEGLVQPGGSLRLSCAASGFTF--SSSAMHWVRQAPRKGL-WVSVISTS--GDTVLYTDSVKGRFTISRDNAQNSLYLQMNSLRADDMAVYYCVK-",
278
+ "IGHV3-62*04": "EVQLVKSGGGLVQPGGSLRLSCAASGFTF--SSSAMHWVRQAPRKGLEWVSVISTS--GDTVLYTDSVKGRFTISRDNAQNSLSLQMNSLRAEDMAVYYCVK-",
279
+ "IGHV3-63*01": "EVELIESIEGLRQLGKFLRLSCVASGFTF--SSY-MSWVNETLGKGLEGVIDVKYD--GSQIYHADSVKGRFTISKDNAKNSPYLQTNSLRAEDMTMHGCT-G",
280
+ "IGHV3-64*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYANSVKGRFTISRDNSKNTLYLQMGSLRAEDMAVYYCAR-",
281
+ "IGHV3-64*02": "EVQLVESGEGLVQPGGSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMGSLRAEDMAVYYCAR-",
282
+ "IGHV3-64*03": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYVQMSSLRAEDTAVYYCVK-",
283
+ "IGHV3-64*04": "QVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
284
+ "IGHV3-64*05": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYVQMSSLRAEDTAVYYCVK-",
285
+ "IGHV3-64*07": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMGSLRAEDMAVYYCAR-",
286
+ "IGHV3-64D*06": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYCVK-",
287
+ "IGHV3-64D*08": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYCVK-",
288
+ "IGHV3-64D*09": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYCVK-",
289
+ "IGHV3-66*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
290
+ "IGHV3-66*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
291
+ "IGHV3-66*03": "EVQLVESGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSC---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
292
+ "IGHV3-66*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
293
+ "IGHV3-69-1*01": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMNWVRQAPGKGLEWVSSISSS---STIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
294
+ "IGHV3-69-1*02": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMNWVRQAPGKGLEWVSSISSS---STIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
295
+ "IGHV3-7*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
296
+ "IGHV3-7*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
297
+ "IGHV3-7*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
298
+ "IGHV3-7*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
299
+ "IGHV3-7*05": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
300
+ "IGHV3-71*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDYYMSWVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMNSLRAEDTAVYYCAR-",
301
+ "IGHV3-71*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDYYMSWVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMNSLRAEDMAVYYCAR-",
302
+ "IGHV3-71*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDYYMSWVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMNSLRAEDTAVYYCAR-",
303
+ "IGHV3-71*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDYYMSWVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMNSLRAEDTAVYYCAR-",
304
+ "IGHV3-72*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDHYMDWVRQAPGKGLEWVGRTRNKANSYTTEYAASVKGRFTISRDDSKNSLYLQMNSLKTEDTAVYYCAR-",
305
+ "IGHV3-73*01": "EVQLVESGGGLVQPGGSLKLSCAASGFTF--SGSAMHWVRQASGKGLEWVGRIRSKANSYATAYAASVKGRFTISRDDSKNTAYLQMNSLKTEDTAVYYCTR-",
306
+ "IGHV3-73*02": "EVQLVESGGGLVQPGGSLKLSCAASGFTF--SGSAMHWVRQASGKGLEWVGRIRSKANSYATAYAASVKGRFTISRDDSKNTAYLQMNSLKTEDTAVYYCTR-",
307
+ "IGHV3-74*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQAPGKGLVWVSRINSD--GSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCAR-",
308
+ "IGHV3-74*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQAPGKGLVWVSRINSD--GSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCAR-",
309
+ "IGHV3-74*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQAPGKGLVWVSRINSD--GSSTTYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCAR-",
310
+ "IGHV3-9*01": "EVQLVESGGGLVQPGRSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSGISWN--SGSIGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYCAKD",
311
+ "IGHV3-9*02": "EVQLVESGGGLVQPGRSLRLSCAASGFTS--DDYAMHWVRQAPGKGLEWVSGISWN--SGSIGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYCAKD",
312
+ "IGHV3-9*03": "EVQLVESGGGLVQPGRSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSGISWN--SGSIGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMALYYCAKD",
313
+ "IGHV3-NL1*01": "QVQLVESGGGVVQPGGSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVSVIYSG--GSSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
314
+ "IGHV3/OR15-7*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDHYMSWVRQAQGKGLELVGLIRNKANSYTTEYAASVKGRLTISREDSKNTMYLQMSNLKTEDLAVYYCAR-",
315
+ "IGHV3/OR15-7*02": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SDHYMSWVRQAQGKGLELVGLIRNKANSYTTEYAASVKGRLTISREDSKNTLYLQMSSLKTEDLAVYYCAR-",
316
+ "IGHV3/OR15-7*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDHYMSWVRQAQGKGLELVGLIRNKANSYTTEYAASVKGRLTISREDSKNTLYLQMSSLKTEDLAVYYCAR-",
317
+ "IGHV3/OR15-7*05": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDHYMSWVRQAQGKGLELVGLIRNKANSYTTEYAASVKGRLTISREDSKNTLYLQMSNLKTEDLAVYYCAR-",
318
+ "IGHV3/OR16-10*01": "EVQLVQSGGGLVHPGGSLRLSCAGSGFTF--SSYAMHWVRQAPGKGLEWVSAIGTG---GGTYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMAVYYCAR-",
319
+ "IGHV3/OR16-10*02": "EVQLVQSGGGLVQPGGSLRLSCAGSGFTF--SSYAMHWVRQAPGKGLEWVSAIGTG---GGTYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMAVYYCAR-",
320
+ "IGHV3/OR16-10*03": "EVQLVESGGGLVQPGGSLRLSCAGSGFTF--SSYAMHWVRQAPGKGLEWVSAIGTG---GGTYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMAVYYCAR-",
321
+ "IGHV3/OR16-12*01": "EVQLVESGRGLAQPGGYLKLSGAASGFTV--GSWYMSWIHQAPGKGLEWVSYISSS--GCSTNYADSVKGRFTISTDNSKNTLYLQMNSLRVEDTAVYYCAR-",
322
+ "IGHV3/OR16-13*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQAPGKGLVWVSRINSD--GSSTSYADSMKGQFTISRDNAKNTLYLQMNSLRAEDMAVYYCTR-",
323
+ "IGHV3/OR16-14*01": "EVQLEESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQSPGKGLV-VSRINSD--GSSTSYADSLKGQFTISRDNAKNTLYLQMNSLRAEDMAVYYCTR-",
324
+ "IGHV3/OR16-15*01": "EVQLVESGGGLVQPGGSLRLSCAASVFTF--SNSDINWVL-APGKGLEWVSGISWN--GGKTHYVDSVKGQFSISRDNSSKSLYLQKNRQRAKDMAVYYCVR-",
325
+ "IGHV3/OR16-15*02": "EVQLVESGGGLVQPGGSLRHSCAASGFTF--SNSDMNWVL-APGKGLEWVSGISWN--GGKTHYVDSVKGQFTISRDNSSKSLYLQKNRQRAKDMAVYYCVR-",
326
+ "IGHV3/OR16-16*01": "EVQLVESGGGLVQPGGSLRHSCAASGFTF--SNSDMNWVL-APGKGLEWVSDISWN--GGKTHYVDSVKGQFTISRDNSSKSLYLQKNRQRAKDMAVYYCVR-",
327
+ "IGHV3/OR16-17*01": "EVQLVESGGGLVQPGGSLRLSCPDSGFTF--SNHYMSWVRQAPGKGLEWISYISGD--SGYTNYADSVKGRFTISRDNANNSPYLQMNSLRAEDTAVYYCVK-",
328
+ "IGHV3/OR16-18*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNSDMNWVL-APGKGLEWVSGISWN--GGKTHYVDSVKGQFTISRDNSSKSLYLQKNRQRAKDMAVYYCVR-",
329
+ "IGHV3/OR16-20*01": "EVQLVQSGGGLVQPGGSLRLSCAGSGFTF--SSYAMHWVRQAPGKGLEWVSAIGTG---GGTYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMAVYYCAR-",
330
+ "IGHV3/OR16-6*02": "EVQLVESAGGLGTAWGSLRLSCAASGFTC--SNAWMSWVRQAPGKGLEWVGCIKSKANGGTTDYAAPVKGRFTISRDDSKNTLYLQMISLKTEDTAVYYCTT-",
331
+ "IGHV3/OR16-8*01": "EVQLVESGGGLVQPGGSLRLSCPASGFTF--SNHYMSWVRQAPGKGLEWVSYISGD--SGYTNYADSVKGRFTISRDNANNSPYLQMNSLRAEDTAVYYCVK-",
332
+ "IGHV3/OR16-8*02": "EVQLVESGGGLVQPGGSLRLSCPDSGFTF--SNHYMSWVRQAPGKGLEWISYISGD--SGYTNYADSVKGRFTISRDNANNSPYLQMNSLRAEDTAVYYCVK-",
333
+ "IGHV3/OR16-9*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNHYTSWVRQAPGKGLEWVSYSSGN--SGYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCVK-",
334
+ "IGHV4-28*01": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
335
+ "IGHV4-28*02": "QVQLQESGPGLVKPSQTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSIYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
336
+ "IGHV4-28*03": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
337
+ "IGHV4-28*04": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTGVYYCAR-",
338
+ "IGHV4-28*05": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSIYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
339
+ "IGHV4-28*06": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTMSVDTSKNQFSLKLSSVTALDTAVYYCAR-",
340
+ "IGHV4-28*07": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
341
+ "IGHV4-30-2*01": "QLQLQESGSGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHS---GSTYYNPSLKSRVTISVDRSKNQFSLKLSSVTAADTAVYYCAR-",
342
+ "IGHV4-30-2*03": "QLQLQESGSGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
343
+ "IGHV4-30-2*05": "QLQLQESGSGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
344
+ "IGHV4-30-2*06": "QLQLQESGSGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQSPGKGLEWIGYIYHS---GSTYYNPSLKSRVTISVDRSKNQFSLKLSSVTAADTAVYYCAR-",
345
+ "IGHV4-30-4*01": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGDYYWSWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
346
+ "IGHV4-30-4*02": "QVQLQESGPGLVKPSDTLSLTCTVSGGSISSGDYYWSWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
347
+ "IGHV4-30-4*07": "QVQLQESGPGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
348
+ "IGHV4-31*01": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYS---GSTYYNPSLKSLVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
349
+ "IGHV4-31*02": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
350
+ "IGHV4-31*03": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
351
+ "IGHV4-31*10": "QVQLQESGPGLLKPSQTLSLTCTVSGGSISSGGYYWSWIRQHPGKGLEWIGCIYYS---GSTYYNPSLKSRVTISVDPSKNQFSLKPSSVTAADTAVDYCAR-",
352
+ "IGHV4-34*01": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
353
+ "IGHV4-34*02": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
354
+ "IGHV4-34*04": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNNNPSLKSRATISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
355
+ "IGHV4-34*05": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWCWIRQPLGKGLEWIGEINHS---GSTNNNPSLKSRATISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
356
+ "IGHV4-34*09": "QVQLQESGPGLVKPSQTLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
357
+ "IGHV4-34*10": "QVQLQESGPGLVKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNYNPSLKSRITMSVDTSKNQFYLKLSSVTAADTAVYYCAR-",
358
+ "IGHV4-34*11": "QVQLQQWGAGLLKPSETLSLTCAVYGGSV--SGYYWSWIRQPPGKGLEWIGYIYYS---GSTNNNPSLKSRATISVDTSKNQFSLNLSSVTAADTAVYCCAR-",
359
+ "IGHV4-34*12": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEIIHS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
360
+ "IGHV4-38-2*01": "QVQLQESGPGLVKPSETLSLTCAVSGYSIS-SGYYWGWIRQPPGKGLEWIGSIYHS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
361
+ "IGHV4-38-2*02": "QVQLQESGPGLVKPSETLSLTCTVSGYSIS-SGYYWGWIRQPPGKGLEWIGSIYHS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
362
+ "IGHV4-39*01": "QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
363
+ "IGHV4-39*02": "QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNHFSLKLSSVTAADTAVYYCAR-",
364
+ "IGHV4-39*06": "RLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNQFPLKLSSVTAADTAVYYCAR-",
365
+ "IGHV4-39*07": "QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
366
+ "IGHV4-4*01": "QVQLQESGPGLVKPPGTLSLTCAVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GSTNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYCCAR-",
367
+ "IGHV4-4*02": "QVQLQESGPGLVKPSGTLSLTCAVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GSTNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYYCAR-",
368
+ "IGHV4-4*07": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPAGKGLEWIGRIYTS---GSTNYNPSLKSRVTMSVDTSKNQFSLKLSSVTAADTAVYYCAR-",
369
+ "IGHV4-4*08": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYTS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
370
+ "IGHV4-55*01": "QVQLQESGPGLVKPSETLSLICAVSGDSIS-SGNW-IWVRQPPGKGLEWIGEIHHS---GSTYYNPSLKSRITMSVDTSKNQFYLKLSSVTAADTAVYYCAR-",
371
+ "IGHV4-55*02": "QVQLQESGPGLVKPSETLSLICAVSGDSIS-SGNW-IWVRQPPGKGLEWIGEIHHS---GSTYYNPSLKSRITMSVDTSKNQFYLKLSSVTAADTAVYYCAR-",
372
+ "IGHV4-55*08": "QVQLQESGPGLVKPSETLSLICAVSGDSIS-SGNW-IWVRQPPGKGLEWIGEIHHS---GSTYYNPSLKSRITMSVDTSKNQFYLKLSSVTAADTAVYYCAR-",
373
+ "IGHV4-55*09": "QVQLQESGPGLVKPSETLSLICAVSGDSIS-SGNW-IWVRQPPGKGLEWIGEIHHS---GSTYYNPSLKSRITMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
374
+ "IGHV4-59*01": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
375
+ "IGHV4-59*02": "QVQLQESGPGLVKPSETLSLTCTVSGGSV--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
376
+ "IGHV4-59*07": "QVQLQESGPGLVKPSDTLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
377
+ "IGHV4-59*08": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
378
+ "IGHV4-59*10": "QVQLQQWGAGLLKPSETLSLTCAVYGGSI--SSYYWSWIRQPAGKGLEWIGRIYTS---GSTNYNPSLKSRVTMSVDTSKNQFSLKLSSVTAADTAVYYCAR-",
379
+ "IGHV4-59*11": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSHYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
380
+ "IGHV4-59*13": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
381
+ "IGHV4-61*01": "QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
382
+ "IGHV4-61*02": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
383
+ "IGHV4-61*03": "QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNHFSLKLSSVTAADTAVYYCAR-",
384
+ "IGHV4-61*05": "QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYYCAR-",
385
+ "IGHV4-61*08": "QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGGYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
386
+ "IGHV4-61*09": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGHIYTS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
387
+ "IGHV4/OR15-8*01": "QVQLQESGPGLVKPSETLSLTCVVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GSPNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYYCAR-",
388
+ "IGHV4/OR15-8*02": "QVQLQESGPGLVKPSETLSLTCVVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GNPNYNPSLKSRVTISIDKSKNQFSLKLSSVTAADTAVYYCAR-",
389
+ "IGHV4/OR15-8*03": "QVQLQESGPGLVKPSETLSLTCVVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GSPNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYYCAR-",
390
+ "IGHV5-10-1*01": "EVQLVQSGAEVKKPGESLRISCKGSGYSF--TSYWISWVRQMPGKGLEWMGRIDPS--DSYTNYSPSFQGHVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
391
+ "IGHV5-10-1*02": "EVQLVQSGAEVKKPGESLRISCKGSGYSF--TSYWISWVRQMPGKGLEWMGRIDPS--DSYTNYSPSFQGHVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
392
+ "IGHV5-10-1*03": "EVQLVQSGAEVKKPGESLRISCKGSGYSF--TSYWISWVRQMPGKGLEWMGRIDPS--DSYTNYSPSFQGHVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
393
+ "IGHV5-10-1*04": "EVQLVQSGAEVKKPGESLRISCKGSGYSF--TSYWISWVRQMPGKGLEWMGRIDPS--DSYTNYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
394
+ "IGHV5-51*01": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWIGWVRQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
395
+ "IGHV5-51*02": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWTGWVRQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
396
+ "IGHV5-51*03": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWIGWVRQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
397
+ "IGHV5-51*04": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWIGWVRQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKPISTAYLQWSSLKASDTAMYYCAR-",
398
+ "IGHV5-51*07": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWIGWVHQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
399
+ "IGHV5-78*01": "EVQLLQSAAEVKRPGESLRISCKTSGYSF--TSYWIHWVRQMPGKELEWMGSIYPG--NSDTRYSPSFQGHVTISADSSSSTAYLQWSSLKASDAAMYYCVR-",
400
+ "IGHV6-1*01": "QVQLQQSGPGLVKPSQTLSLTCAISGDSVSSNSAAWNWIRQSPSRGLEWLGRTYYRS-KWYNDYAVSVKSRITINPDTSKNQFSLQLNSVTPEDTAVYYCAR-",
401
+ "IGHV6-1*02": "QVQLQQSGPGLVKPSQTLSLTCAISGDSVSSNSAAWNWIRQSPSRGLEWLGRTYYRS-KWYNDYAVSVKSRITINPDTSKNQFSLQLNSVTPEDTAVYYCAR-",
402
+ "IGHV6-1*03": "QVQLQQSGPGLVKPSQTLSLTCAISGDSVSSNSAAWNWIRQSPSRGLEWLGRTYYRS-KWYNDYAVSVKS-ITINPDTSKNQFSLQLNSVTPEDTAVYYCAR-",
403
+ "IGHV7-34-1*01": "-LQLVQSGPEVKKPGASVKVSYKSSGYTF--TIYGMNWV--TPGQGFEWM-WIITY--TGNPTYTHGFTGWFVFSMDTSVSTACLQISSLKAEDTAEYYCAK-",
404
+ "IGHV7-34-1*02": "-LQLVQSGPEVKKPGASVKVSYKSSGYTF--TIYGMNWV--TPGQGFEWM-WIITY--NGNPTYTHGFTGWFVFSMDTSVSTACLQISSLKAEDTAEYYCAK-",
405
+ "IGHV7-34-1*03": "-LQLVQSGPEVKKRGASVKVSYKSSGYTF--TIYGMNWV--TPGQGFEWM-WIITY--TGNPTYTHGFTGWFVFSMDTSVSTACLQISSLKAEDTAEYYCAK-",
406
+ "IGHV7-4-1*01": "QVQLVQSGSELKKPGASVKVSCKASGYTF--TSYAMNWVRQAPGQGLEWMGWINTN--TGNPTYAQGFTGRFVFSLDTSVSTAYLQICSLKAEDTAVYYCAR-",
407
+ "IGHV7-4-1*02": "QVQLVQSGSELKKPGASVKVSCKASGYTF--TSYAMNWVRQAPGQGLEWMGWINTN--TGNPTYAQGFTGRFVFSLDTSVSTAYLQISSLKAEDTAVYYCAR-",
408
+ "IGHV7-4-1*04": "QVQLVQSGSELKKPGASVKVSCKASGYTF--TSYAMNWVRQAPGQGLEWMGWINTN--TGNPTYAQGFTGRFVFSLDTSVSMAYLQISSLKAEDTAVYYCAR-",
409
+ "IGHV7-4-1*05": "QVQLVQSGSELKKPGASVKVSCKASGYTF--TSYAMNWVRQAPGQGLEWMGWINTN--TGNPTYAQGFTGRFVFSLDTSVSMAYLQISSLKAEDTAVCYCAR-",
410
+ "IGHV7-40*03": "FSIEKSNNLSVNQWMIR-NMIYVNHGILC--SQYGMNSV-PAPGQGLEWMGWIITY--TGNPTYTNGFTGRFLFSMDTSVSMAYLQISSLKAEDTAVYDCMR-",
411
+ "IGHV7-81*01": "QVQLVQSGHEVKQPGASVKVSCKASGYSF--TTYGMNWVPQAPGQGLEWMGWFNTY--TGNPTYAQGFTGRFVFSMDTSASTAYLQISSLKAEDMAMYYCAR-",
412
+ "IGHV8-51-1*01": "EAQLTESGGDLVH-EGPLRLSCAASWFTF--SIYEIHWVCQASGKGLEWVAVIWRS--ESHQYNADYVRGRLTTSRDNTKYMLYMQMNSLRTQNMAAFNCAG-",
413
+ "IGHV8-51-1*02": "EAQLTESGGDLVHLEGPLRLSCAASWFTF--SIYEIHWVCQASGKGLEWVAVIWRG--ESHQYNADYVRGRLTTSRDNTKYMLYMQMISLRTQNMAAFNCAG-",
414
+ "IGHV8-51-1*03": "EAQLTESGGDLVH-EGPLRLSCAASWFTF--SIYEIHWVCQASGKGLEWVAVIWRG--ESHQYNADYVRGRLTTSRDNTKYMLYMQMNSLRTQNMAAFNCAG-"
415
+ }
416
+ },
417
+ 'L': {
418
+ "positions": [
419
+ "L1", "L2", "L3", "L4", "L5", "L6", "L7", "L8", "L9", "L10", "L11", "L12", "L13", "L14", "L15", "L16", "L17", "L18", "L19", "L20", "L21", "L22", "L23", "L24", "L25", "L26", "L27", "L28", "L29", "L30", "L31", "L32", "L33", "L34", "L35", "L36", "L37", "L38", "L39", "L40", "L41", "L42", "L43", "L44", "L45", "L46", "L47", "L48", "L49", "L50", "L51", "L52", "L53", "L54", "L55", "L56", "L57", "L58", "L59", "L62", "L63", "L64", "L65", "L66", "L67", "L68", "L69", "L70", "L71", "L72", "L74", "L75", "L76", "L77", "L78", "L79", "L80", "L81", "L82", "L83", "L84", "L85", "L86", "L87", "L88", "L89", "L90", "L91", "L92", "L93", "L94", "L95", "L96", "L97", "L98", "L99", "L100", "L101", "L102", "L103", "L104", "L105", "L106", "L107", "L108", "L109", "L110", "L111", "L111A", "L111B", "L111C", "L111D"
420
+ ],
421
+ "aligned_sequences": {
422
+ "IGLV1-36*01": "QSVLTQPPS-VSEAPRQRVTISCSGSSSNI----GNNAVNWYQQLPGKAPKLLIYYD-----DLLPSGVSDRFSGSK--SGTSASLAISGLQSEDEADYYCAAWDDSLNG--",
423
+ "IGLV1-40*01": "QSVLTQPPS-VSGAPGQRVTISCTGSSSNIG---AGYDVHWYQQLPGTAPKLLIYGN-----SNRPSGVPDRFSGSK--SGTSASLAITGLQAEDEADYYCQSYDSSLSG--",
424
+ "IGLV1-40*02": "QSVVTQPPS-VSGAPGQRVTISCTGSSSNIG---AGYDVHWYQQLPGTAPKLLIYGN-----SNRPSGVPDRFSGSK--SGTSASLAITGLQAEDEADYYCQSYDSSLSG--",
425
+ "IGLV1-40*03": "QSVVTQPPS-VSGAPGQRVTISCTGSSSNIG---AGYDVHWYQQLPGTAPKLLIYGN-----SNRPSGVPDRFSGSK--SGASASLAITGLQAEDEADYYCQSYDSSLSG--",
426
+ "IGLV1-41*01": "QSVLTQPPS-VSAAPGQKVTISCSGSSSDM----GNYAVSWYQQLPGTAPKLLIYEN-----NKRPSGIPDRFSGSK--SGTSATLGITGLWPEDEADYYCLAWDTSPRA--",
427
+ "IGLV1-41*02": "QSVLTQPPS-VSAAPGQKVTISCSGSSSDM----GNYAVSWYQQLPGTAPKLLIYEN-----NKRPSGIPDRFSGSK--SGTSATLGITGLWPED-ADYYCLAWDTSLRA--",
428
+ "IGLV1-44*01": "QSVLTQPPS-ASGTPGQRVTISCSGSSSNI----GSNTVNWYQQLPGTAPKLLIYSN-----NQRPSGVPDRFSGSK--SGTSASLAISGLQSEDEADYYCAAWDDSLNG--",
429
+ "IGLV1-47*01": "QSVLTQPPS-ASGTPGQRVTISCSGSSSNI----GSNYVYWYQQLPGTAPKLLIYRN-----NQRPSGVPDRFSGSK--SGTSASLAISGLRSEDEADYYCAAWDDSLSG--",
430
+ "IGLV1-47*02": "QSVLTQPPS-ASGTPGQRVTISCSGSSSNI----GSNYVYWYQQLPGTAPKLLIYSN-----NQRPSGVPDRFSGSK--SGTSASLAISGLRSEDEADYYCAAWDDSLSG--",
431
+ "IGLV1-47*03": "QSVLTQPPS-ASGTPGQRVTISCSGSSSNI----GSNYVYWYQQLPGTAPKLLIYRN-----NQRPSGVPDRFSGSK--SGTSASLAISGLWSEDEADYYCAAWDDSLSG--",
432
+ "IGLV1-50*01": "QSVLTQPPS-VSGAPGQRVTISCTGSSSNIG---AGYVVHWYQQLPGTAPKLLIYGN-----SNRPSGVPDQFSGSK--SGTSASLAITGLQSEDEADYYCKAWDNSLNA--",
433
+ "IGLV1-51*01": "QSVLTQPPS-VSAAPGQKVTISCSGSSSNI----GNNYVSWYQQLPGTAPKLLIYDN-----NKRPSGIPDRFSGSK--SGTSATLGITGLQTGDEADYYCGTWDSSLSA--",
434
+ "IGLV1-51*02": "QSVLTQPPS-VSAAPGQKVTISCSGSSSNI----GNNYVSWYQQLPGTAPKLLIYEN-----NKRPSGIPDRFSGSK--SGTSATLGITGLQTGDEADYYCGTWDSSLSA--",
435
+ "IGLV1-62*01": "QSVLTQPPS-VSWATRQRLTVSCTGSSSNTG---TGYNVNCWQ-LPRTDPKLLRHGD-----KNWASWVSDQFSGSK--SGSLASLGTTGLWAEDKTDYHCQSRDIC-VL--",
436
+ "IGLV10-54*01": "QAGLTQPPS-VSKGLRQTATLTCTGNSNNV----GNQGAAWLQQHQGHPPKLLSYRN-----NNRPSGISERLSASR--SGNTASLTITGLQPEDEADYYCSAWDSSLSA--",
437
+ "IGLV10-54*02": "QAGLTQPPS-VSKGLRQTATLTCTGNSNIV----GNQGAAWLQQHQGHPPKLLSYRN-----NNRPSGISERFSASR--SGNTASLTITGLQPEDEADYYCSALDSSLSA--",
438
+ "IGLV10-54*03": "QAGLTQPPS-VSKGLRQTATLTCTGNSNNV----GNQGAAWPEQHQGHPPKLLSYRN-----NNRPSGISERLSASR--SGNTASLTITGLQPEDEADYYCSAWDSSLSA--",
439
+ "IGLV11-55*01": "RPVLTQPPS-LSASPGATARLPCTLSSDLSV---GGKNMFWYQQKPGSSPRLFLYHYSD-SDKQLGPGVPSRVSGSKETSSNTAFLLISGLQPEDEADYYCQVYESSAN---",
440
+ "IGLV11-55*02": "RPVLTQPPS-LSASPGATARLPCTLSSDLSV---GGKNMFWYQQKLGSSPRLFLYHYSD-SDKQLGPGVPSRVSGSKETSSNTAFLLISGLQPEDEADYYCQVYESSAN---",
441
+ "IGLV2-11*01": "QSALTQPRS-VSGSPGQSVTISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYDV-----SKRPSGVPDRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSYTF--",
442
+ "IGLV2-11*02": "QSALTQPRS-VSGSPGQSVTISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYDV-----SKRPSGVPDRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSYTF--",
443
+ "IGLV2-14*01": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYEV-----SNRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTL--",
444
+ "IGLV2-14*02": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---SYNLVSWYQQHPGKAPKLMIYEG-----SKRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTL--",
445
+ "IGLV2-14*03": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYDV-----SNRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTL--",
446
+ "IGLV2-18*01": "QSALTQPPS-VSGSPGQSVTISCTGTSSDVG---SYNRVSWYQQPPGTAPKLMIYEV-----SNRPSGVPDRFSGSK--SGNTASLTISGLQAEDEADYYCSLYTSSSTF--",
447
+ "IGLV2-18*02": "QSALTQPPS-VSGSPGQSVTISCTGTSSDVG---SYNRVSWYQQPPGTAPKLMIYEV-----SNRPSGVPDRFSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTF--",
448
+ "IGLV2-18*03": "QSALTQPPS-VSGSPGQSVTISCTGTSSDVG---SYNRVSWYQQPPGTAPKLMIYEV-----SNRPSGVPDRFSGSK--SGNTASLTTSGLQAEDEADYYCSSYTSSSTF--",
449
+ "IGLV2-18*04": "QSALTQPPS-VSGSPGQSVTISCTGTSSDVG---SYNRVSWYQQPPGTAPKLMIYEV-----SNRPSGVPDRSSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTF--",
450
+ "IGLV2-23*01": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---SYNLVSWYQQHPGKAPKLMIYEG-----SKRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSSTL--",
451
+ "IGLV2-23*02": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---SYNLVSWYQQHPGKAPKLMIYEV-----SKRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSSTF--",
452
+ "IGLV2-23*03": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---SYNLVSWYQQHPGKAPKLMIYEG-----SKRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSSTF--",
453
+ "IGLV2-33*01": "QSALTQPPF-VSGAPGQSVTISCTGTSSDVG---DYDHVFWYQKRLSTTSRLLIYNV-----NTRPSGISDLFSGSK--SGNMASLTISGLKSEVEANYHCSLYSSSYTF--",
454
+ "IGLV2-33*02": "QSALTQPPF-VSGAPGQSVTISCTGTSSDVG---DYDHVFWYQKRLSTTSRLLIYNV-----NTRPSGISDLFSGSK--SGNMASLTISGLKSEVEANYHCSLYSSSYTF--",
455
+ "IGLV2-33*03": "QSALTQPPF-VSGAPGQSVTISCTGTSSDVG---DYDHVFWYQKRLSTTSRLLIYNV-----NTRPSGISDLFSGSK--SGNVASLTISGLKSEVEANYHCSLYSSSYTF--",
456
+ "IGLV2-34*01": "QSVLTQPRS-VSRSPGQ-VTIFCTGTSSDIG---GYDLVSWCQ-HPGKAPKLMIYDV-----ANWPSGAPGCFSGSK--SGNTASLTISGLQAEDEADYYCSSYAGSYNF--",
457
+ "IGLV2-34*02": "QSVLTQPRS-VSRSPGQ-VTIFCTGTSSDIG---GYDLVSWCQ-HPGKAPKLMIYDV-----GNWPSGAPGCFSGSK--SGNTASLTISGLQAEDEADYYCSSYAGSYNF--",
458
+ "IGLV2-5*01": "QSALIQPPS-VSGSPGQSVTISCTGTSSDVG---SYDYVSWYQQHPGTVPKPMIYNV-----NTQPSGVPDRFSGSK--SGNTASMTISGLQAEDEADY-CCSYTSSAT---",
459
+ "IGLV2-5*02": "QSALIQPPS-VSGSPGQSVTISCTGTSSDVG---SYDYVSWYQQHPGTVPKPMIYNV-----NTRPSGVPDRFSGSK--SGNTASMTISGLQAEDEADY-CCSYTSSAT---",
460
+ "IGLV2-8*01": "QSALTQPPS-ASGSPGQSVTISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYEV-----SKRPSGVPDRFSGSK--SGNTASLTVSGLQAEDEADYYCSSYAGSNNF--",
461
+ "IGLV2-8*02": "QSALTQPPS-ASRSPGQSVTISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYEV-----SKRPSGVPDRFSGSK--SGNTASLTVSGLQAEDEADYYCSSYAGSNNF--",
462
+ "IGLV3-1*01": "SYELTQPPS-VSVSPGQTASITCSGDKLG------DKYACWYQQKPGQSPVLVIYQD-----SKRPSGIPERFSGSN--SGNTATLTISGTQAMDEADYYCQAWDSSTA---",
463
+ "IGLV3-10*01": "SYELTQPPS-VSVSPGQTARITCSGDALP------KKYAYWYQQKSGQAPVLVIYED-----SKRPSGIPERFSGSS--SGTMATLTISGAQVEDEADYYCYSTDSSGNH--",
464
+ "IGLV3-10*03": "SYELTQPPS-VSVSPGQTARITCSGDALP------KKYAYWYQQKSGQAPVLVIYED-----SKRPSGIPERFSGSS--SGTMATLTISGAQVEDEDDYYCYSTDSSGNH--",
465
+ "IGLV3-12*01": "SYELTQPHS-VSVATAQMARITCGGNNIG------SKAVHWYQQKPGQDPVLVIYSD-----SNRPSGIPERFSGSN--PGNTTTLTISRIEAGDEADYYCQVWDSSSDH--",
466
+ "IGLV3-12*02": "SYELTQPHS-VSVATAQMARITCGGNNIG------SKAVHWYQQKPGQDPVLVIYSD-----SNRPSGIPERFSGSN--PGNTATLTISRIEAGDEADYYCQVWDSSSDH--",
467
+ "IGLV3-13*01": "SYELTQPPA-VSVSPGQTARISCSGDVLR------DNYADWYPQKPGQAPVLVIYKD-----GERPSGIPERFSGST--SGNTTALTISRVLTKGGADYYCFSGD-NN----",
468
+ "IGLV3-13*02": "SYELTQPPA-VSVSPGQTARISCSGDVLR------DNYADWYPQKPGQTPVLVIYKD-----GERPSGIPERFSGST--SGNTTALTISRVLTKGGADYYCFSGD-NN----",
469
+ "IGLV3-16*01": "SYELTQPPS-VSVSLGQMARITCSGEALP------KKYAYWYQQKPGQFPVLVIYKD-----SERPSGIPERFSGSS--SGTIVTLTISGVQAEDEADYYCLSADSSGTY--",
470
+ "IGLV3-19*01": "SSELTQDPA-VSVALGQTVRITCQGDSLR------SYYASWYQQKPGQAPVLVIYGK-----NNRPSGIPDRFSGSS--SGNTASLTITGAQAEDEADYYCNSRDSSGNH--",
471
+ "IGLV3-19*02": "SSELTQDPA-VSVALGQTVRITCQGDSLR------SYYASWYQQKPGQAPVRVIYGK-----NNRPSGIPDRFSGSS--SGNTASLTITGAQAEDEADYYCNSWDSSGNH--",
472
+ "IGLV3-21*01": "SYVLTQPPS-VSVAPGKTARITCGGNNIG------SKSVHWYQQKPGQAPVLVIYYD-----SDRPSGIPERFSGSN--SGNTATLTISRVEAGDEADYYCQVWDSSSDH--",
473
+ "IGLV3-21*02": "SYVLTQPPS-VSVAPGQTARITCGGNNIG------SKSVHWYQQKPGQAPVLVVYDD-----SDRPSGIPERFSGSN--SGNTATLTISRVEAGDEADYYCQVWDSSSDH--",
474
+ "IGLV3-21*03": "SYVLTQPPS-VSVAPGKTARITCGGNNIG------SKSVHWYQQKPGQAPVLVVYDD-----SDRPSGIPERFSGSN--SGNTATLTISRVEAGDEADYYCQVWDSSSDH--",
475
+ "IGLV3-21*04": "SYVLTQPPS-VSVAPGKTARITCGGNNIG------SKSVHWYQQKPGQAPVLVIYYD-----SDRPSGIPERFSGSN--SGNTATLTISRVEAGDEADYYCQVWDSSSDH--",
476
+ "IGLV3-22*01": "SYELTQLPS-VSVSPGQTARITCSGDVLG------ENYADWYQQKPGQAPELVIYED-----SERYPGIPERFSGST--SGNTTTLTISRVLTEDEADYYCLSGDEDN----",
477
+ "IGLV3-22*03": "SYELTQLPS-VSLSPGQKARITCSGDVLG------KNYADWYQQKPGQAPELVIYED-----SERYPGIPERFSGST--SGNTTTLTISRVLTEDEADYYCLSGNEDN----",
478
+ "IGLV3-25*01": "SYELMQPPS-VSVSPGQTARITCSGDALP------KQYAYWYQQKPGQAPVLVIYKD-----SERPSGIPERFSGSS--SGTTVTLTISGVQAEDEADYYCQSADSSGTY--",
479
+ "IGLV3-25*02": "SYELTQPPS-VSVSPGQTARITCSGDALP------KQYAYWYQQKPGQAPVLVIYKD-----SERPSGIPERFSGSS--SGTTVTLTISGVQAEDEADYYCQSADSSGTY--",
480
+ "IGLV3-25*03": "SYELTQPPS-VSVSPGQTARITCSGDALP------KQYAYWYQQKPGQAPVLVIYKD-----SERPSGIPERFSGSS--SGTTVTLTISGVQAEDEADYYCQSADSSGTY--",
481
+ "IGLV3-27*01": "SYELTQPSS-VSVSPGQTARITCSGDVLA------KKYARWFQQKPGQAPVLVIYKD-----SERPSGIPERFSGSS--SGTTVTLTISGAQVEDEADYYCYSAADNN----",
482
+ "IGLV3-31*01": "SSELSQEPA-VSVALG-TARITCQGDSIE------DSVVNWYKQKPSQAPGLVI-LN-----SVQSSGIPKKFSGSS--SGNMATLTITGIQVEDKADYYCQSWDSSRTH--",
483
+ "IGLV3-31*02": "SSELSQEPA-VSVSLG-TARITCQGDSIE------DSVVNWYKQKPSQAPGLVI-LN-----SVQSSGIPKKFSGSS--SGNMATLTITGIQVEDKADYYCQSWDSSRTH--",
484
+ "IGLV3-32*01": "SSGPTQVPA-VSVALGQMARITCQGDSME------GSYEHWYQQKPGQAPVLVIYDS-----SDRPSRIPERFSGSK--SGNTTTLTITGAQAEDEADYYYQLIDNHATQ--",
485
+ "IGLV3-9*01": "SYELTQPLS-VSVALGQTARITCGGNNIG------SKNVHWYQQKPGQAPVLVIYRD-----SNRPSGIPERFSGSN--SGNTATLTISRAQAGDEADYYCQVWDSSTA---",
486
+ "IGLV3-9*02": "SYELTQPLS-VSVALGQAARITCGGNNLG------YKSVHWYQQKPGQAPVLVIYRD-----NNRPSGIPERFSGSN--SGNTATLTISRAQAGDEADYYCQVWDSSTAH--",
487
+ "IGLV4-3*01": "LPVLTQPPS-ASALLGASIKLTCTLSSEHS-----TYTIEWYQQRPGRSPQYIMKVKSD-GSHSKGDGIPDRFMGSS--SGADRYLTFSNLQSDDEAEYHCGESHTIDGQVG",
488
+ "IGLV4-60*01": "QPVLTQSSS-ASASLGSSVKLTCTLSSGHS-----SYIIAWHQQQPGKAPRYLMKLEGS-GSYNKGSGVPDRFSGSS--SGADRYLTISNLQLEDEADYYCETWDSNT----",
489
+ "IGLV4-60*02": "QPVLTQSSS-ASASLGSSVKLTCTLSSGHS-----SYIIAWHQQQPGKAPRYLMKLEGS-GSYNKGSGVPDRFSGSS--SGADRYLTISNLQFEDEADYYCETWDSNT----",
490
+ "IGLV4-60*03": "QPVLTQSSS-ASASLGSSVKLTCTLSSGHS-----SYIIAWHQQQPGKAPRYLMKLEGS-GSYNKGSGVPDRFSGSS--SGADRYLTISNLQSEDEADYYCETWDSNT----",
491
+ "IGLV4-69*01": "QLVLTQSPS-ASASLGASVKLTCTLSSGHS-----SYAIAWHQQQPEKGPRYLMKLNSD-GSHSKGDGIPDRFSGSS--SGAERYLTISSLQSEDEADYYCQTWGTGI----",
492
+ "IGLV4-69*02": "QLVLTQSPS-ASASLGASVKLTCTLSSGHS-----SYAIAWHQQQPEKGPRYLMKLNSD-GSHSKGDGIPDRFSGSS--SGAERYLTISSLQSEDEADYYCQTWGTGI----",
493
+ "IGLV5-37*01": "QPVLTQPPS-SSASPGESARLTCTLPSDINV---GSYNIYWYQQKPGSPPRYLLYYYSD-SDKGQGSGVPSRFSGSKDASANTGILLISGLQSEDEADYYCMIWPSNAS---",
494
+ "IGLV5-39*01": "QPVLTQPTS-LSASPGASARFTCTLRSGINV---GTYRIYWYQQKPGSLPRYLLRYKSD-SDKQQGSGVPSRFSGSKDASTNAGLLLISGLQSEDEADYYCAIWYSSTS---",
495
+ "IGLV5-39*02": "QPVLTQPTS-LSASPGASARFTCTLRSGINV---GTYRIYWYQQNPGSLPRYLLRYKSD-SDKQQGSGVPSRFSGSKDASTNAGLLLISGLQSEDEADYYCAIWYSSTS---",
496
+ "IGLV5-45*01": "QAVLTQPAS-LSASPGASASLTCTLRSGINV---GTYRIYWYQQKPGSPPQYLLRYKSD-SDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAS---",
497
+ "IGLV5-45*02": "QAVLTQPSS-LSASPGASASLTCTLRSGINV---GTYRIYWYQQKPGSPPQYLLRYKSD-SDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAS---",
498
+ "IGLV5-45*03": "QAVLTQPSS-LSASPGASASLTCTLRSGINV---GTYRIYWYQQKPGSPPQYLLRYKSD-SDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAS---",
499
+ "IGLV5-45*04": "QAVLTQPSS-LSASPGASASLTCTLCSGINV---GTYRIYWYQQKPGSPPQYLLRYKSD-SDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAS---",
500
+ "IGLV5-48*01": "QPVLTQPTS-LSASPGASARLTCTLRSGINL---GSYRIFWYQQKPESPPRYLLSYYSD-SSKHQGSGVPSRFSGSKDASSNAGILVISGLQSEDEADYYCMIWHSSAS---",
501
+ "IGLV5-48*02": "QAVLTQPTS-LSASPGASARLTCTLRSGISV---GSYRIYWYQQKPGSPPRYLLNYYSD-SDKHQGSGVPSRFSGSKDASTNAGILFISGL-SEDEADYYCMIWHSSAS---",
502
+ "IGLV5-52*01": "QPVLTQPSS-HSASSGASVRLTCMLSSGFSV---GDFWIRWYQQKPGNPPRYLLYYHSD-SNKGQGSGVPSRFSGSNDASANAGILRISGLQPEDEADYYCGTWHSNSKT--",
503
+ "IGLV6-57*01": "NFMLTQPHS-VSESPGKTVTISCTRSSGSI----ASNYVQWYQQRPGSSPTTVIYED-----NQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSN----",
504
+ "IGLV6-57*02": "NFMLTQPHS-VSESPGKTVTISCTGSSGSI----ASNYVQWYQQRPGSAPTTVIYED-----NQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSN----",
505
+ "IGLV6-57*03": "NFMLTQPHS-VSESPGKTVTISCTRSSGSI----ASNYVQWYQQRPGSAPTTVIYED-----NQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSN----",
506
+ "IGLV7-43*01": "QTVVTQEPS-LTVSPGGTVTLTCASSTGAVT---SGYYPNWFQQKPGQAPRALIYST-----SNKHSWTPARFSGSL--LGGKAALTLSGVQPEDEAEYYCLLYYGGAQ---",
507
+ "IGLV7-46*01": "QAVVTQEPS-LTVSPGGTVTLTCGSSTGAVT---SGHYPYWFQQKPGQAPRTLIYDT-----SNKHSWTPARFSGSL--LGGKAALTLSGAQPEDEAEYYCLLSYSGAR---",
508
+ "IGLV7-46*02": "QAVVTQEPS-LTVSPGGTVTLTCGSSTGAVT---SGHYPYWFQQKPGQAPRTLIYDT-----SNKHSWTPARFSGSL--LGGKAALTLLGAQPEDEAEYYCLLSYSGAR---",
509
+ "IGLV8-61*01": "QTVVTQEPS-FSVSPGGTVTLTCGLSSGSVS---TSYYPSWYQQTPGQAPRTLIYST-----NTRSSGVPDRFSGSI--LGNKAALTITGAQADDESDYYCVLYMGSGI---",
510
+ "IGLV8-61*02": "QTVVTQEPS-FSVSPGGTVTLTCGLSSGSVS---TSYYPSWYQQTPGQAPRTLIYST-----NTRSSGVPDCFSGSI--LGNKAALTITGAQADDESDYYCVLYMGSGI---",
511
+ "IGLV8/OR8-1*02": "QSVVTQEPS-LSGSPGGTVTLTCALSSGSVS---TSHYPRWYQQTPGQAPHMLICSP-----NTCPSGVPGRFSGSI--LGNKAALTITGTQVDDDSDHYCVLYMGSGN---",
512
+ "IGLV9-49*01": "QPVLTQPPS-ASASLGASVTLTCTLSSGYS-----NYKVDWYQQRPGKGPRFVMRVGTGGIVGSKGDGIPDRFSVLG--SGLNRYLTIKNIQEEDESDYHCGADHGSGSNFV",
513
+ "IGLV9-49*02": "QPVLTQPPS-ASASLGASVTLTCTLSSGYS-----NYKVDWYQQRPGKGPRFVMRVGTGGIVGSKGDGIPDRFSVLG--SGLNRYLTIKNIQEEDESDYHCGADHGSGSNFV",
514
+ "IGLV9-49*03": "QPVLTQPPS-ASASLGASVTLTCTLSSGYS-----NYKVDWYQQRPGKGPRFVMRVGTGGIVGSKGDGIPDRFSVLG--SGLNRYLTIKNIQEEDESDYHCGADHGSGSNFV"
515
+ }
516
+ },
517
+ 'K': {
518
+ "positions": [
519
+ "L1", "L2", "L3", "L4", "L5", "L6", "L7", "L8", "L9", "L10", "L11", "L12", "L13", "L14", "L15", "L16", "L17", "L18", "L19", "L20", "L21", "L22", "L23", "L24", "L25", "L26", "L27", "L28", "L29", "L30", "L31", "L32", "L33", "L34", "L35", "L36", "L37", "L38", "L39", "L40", "L41", "L42", "L43", "L44", "L45", "L46", "L47", "L48", "L49", "L50", "L51", "L52", "L53", "L54", "L55", "L56", "L57", "L58", "L59", "L62", "L63", "L64", "L65", "L66", "L67", "L68", "L69", "L70", "L71", "L72", "L74", "L75", "L76", "L77", "L78", "L79", "L80", "L81", "L82", "L83", "L84", "L85", "L86", "L87", "L88", "L89", "L90", "L91", "L92", "L93", "L94", "L95", "L96", "L97", "L98", "L99", "L100", "L101", "L102", "L103", "L104", "L105", "L106", "L107", "L108", "L109", "L110", "L111", "L111A", "L111B", "L111C", "L111D"
520
+ ],
521
+ "aligned_sequences": {
522
+ "IGKV1-12*01": "DIQMTQSPSSVSASVGDRVTITCRASQGI------SSWLAWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQANSFP----",
523
+ "IGKV1-12*02": "DIQMTQSPSSVSASVGDRVTITCRASQGI------SSWLAWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQANSFP----",
524
+ "IGKV1-13*01": "AIQLTQSPSSLSASVGDRVTITCRASQGI------SSALA-YQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQFNNYP----",
525
+ "IGKV1-13*02": "AIQLTQSPSSLSASVGDRVTITCRASQGI------SSALAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQFNSYP----",
526
+ "IGKV1-16*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNYLAWFQQKPGKAPKSLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQYNSYP----",
527
+ "IGKV1-16*02": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNYLAWFQQKPGKAPKSLIYAA-----SSLQSGVPSKFSGSG--SGTDFTLTISSLQPEDFATYYCQQYNSYP----",
528
+ "IGKV1-17*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------RNDLGWYQQKPGKAPKRLIYAA-----SSLQSGVPSRFSGSG--SGTEFTLTISSLQPEDFATYYCLQHNSYP----",
529
+ "IGKV1-17*02": "DIQMTQSPSSLSASVGDRVTITCRASQGI------RNDLGWYQQKPGKAPKRLIYAA-----SSLQSGVPSRFSGSG--SGTEFTLTISNLQPEDFATYYCLQHNSYP----",
530
+ "IGKV1-17*03": "DIQMTQSPSAMSASVGDRVTITCRASQGI------SNYLAWFQQKPGKVPKRLIYAA-----SSLQSGVPSRFSGSG--SGTEFTLTISSLQPEDFATYYCLQHNSYP----",
531
+ "IGKV1-27*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNYLAWYQQKPGKVPKLLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISSLQPEDVATYYCQKYNSAP----",
532
+ "IGKV1-33*01": "DIQMTQSPSSLSASVGDRVTITCQASQDI------SNYLNWYQQKPGKAPKLLIYDA-----SNLETGVPSRFSGSG--SGTDFTFTISSLQPEDIATYYCQQYDNLP----",
533
+ "IGKV1-37*01": "DIQLTQSPSSLSASVGDRVTITCRVSQGI------SSYLNWYRQKPGKVPKLLIYSA-----SNLQSGVPSRFSGSG--SGTDFTLTISSLQPEDVATYYGQRTYNAP----",
534
+ "IGKV1-39*01": "DIQMTQSPSSLSASVGDRVTITCRASQSI------SSYLNWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQSYSTP----",
535
+ "IGKV1-39*02": "DIQMTQSPSFLSASVGDRVTITCRASQSI------SSYLNWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQCGYSTP----",
536
+ "IGKV1-5*01": "DIQMTQSPSTLSASVGDRVTITCRASQSI------SSWLAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTEFTLTISSLQPDDFATYYCQQYNSYS----",
537
+ "IGKV1-5*02": "DIQMTQSPSTLSASVGDRVTIICRASQSI------SSWLAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTEFTLTISSLQPDDFATYYCQQYNSYS----",
538
+ "IGKV1-5*03": "DIQMTQSPSTLSASVGDRVTITCRASQSI------SSWLAWYQQKPGKAPKLLIYKA-----SSLESGVPSRFSGSG--SGTEFTLTISSLQPDDFATYYCQQYNSYS----",
539
+ "IGKV1-6*01": "AIQMTQSPSSLSASVGDRVTITCRASQGI------RNDLGWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCLQDYNYP----",
540
+ "IGKV1-6*02": "AIQMTQSPSSLSASVGDRVTITCRASQGI------RNDLGWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCLQDYNYP----",
541
+ "IGKV1-8*01": "AIRMTQSPSSFSASTGDRVTITCRASQGI------SSYLAWYQQKPGKAPKLLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISCLQSEDFATYYCQQYYSYP----",
542
+ "IGKV1-9*01": "DIQLTQSPSFLSASVGDRVTITCRASQGI------SSYLAWYQQKPGKAPKLLIYAA-----STLQSGVPSRFSGSG--SGTEFTLTISSLQPEDFATYYCQQLNSYP----",
543
+ "IGKV1-NL1*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNSLAWYQQKPGKAPKLLLYAA-----SRLESGVPSRFSGSG--SGTDYTLTISSLQPEDFATYYCQQYYSTP----",
544
+ "IGKV1/OR-2*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPRKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSNP----",
545
+ "IGKV1/OR-3*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFAAYYCQQSDSTP----",
546
+ "IGKV1/OR-4*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKFLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSTP----",
547
+ "IGKV1/OR1-1*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGADYTLTIRSLQPEDFATYYCQQSDSTP----",
548
+ "IGKV1/OR10-1*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSTS----",
549
+ "IGKV1/OR15-118*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----PSLQSGIPSRFSDSG--SGADYTLTIRSLQPEDFATY-CQQSDSTP----",
550
+ "IGKV1/OR2-0*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----PSLQSGIPSRFSDSG--SGADYTLTIRSLQPEDFATYYCQQSDSTP----",
551
+ "IGKV1/OR2-1*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGADYTLTISSLQPEDFAAYYCQQSDSTP----",
552
+ "IGKV1/OR2-108*01": "DIQVTQSPSSLSASVGDRVTITCRASQGI------SNGLSWYQQKPGQAPTLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDVATYYCLQDYTTP----",
553
+ "IGKV1/OR2-11*01": "DIQMTQPPSSLSASVGDRATVSCQASQSI------YNYLNWYQQKPGKAPKFLTYRA-----SSLQRAMPSQFSGSG--YGRDFTLTVSSLQPEDFATY-CQQESIFP----",
554
+ "IGKV1/OR2-118*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGADYTLTIRSLQPEDFANYYCQQSDSTP----",
555
+ "IGKV1/OR2-2*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGADYTLTISSLQPEDFAAYYCQQSDSTP----",
556
+ "IGKV1/OR2-3*01": "DIQMTQPPSSLSASVGDRVTVSCQASQSI------YNYLNWYQQKPGKAPKFLTYRA-----SSLQRGMPSQFSGSG--YGRDFTLTVSSLQPEDFATY-CQQESIFP----",
557
+ "IGKV1/OR2-9*01": "DIQMTQPPSSLSASVGDRATVSCQASQSI------YNYLNWYQQKPGKAPKFLTYRA-----SSLQRAMPSQFSGSG--YGRDFTLTVSSLQPEDFATY-CQQESIFP----",
558
+ "IGKV1/OR22-5*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPGKTPKPLIYAA-----SSLQSGIPSQFSDSG--SGTD-TLTISSLQPEDFATYYCQQSYSTP----",
559
+ "IGKV1/OR22-5*02": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSQFSDSG--SGTD-TLTISSLQPEDFTTYYCQQSYSTP----",
560
+ "IGKV1/OR9-1*01": "DIQMTQSPSSLSASVGGRVTITCRVSQGI------SNNLNWYQQKPRKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSNP----",
561
+ "IGKV1/OR9-2*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPRKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSNP----",
562
+ "IGKV1/ORY-1*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------INNLNWYQKKPGKTPKLLIYAA-----SSLQSGIPTRFSDSG--SGTDYTPTISSLQPEDFATYYCQQSDSTP----",
563
+ "IGKV1D-12*01": "DIQMTQSPSSVSASVGDRVTITCRASQGI------SSWLAWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQANSFP----",
564
+ "IGKV1D-12*02": "DIQMTQSPSSVSASVGDRVTITCRASQGI------SSWLAWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQANSFP----",
565
+ "IGKV1D-13*01": "AIQLTQSPSSLSASVGDRVTITCRASQGI------SSALAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQFNNYP----",
566
+ "IGKV1D-13*02": "AIQLTQSPSSLSASVGDRVTITCRASQGI------SSALAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQFNSYP----",
567
+ "IGKV1D-16*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SSWLAWYQQKPEKAPKSLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQYNSYP----",
568
+ "IGKV1D-16*02": "DIQMTQSPSSLSASVGDRVTITCRARQGI------SSWLAWYQQKPEKAPKSLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQYNSYP----",
569
+ "IGKV1D-17*01": "NIQMTQSPSAMSASVGDRVTITCRARQGI------SNYLAWFQQKPGKVPKHLIYAA-----SSLQSGVPSRFSGSG--SGTEFTLTISSLQPEDFATYYCLQHNSYP----",
570
+ "IGKV1D-33*01": "DIQMTQSPSSLSASVGDRVTITCQASQDI------SNYLNWYQQKPGKAPKLLIYDA-----SNLETGVPSRFSGSG--SGTDFTFTISSLQPEDIATYYCQQYDNLP----",
571
+ "IGKV1D-37*01": "DIQLTQSPSSLSASVGDRVTITCRVSQGI------SSYLNWYRQKPGKVPKLLIYSA-----SNLQSGVPSRFSGSG--SGTDFTLTISSLQPEDVATYYGQRTYNAP----",
572
+ "IGKV1D-39*01": "DIQMTQSPSSLSASVGDRVTITCRASQSI------SSYLNWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQSYSTP----",
573
+ "IGKV1D-42*01": "DIQMIQSPSFLSASVGDRVSIICWASEGI------SSNLAWYLQKPGKSPKLFLYDA-----KDLHPGVSSRFSGRG--SGTDFTLTIISLKPEDFAAYYCKQDFSYP----",
574
+ "IGKV1D-42*02": "DIQMTQSPSFLSASVGDRVSIICWASEGI------SSNLAWYLQKPGKSPKLFLYDA-----KDLHPGVSSRFSGRG--SGTDFTLTIISLKPEDFAAYYCKQDFSYP----",
575
+ "IGKV1D-43*01": "AIRMTQSPFSLSASVGDRVTITCWASQGI------SSYLAWYQQKPAKAPKLFIYYA-----SSLQSGVPSRFSGSG--SGTDYTLTISSLQPEDFATYYCQQYYSTP----",
576
+ "IGKV1D-8*01": "VIWMTQSPSLLSASTGDRVTISCRMSQGI------SSYLAWYQQKPGKAPELLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISCLQSEDFATYYCQQYYSFP----",
577
+ "IGKV1D-8*02": "AIWMTQSPSLLSASTGDRVTISCRMSQGI------SSYLAWYQQKPGKAPELLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISCLQSEDFATYYCQQYYSFP----",
578
+ "IGKV1D-8*03": "VIWMTQSPSLLSASTGDRVTISCRMSQGI------SSYLAWYQQKPGKAPELLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISCLQSEDFATYYCQQYYSFP----",
579
+ "IGKV2-18*01": "DIVMTQTPPSLPVNPGEPASISCRSSQSLLHS-NGYTYLHWYLQKPGQSPQLLIYRV-----SNHLSGVPDRFSGSG--SGSDFTLKISWVEAEDVGVYYCMQATQFP----",
580
+ "IGKV2-24*01": "DIVMTQTPLSSPVTLGQPASISCRSSQSLVHS-DGNTYLSWLQQRPGQPPRLLIYKI-----SNRFSGVPDRFSGSG--AGTDFTLKISRVEAEDVGVYYCMQATQFP----",
581
+ "IGKV2-28*01": "DIVMTQSPLSLPVTPGEPASISCRSSQSLLHS-NGYNYLDWYLQKPGQSPQLLIYLG-----SNRASGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQALQTP----",
582
+ "IGKV2-29*01": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQSPQLLIYEV-----SSRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYY-MQGIHLP----",
583
+ "IGKV2-29*02": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQSPQLLIYEV-----SSRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGIHLP----",
584
+ "IGKV2-29*03": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQSPQLLIYEV-----SSRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGIHLP----",
585
+ "IGKV2-30*01": "DVVMTQSPLSLPVTLGQPASISCRSSQSLVYS-DGNTYLNWFQQRPGQSPRRLIYKV-----SNRDSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGTHWP----",
586
+ "IGKV2-30*02": "DVVMTQSPLSLPVTLGQPASISCRSSQSLVHS-DGNTYLNWFQQRPGQSPRRLIYKV-----SNRDSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGTHWP----",
587
+ "IGKV2-4*01": "DIVMTQHLLSLPIPLGEPASISCRSSQSLLHS-DGNTYLDWYLQKPGQSPQLLIYTI-----SNKFYGVPNKFSGSR--SGTGFTLKFSKVEAEDVGVYCCEQGLQGP----",
588
+ "IGKV2-40*01": "DIVMTQTPLSLPVTPGEPASISCRSSQSLLDSDDGNTYLDWYLQKPGQSPQLLIYTL-----SYRASGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQRIEFP----",
589
+ "IGKV2/OR2-7D*01": "DILLTQTPLSLSITPGEPASISCRSSRSLLHS-NGNTYLHW-LQKPGQPPQCLICKV-----SNRFSGVPDRFSGSG--SGIDFTLKISPVEAADVGVYITACKLHTGP---",
590
+ "IGKV2/OR22-4*01": "DIVMTQTPLSLPVTPGEPASISCRSSESLLDTDDEYTYLNWYLQKPGQSPQLLIYEV-----SNRASGVPDRFSGSG--SGTDFTLKISRVEA-DVGVYYCMQALQTP----",
591
+ "IGKV2D-18*01": "DIVMTQTPPSLPVNPGEPASISCRSSQSLLHS-NGYTYLHWYPQKPGQSPQLLIYRV-----SSRFSGVPDRFSGSG--SGSDFTLKISWVEAEDVGVYYCMQATQFP----",
592
+ "IGKV2D-24*01": "DIVMTQTPLSSPVTLGQPASISFRSSQSLVHS-DGNTYLSWLQQRPGQPPRLLIYKV-----SNRFSGVPDRFSGSG--AGTDFTLKISRVEAEDVGVYYCTQATQFP----",
593
+ "IGKV2D-26*01": "EIVMTQTPLSLSITPGEQASISCRSSQSLLHS-DGYTYLYWFLQKARPVSTLLIYEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDFGVYYCMQDAQDP----",
594
+ "IGKV2D-26*02": "EIVMTQTPLSLSITPGEQASMSCRSSQSLLHS-DGYTYLYWFLQKARPVSTLLICEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDFGVYYCMQDAQDP----",
595
+ "IGKV2D-26*03": "EIVMTQTPLSLSITPGEQASMSCRSSQSLLHS-DGYTYLYWFLQKARPVSTLLIYEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDFGVYYCMQDAQDP----",
596
+ "IGKV2D-28*01": "DIVMTQSPLSLPVTPGEPASISCRSSQSLLHS-NGYNYLDWYLQKPGQSPQLLIYLG-----SNRASGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQALQTP----",
597
+ "IGKV2D-29*01": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQPPQLLIYEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQSIQLP----",
598
+ "IGKV2D-29*02": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQSPQLLIYEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQSIQLP----",
599
+ "IGKV2D-30*01": "DVVMTQSPLSLPVTLGQPASISCRSSQSLVYS-DGNTYLNWFQQRPGQSPRRLIYKV-----SNWDSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGTHWP----",
600
+ "IGKV2D-40*01": "DIVMTQTPLSLPVTPGEPASISCRSSQSLLDSDDGNTYLDWYLQKPGQSPQLLIYTL-----SYRASGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQRIEFP----",
601
+ "IGKV3-11*01": "EIVLTQSPATLSLSPGERATLSCRASQSV------SSYLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--SGTDFTLTISSLEPEDFAVYYCQQRSNWP----",
602
+ "IGKV3-11*02": "EIVLTQSPATLSLSPGERATLSCRASQSV------SSYLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--SGRDFTLTISSLEPEDFAVYYCQQRSNWP----",
603
+ "IGKV3-15*01": "EIVMTQSPATLSVSPGERATLSCRASQSV------SSNLAWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTEFTLTISSLQSEDFAVYYCQQYNNWP----",
604
+ "IGKV3-20*01": "EIVLTQSPGTLSLSPGERATLSCRASQSVS-----SSYLAWYQQKPGQAPRLLIYGA-----SSRATGIPDRFSGSG--SGTDFTLTISRLEPEDFAVYYCQQYGSSP----",
605
+ "IGKV3-7*01": "EIVMTQSPPTLSLSPGERVTLSCRASQSVS-----SSYLTWYQQKPGQAPRLLIYGA-----STRATSIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDHNLP----",
606
+ "IGKV3-7*02": "EIVMTQSPPTLSLSPGERVTLSCRASQSVS-----SSYLSWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
607
+ "IGKV3-7*03": "EIVMTQSPPTLSLSPGERVTLSCRASQSVS-----SSYLTWYQQKPGQAPRLLIYGA-----STRATSIPARFSGSG--SGRDFTLTISSLQPEDFAVYYCQQDHNLP----",
608
+ "IGKV3-7*04": "EIVMTQSPPTLSLSPGERVTLSCRASQSVS-----SSYLTWYQQKPGQAPRLLIYGA-----STRATSIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
609
+ "IGKV3/OR2-268*01": "EIVMTQSPATLSLSPGERATLSCRASQSVS-----SSYLSWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
610
+ "IGKV3/OR2-268*02": "EIVMTQSPATLSLSPGERATLSCRASQSVS-----SSYLSWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
611
+ "IGKV3D-11*01": "EIVLTQSPATLSLSPGERATLSCRASQGV------SSYLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--PGTDFTLTISSLEPEDFAVYYCQQRSNWH----",
612
+ "IGKV3D-11*02": "EIVLTQSPATLSLSPGERATLSCRASQSV------SSYLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--PGTDFTLTISSLEPEDFAVYYCQQRSNWH----",
613
+ "IGKV3D-11*03": "EIVLTQSPATLSLSPGERATLSCRASQGV------SSNLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--PGTDFTLTISSLEPEDFAVYYCQQRSNWH----",
614
+ "IGKV3D-15*01": "EIVMTQSPATLSVSPGERATLSCRASQSV------SSNLAWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTEFTLTISSLQSEDFAVYYCQQYNNWP----",
615
+ "IGKV3D-15*02": "EIVMMQSPATLSVSPGERATLSCRASQSV------SSNLAWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTEFTLTISSLQSEDFAVYYCQQYNN-P----",
616
+ "IGKV3D-15*03": "EIVMTQSPATLSVSPGERATLSCRASQSV------SSNLAWYQQKPGQAPRLLIYGA-----SIRATGIPARFSGSG--SGTEFTLTISILQSEDFAVYYCQQYNNWP----",
617
+ "IGKV3D-20*01": "EIVLTQSPATLSLSPGERATLSCGASQSVS-----SSYLAWYQQKPGLAPRLLIYDA-----SSRATGIPDRFSGSG--SGTDFTLTISRLEPEDFAVYYCQQYGSSP----",
618
+ "IGKV3D-20*02": "EIVLTQSPATLSLSPGERATLSCRASQSVS-----SSYLAWYQQKPGQAPRLLIYDA-----SSRATGIPDRFSGSG--SGTDFTLTISRLEPEDFAVYYCQQRSNWH----",
619
+ "IGKV3D-7*01": "EIVMTQSPATLSLSPGERATLSCRASQSVS-----SSYLSWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
620
+ "IGKV4-1*01": "DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWA-----STRESGVPDRFSGSG--SGTDFTLTISSLQAEDVAVYYCQQYYSTP----",
621
+ "IGKV5-2*01": "ETTLTQSPAFMSATPGDKVNISCKASQDI------DDDMNWYQQKPGEAAIFIIQEA-----TTLVPGIPPRFSGSG--YGTDFTLTINNIESEDAAYYFCLQHDNFP----",
622
+ "IGKV6-21*01": "EIVLTQSPDFQSVTPKEKVTITCRASQSI------GSSLHWYQQKPDQSPKLLIKYA-----SQSFSGVPSRFSGSG--SGTDFTLTINSLEAEDAATYYCHQSSSLP----",
623
+ "IGKV6-21*02": "EIVLTQSPDFQSVTPKEKVTITCRASQSI------GSSLHWYQQKPDQSPKLLIKYA-----SQSISGVPSRFSGSG--SGTDFTLTINSLEAEDAATYYCHQSSSLP----",
624
+ "IGKV6D-21*01": "EIVLTQSPDFQSVTPKEKVTITCRASQSI------GSSLHWYQQKPDQSPKLLIKYA-----SQSFSGVPSRFSGSG--SGTDFTLTINSLEAEDAATYYCHQSSSLP----",
625
+ "IGKV6D-21*02": "EIVLTQSPDFQSVTPKEKVTITCRASQSI------GSSLHWYQQKPDQSPKLLIKYA-----SQSISGVPSRFSGSG--SGTDFTLTINSLEAEDAAAYYCHQSSSLP----",
626
+ "IGKV6D-41*01": "DVVMTQSPAFLSVTPGEKVTITCQASEGI------GNYLYWYQQKPDQAPKLLIKYA-----SQSISGVPSRFSGSG--SGTDFTFTISSLEAEDAATYYCQQGNKHP----",
627
+ "IGKV7-3*01": "DIVLTQSPASLAVSPGQRATITCRASESVSF--LGINLIHWYQQKPGQPPKLLIYQA-----SNKDTGVPARFSGSG--SGTDFTLTINPVEANDTANYYCLQSKNFP----"
628
+ }
629
+ }
630
+ }
631
+
632
# J-gene germline alignment table (human, IMGT numbering, judging by the name).
# Top-level keys are chain type codes ('H', 'L', 'K'). Each entry holds:
#   "positions":         ordered position labels
#   "aligned_sequences": allele name -> string with one character per position
#                        ('-' presumably marks a gap -- confirm against the data source)
# Note that the 'L' and 'K' entries both use "L"-prefixed position labels.
HUMAN_IMGT_IG_J = {
    "H": {
        "positions": [
            "H112C", "H112B", "H112A", "H112", "H113",
            "H114", "H115", "H116", "H117", "H118",
            "H119", "H120", "H121", "H122", "H123",
            "H124", "H125", "H126", "H127", "H128",
        ],
        "aligned_sequences": {
            "IGHJ1*01": "---AEYFQHWGQGTLVTVSS",
            "IGHJ2*01": "---YWYFDLWGRGTLVTVSS",
            "IGHJ3*01": "----DAFDVWGQGTMVTVSS",
            "IGHJ3*02": "----DAFDIWGQGTMVTVSS",
            "IGHJ4*01": "-----YFDYWGQGTLVTVSS",
            "IGHJ4*02": "-----YFDYWGQGTLVTVSS",
            "IGHJ4*03": "-----YFDYWGQGTLVTVSS",
            "IGHJ5*01": "----NWFDSWGQGTLVTVSS",
            "IGHJ5*02": "----NWFDPWGQGTLVTVSS",
            "IGHJ6*01": "YYYYYGMDVWGQGTTVTVSS",
            "IGHJ6*04": "YYYYYGMDVWGKGTTVTVSS",
        },
    },
    "L": {
        "positions": [
            "L116", "L117", "L118", "L119", "L120", "L121",
            "L122", "L123", "L124", "L125", "L126", "L127",
        ],
        "aligned_sequences": {
            "IGLJ1*01": "YVFGTGTKVTVL",
            "IGLJ2*01": "VVFGGGTKLTVL",
            "IGLJ3*01": "VVFGGGTKLTVL",
            "IGLJ3*02": "WVFGGGTKLTVL",
            "IGLJ4*01": "FVFGGGTQLIIL",
            "IGLJ5*01": "WVFGEGTELTVL",
            "IGLJ5*02": "WVFGEGTELTVL",
            "IGLJ6*01": "NVFGSGTKVTVL",
            "IGLJ7*01": "AVFGGGTQLTVL",
            "IGLJ7*02": "AVFGGGTQLTAL",
        },
    },
    "K": {
        "positions": [
            "L116", "L117", "L118", "L119", "L120", "L121",
            "L122", "L123", "L124", "L125", "L126", "L127",
        ],
        "aligned_sequences": {
            "IGKJ1*01": "WTFGQGTKVEIK",
            "IGKJ2*01": "YTFGQGTKLEIK",
            "IGKJ2*02": "CTFGQGTKLEIK",
            "IGKJ2*03": "YSFGQGTKLEIK",
            "IGKJ2*04": "CSFGQGTKLEIK",
            "IGKJ3*01": "FTFGPGTKVDIK",
            "IGKJ4*01": "LTFGGGTKVEIK",
            "IGKJ4*02": "LTFGGGTKVEIK",
            "IGKJ5*01": "ITFGQGTRLEIK",
        },
    },
}
abnumber/position.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from typing import List, Union
3
+
4
+ from abnumber.common import _validate_chain_type, SCHEME_POSITION_TO_REGION, SCHEME_VERNIER, POS_REGEX
5
+
6
+
7
class Position:
    """Numbered position using a given numbering scheme

    Used as a key to store Position -> Amino acid information.

    Position objects are sortable according to the schema simply using ``sorted()``.
    Equality and hashing use the Heavy/Light prefix, number, insertion letter and scheme,
    so Kappa and Lambda positions with the same number compare equal.
    """
    def __init__(self, chain_type: str, number: int, letter: str, scheme: str):
        """Create a position.

        :param chain_type: chain type code, checked by ``_validate_chain_type``
        :param number: position number (converted with ``int()``)
        :param letter: insertion letter; surrounding whitespace is stripped, ``None`` is treated as no letter
        :param scheme: numbering scheme name; also used as the initial CDR definition
        """
        _validate_chain_type(chain_type)
        self.chain_type: str = chain_type
        self.number: int = int(number)
        # Tolerate a missing letter (None) instead of failing on None.strip()
        self.letter: str = (letter or '').strip()
        self.scheme: str = scheme
        # Until set_cdr_definition() is called, regions are looked up by the scheme's own numbering
        self.cdr_definition: str = self.scheme
        self.cdr_definition_position: int = self.number

    def copy(self):
        """Return a shallow copy of this position"""
        return copy.copy(self)

    def _key(self):
        # Note: We are not including chain_type, but just Heavy/Light flag, to keep Kappa and Lambda chain positions equal
        return self.chain_type_prefix(), self.number, self.letter, self.scheme

    def __repr__(self):
        return f'{self.chain_type_prefix()}{self.number}{self.letter} ({self.scheme})'

    def __str__(self):
        return self.format()

    def set_cdr_definition(self, cdr_definition: str, cdr_definition_position: int):
        """Set the CDR definition name and the position number used for region lookups

        :param cdr_definition: name of the CDR definition, required
        :param cdr_definition_position: position number under that definition, required
        """
        assert cdr_definition is not None, 'cdr_definition is required'
        assert cdr_definition_position is not None, 'cdr_definition_position is required'
        self.cdr_definition = cdr_definition
        self.cdr_definition_position = cdr_definition_position

    def format(self, chain_type=True, region=False, rjust=False, ljust=False, fillchar=' '):
        """Format Position to string

        :param chain_type: Add chain type prefix (H/L)
        :param region: Add region prefix (FR1, CDR1, ...)
        :param rjust: Align text to the right
        :param ljust: Align text to the left
        :param fillchar: Character to use for alignment padding
        :return: formatted string
        """
        formatted = f'{self.number}{self.letter}'
        if chain_type:
            formatted = f'{self.chain_type_prefix()}{formatted}'
        if region:
            formatted = f'{self.get_region()} {formatted}'
        # Padding width: 4 characters for number+letter, 1 more for the chain prefix, 5 more for the region prefix
        just = 4 + 1 * int(chain_type) + 5 * int(region)
        if rjust:
            formatted = formatted.rjust(just, fillchar)
        if ljust:
            formatted = formatted.ljust(just, fillchar)
        return formatted

    def __hash__(self):
        return self._key().__hash__()

    def __eq__(self, other):
        return isinstance(other, Position) and self._key() == other._key()

    def __ge__(self, other):
        return self == other or self > other

    def __le__(self, other):
        return self == other or self < other

    def __lt__(self, other):
        if not isinstance(other, Position):
            raise TypeError(f'Cannot compare Position object with {type(other)}: {other}')
        assert self.is_heavy_chain() == other.is_heavy_chain(), f'Positions do not come from the same chain: {self}, {other}'
        assert self.scheme == other.scheme, 'Comparing positions in different schemes is not implemented'
        return self._sort_key() < other._sort_key()

    def chain_type_prefix(self):
        """Return ``'H'`` for heavy chains and ``'L'`` for both Kappa and Lambda chains

        :raises NotImplementedError: for any other chain type
        """
        if self.chain_type == 'H':
            return 'H'
        if self.chain_type in ['K', 'L']:
            return 'L'
        raise NotImplementedError(f'Unknown chain type "{self.chain_type}"')

    def _sort_key(self):
        """Return the tuple used for ordering: (is heavy chain, number, signed letter ordinal)

        :raises NotImplementedError: for schemes other than imgt, chothia, kabat and aho
        """
        letter_ord = ord(self.letter) if self.letter else 0
        if self.scheme == 'imgt':
            if self.number in [33, 61, 112]:
                # insertion letters at these positions are sorted in reverse (e.g. 112B, 112A, 112)
                letter_ord = -letter_ord
        elif self.scheme in ['chothia', 'kabat', 'aho']:
            # all letters are sorted alphabetically for these schemes
            pass
        else:
            raise NotImplementedError(f'Cannot compare positions of scheme: {self.scheme}')
        return self.is_heavy_chain(), self.number, letter_ord

    def get_region(self):
        """Get string name of this position's region

        :return: uppercase string, one of: ``"FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4"``
        """
        # The lookup table is keyed either by the CDR definition alone or by "<definition>_<chain type>"
        if self.cdr_definition in SCHEME_POSITION_TO_REGION:
            regions = SCHEME_POSITION_TO_REGION[self.cdr_definition]
        else:
            regions = SCHEME_POSITION_TO_REGION[f'{self.cdr_definition}_{self.chain_type}']
        return regions[self.cdr_definition_position]

    def is_in_cdr(self):
        """Check if given position is found in the CDR regions"""
        return self.get_region().lower().startswith('cdr')

    def is_in_vernier(self):
        """Check if given position is listed in the Vernier zone table for its chain type

        :raises NotImplementedError: when the CDR definition is not ``kabat``
        """
        if self.cdr_definition != 'kabat':
            raise NotImplementedError('Vernier zone identification is currently supported '
                                      f'only with Kabat CDR definitions, got: {self.cdr_definition}')
        return self.cdr_definition_position in SCHEME_VERNIER.get(f'{self.cdr_definition}_{self.chain_type}', [])

    @classmethod
    def from_string(cls, position, chain_type, scheme):
        """Create Position object from string, e.g. "H5"

        Note that Positions parsed from string do not support separate CDR definitions.

        :raises IndexError: when the string does not match the expected format or carries
                            a chain prefix that disagrees with ``chain_type``
        """
        match = POS_REGEX.match(position.upper())
        _validate_chain_type(chain_type)
        expected_chain_prefix = 'H' if chain_type == 'H' else 'L'
        if match is None:
            raise IndexError(f'Expected position format chainNumberLetter '
                             f'(e.g. "{expected_chain_prefix}112A" or "112A"), got: "{position}"')
        chain_prefix, number, letter = match.groups()
        number = int(number)
        if chain_prefix and expected_chain_prefix != chain_prefix:
            raise IndexError(f'Use no prefix or "{expected_chain_prefix}" prefix for "{chain_type}" chain. '
                             f'Got: "{chain_prefix}".')
        return cls(chain_type=chain_type, number=number, letter=letter, scheme=scheme)

    def is_heavy_chain(self):
        """Check if this position belongs to a heavy chain"""
        return self.chain_type == 'H'

    def is_light_chain(self):
        """Check if this position belongs to a Kappa or Lambda light chain"""
        # Exact membership, consistent with chain_type_prefix(); a substring test
        # against 'KL' would also accept '' and 'KL'
        return self.chain_type in ('K', 'L')
148
+
149
+
150
def sort_positions(positions: List[str], chain_type: str, scheme: str) -> List:
    """Sort position strings to correct order based on given scheme

    The positions must either all start with an ``H``/``L`` chain prefix or all
    come without one; the returned strings keep the same convention.
    """
    prefix_flags = [p.startswith(('H', 'L')) for p in positions]
    prefixed = all(prefix_flags)
    assert prefixed or not any(prefix_flags), 'Inconsistent position prefix'

    # Parse into Position objects so that the scheme-aware ordering is applied
    parsed = [Position.from_string(p, chain_type=chain_type, scheme=scheme) for p in positions]
    parsed.sort()

    return [pos.format(chain_type=prefixed) for pos in parsed]
anarci/.DS_Store ADDED
Binary file (6.15 kB). View file
 
anarci/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ __version__ = "1.b"
2
+ __all__ = ["anarci", "schemes"]
3
+ from .anarci import *
anarci/anarci.py ADDED
@@ -0,0 +1,1013 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ANARCI - Antibody Numbering and Antigen Receptor ClassIfication
2
+ # Copyright (C) 2016 Oxford Protein Informatics Group (OPIG)
3
+ #
4
+ # This program is free software: you can redistribute it and/or modify
5
+ # it under the terms of the GNU General Public License as published by
6
+ # the Free Software Foundation, either version 3 of the License, or
7
+ # (at your option) any later version.
8
+ #
9
+ # This program is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ # GNU General Public License for more details.#
13
+ #
14
+ # You should have received a copy of the GNU General Public License
15
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
+
17
+ '''
18
+ ANARCI - Antigen Receptor Numbering And ClassIfication
19
+
20
+ Oxford Protein Informatics Group (OPIG). 2015-17
21
+
22
+ ANARCI performs alignments of sequences to databases of Hidden Markov Models (HMMs).
23
+ Those that align with a significant score are classified by species and chain type.
24
+ They are then numbered with a scheme of the user's choosing.
25
+
26
+ Currently implemented schemes:
27
+ IMGT
28
+ Chothia (IGs only)
29
+ Kabat (IGs only)
30
+ Martin / Enhanced Chothia (IGs only)
31
+ AHo
32
+ Wolfguy (IGs only)
33
+
34
+ Currently recognisable species (chains):
35
+ Human (heavy, kappa, lambda, alpha, beta)
36
+ Mouse (heavy, kappa, lambda, alpha, beta)
37
+ Rat (heavy, kappa, lambda)
38
+ Rabbit (heavy, kappa, lambda)
39
+ Pig (heavy, kappa, lambda)
40
+ Rhesus Monkey (heavy, kappa)
41
+
42
+ Notes:
43
+ o Use assign_germline to get a better species assignment
44
+ o Each scheme has been implemented to follow the published specification as closely as possible. However, in places some schemes
45
+ do not specify where insertions should be placed (e.g. imgt FW3). In these cases the HMM alignment is used. This can give rise
46
+ to inserted positions that were not described by the respective paper.
47
+ o AHo is implemented heuristically based on chain type. If one grafted a foreign CDR1 loop onto, say, a VH domain, it will be
48
+ numbered as if it is a CDRH1 loop.
49
+
50
+
51
+ '''
52
+
53
+ import os
54
+ import sys
55
+ import tempfile
56
+ import gzip
57
+ import math
58
+ from functools import partial
59
+ from textwrap import wrap
60
+ from subprocess import Popen, PIPE
61
+ from itertools import groupby, islice
62
+ from multiprocessing import Pool
63
+
64
+ from Bio.SearchIO.HmmerIO import Hmmer3TextParser as HMMERParser
65
+
66
+ # Import from the schemes submodule
67
+ from .schemes import *
68
+ from .germlines import all_germlines
69
+
70
# Species names, taken from the keys of the heavy-chain V germline table
all_species = list(all_germlines['V']['H'])

# The 20 accepted one-letter amino acid codes, as a sorted list and as a set for membership tests
amino_acids = sorted("QWERTYIPASDFGHKLCVNM")
set_amino_acids = set(amino_acids)

# Directory containing this module
anarci_path = os.path.dirname(__file__)

# Accepted scheme names and their one-letter abbreviations, mapped to the full scheme name
scheme_short_to_long = {
    "m": "martin",
    "c": "chothia",
    "k": "kabat",
    "imgt": "imgt",
    "kabat": "kabat",
    "chothia": "chothia",
    "martin": "martin",
    "i": "imgt",
    "a": "aho",
    "aho": "aho",
    "wolfguy": "wolfguy",
    "w": "wolfguy",
}

scheme_names = list(scheme_short_to_long)

# Chain type code -> chain class; kappa and lambda share the light class "L"
chain_type_to_class = {"H": "H", "K": "L", "L": "L", "A": "A", "B": "B", "G": "G", "D": "D"}

HMM_path = os.path.join(anarci_path, "dat", "HMMs")

# These are the IMGT reference states (matches), numbered 1 to 128
all_reference_states = list(range(1, 129))
84
+
85
class HMMscanError(Exception):
    """Exception carrying a single message; raised by run_hmmer with hmmscan's stderr output."""

    def __init__(self, message):
        # Forward the message unchanged to Exception
        super().__init__(message)
89
+
90
+ ## Utility functions ##
91
def read_fasta(filename):
    """
    Read a sequence file and return its records as a list of (description, sequence) tuples.
    """
    return list(fasta_iter(filename))
96
+
97
def fasta_iter(fasta_name):
    """
    Given a fasta file, yield tuples of header, sequence
    https://www.biostars.org/p/710/

    Files whose name ends with '.gz' are read through gzip in text mode, so both
    plain and compressed inputs yield str headers and sequences.
    A header that is not followed by any sequence line yields an empty sequence.
    The file is closed once the generator is exhausted or closed.

    @param fasta_name: path of the fasta file
    """
    if fasta_name.endswith('.gz'):  # OSError raised upon iteration if not a real gzip file.
        # 'rt' is required: gzip.open defaults to binary mode, which would yield bytes lines
        fh = gzip.open(fasta_name, 'rt')
    else:
        fh = open(fasta_name)
    with fh:
        # Alternating groups of consecutive header lines and consecutive sequence lines
        faiter = (group for _, group in groupby(fh, lambda line: line.startswith(">")))
        for header_group in faiter:
            # Only the first line of a run of header lines is used; drop the leading '>'
            header = next(header_group)[1:].strip()
            # Default to no lines when the file ends right after a header
            seq = "".join(s.strip() for s in next(faiter, ()))
            yield header, seq
112
+
113
+
114
def write_fasta(sequences, f):
    """
    Write sequences to an open file in fasta format.

    Each record is written as a '>name' line followed by the sequence, with
    every line of the sequence wrapped at 80 characters.

    @param sequences: a list of (name, sequence) tuples
    @param f: an open, writable file object
    """
    for name, sequence in sequences:
        header = ">%s"%name
        print(header, file=f)
        wrapped_blocks = ('\n'.join(wrap(block, width=80)) for block in sequence.splitlines())
        body = '\n'.join(wrapped_blocks)
        print(body, file=f)
125
+
126
+
127
def validate_sequence(sequence):
    """
    Check that a sequence is shorter than 10000 characters and, ignoring case,
    consists only of the letters in set_amino_acids.

    @return: True when the sequence passes; an AssertionError is raised otherwise.
    """
    assert len(sequence) < 10000, "Sequence too long."
    unknown_letters = set( sequence.upper() ) - set_amino_acids
    assert not unknown_letters, "Unknown amino acid letter found in sequence: %s"% ", ".join(list(unknown_letters))
    return True
134
+
135
def validate_numbering(xxx_todo_changeme, name_seq=[]):
    """
    Wrapper to do some basic validation of the numbering.

    Checks that the numbering indices never decrease and that the numbered
    residues (gaps removed) form a contiguous segment of the sequence (gaps removed).

    @param xxx_todo_changeme: a (numbering, start, end) tuple
    @param name_seq: a (name, sequence) pair; the name is only used in error messages

    @return: the numbering, start and end that were passed in, unchanged
    """
    numbering, start, end = xxx_todo_changeme
    name, seq = name_seq

    previous_index = -1
    residues = []
    for (index, _), residue in numbering:
        assert index >= previous_index, "Numbering was found to decrease along the sequence %s. Please report."%name
        previous_index = index
        residues.append(residue.replace("-", ""))
    nseq = "".join(residues)

    assert nseq in seq.replace("-",""), "The algorithm did not number a contiguous segment for sequence %s. Please report"%name

    return numbering, start, end
154
+
155
def grouper(n, iterable):
    '''
    Group entries of an iterable into lists of n items.

    Returns an iterator of lists; the last list may be shorter than n.
    '''
    source = iter(iterable)
    # Keep slicing off the next n items until an empty list signals exhaustion
    return iter(lambda: list(islice(source, n)), [])
164
+
165
def anarci_output(numbered, sequences, alignment_details, outfile, sequence_id=None, domain_id=None):
    """
    Write the numbering of each sequence to an open file.

    If sequence_id is specified as an integer then only this sequence will be printed.
    Otherwise all sequences will be printed.

    If domain_id is specified as an integer then only this domain will be printed.
    Otherwise all domains will be printed.

    If domain_id is specified then sequence_id must also be specified.

    Note: the "evalue" entry of each printed domain in alignment_details is
    replaced in place by its string form.
    """
    assert (sequence_id is not None) or (sequence_id is None and domain_id is None), "If domain_id is specified, sequence_id must also be specified."
    print_all_sequences = sequence_id is None
    for i, domains in enumerate(numbered):
        if print_all_sequences:
            print("# %s"%sequences[i][0], file=outfile) # print the name
        if domains is not None:
            if not print_all_sequences and i != sequence_id:
                continue
            print("# ANARCI numbered", file=outfile)
            domain_count = len(domains)
            for j in range(domain_count): # Iterate over domains
                if domain_id is not None and j != domain_id:
                    continue
                print("# Domain %d of %d"%(j+1, domain_count), file=outfile)
                print("# Most significant HMM hit", file=outfile)
                print("#|species|chain_type|e-value|score|seqstart_index|seqend_index|", file=outfile)
                hit = alignment_details[i][j]
                hit["evalue"] = str( hit["evalue"] )
                domain = domains[j]
                print("#|%s|%s|%s|%.1f|%d|%d|"%(hit["species"], hit["chain_type"], hit["evalue"], hit["bitscore"],
                                                 domain[1], domain[2]), file=outfile)

                if 'germlines' in hit:
                    germlines = hit['germlines']
                    print('# Most sequence-identical germlines', file=outfile)
                    print('#|species|v_gene|v_identity|j_gene|j_identity|', file=outfile)
                    (species, vgene), vid = germlines.get('v_gene', [['','unknown'],0])
                    if vgene is None:
                        vgene, vid = 'unknown', 0
                    (_, jgene), jid = germlines.get('j_gene', [['','unknown'],0])
                    if jgene is None:
                        jgene, jid = 'unknown', 0
                    print('#|%s|%s|%.2f|%s|%.2f|'%(species, vgene, vid, jgene, jid ), file=outfile)
                chain_type = chain_type_to_class[ hit["chain_type"] ]
                print("# Scheme = %s"%hit["scheme"], file=outfile)
                if len( domain[0] ) == 0:
                    print("# Warning: %s scheme could not be applied to this sequence."%hit["scheme"], file=outfile)
                # One line per numbered residue: chain class, index, insertion code, amino acid
                for (index, insertion), aa in domain[0]:
                    print(chain_type, ("%d"%index).ljust(5), insertion, aa, file=outfile)
        print("//", file=outfile)
213
+
214
def csv_output(sequences, numbered, details, outfileroot):
    '''
    Write numbered sequences to csv files. A csv file is written for each chain type.

    Kappa and Lambda chains are written to the same file

    The sequences will be written aligned to the numbering scheme. Gaps in the sequences with respect to the alignment are written
    as a '-'

    @param sequences: List of name, sequence tuples
    @param numbered: Numbered sequences in the same order as the sequences list.
    @param details: List of alignment details in the same order as the sequences list.
    @param outfileroot: The file path for csv files to write. _<chain_type>.csv will be appended to this.
    '''
    light_alias = {'K':'KL','L':'KL'}
    chain_types = {}
    pos_ranks = {}
    all_pos = {}

    # Divide the set into chain types and find how to order the numbering for each type.
    for i in range( len(sequences) ): # Iterate over entries
        domains = numbered[i]
        if domains is None:
            continue

        for j, domain in enumerate(domains): # Iterate over domains.
            # Record the chain type index
            raw_chain_type = details[i][j]['chain_type']
            c = light_alias.get(raw_chain_type, raw_chain_type) # Consider lambda and kappa together.
            chain_types.setdefault( c, [] ).append( (i,j) )
            ranks = pos_ranks.setdefault( c, {} )
            seen = all_pos.setdefault( c, set() )

            # Update the insertion order for the scheme. i.e. is it A B C or C B A (e.g. imgt 111 and 112 respectively)
            previous_number, rank = -1, 0
            for p, _ in domain[0]:
                if p[0] == previous_number:
                    rank += 1
                else:
                    previous_number, rank = p[0], 0
                ranks[p] = max( rank, ranks.get( p, rank ) )
                seen.add( p )

    # Write a new file for each chain type. Kappa and lambda are written together as light chains.
    no_gene = [['',''],0]
    for cts in ('H','KL','A','B','G','D'):
        if cts not in chain_types:
            continue
        ranks = pos_ranks[cts]
        with open( outfileroot + '_%s.csv'%cts, 'w' ) as out:

            # Sort the positions by index and insertion order
            positions = sorted( all_pos[cts], key = lambda p: (p[0], ranks[p]) )

            # Header line
            fields = ['Id','domain_no','hmm_species','chain_type','e-value','score','seqstart_index','seqend_index',
                      'identity_species','v_gene','v_identity','j_gene','j_identity']
            fields.extend( ('%d%s'%(p)).strip() for p in positions )
            print(','.join( fields ), file=out)

            # Iterate over the domains identified
            for i, j in chain_types[cts]:
                entry = details[i][j]
                germlines = entry.get('germlines',{})
                v_gene = germlines.get( 'v_gene', no_gene )
                j_gene = germlines.get( 'j_gene', no_gene )
                line = [ sequences[i][0].replace(',',' '),
                         str(j),
                         entry.get('species',''),
                         entry.get('chain_type',''),
                         str(entry.get('evalue','')),
                         str(entry.get('bitscore','')),
                         str(numbered[i][j][1]),
                         str(numbered[i][j][2]),
                         v_gene[0][0],
                         v_gene[0][1],
                         '%.2f'%v_gene[1],
                         j_gene[0][1],
                         '%.2f'%j_gene[1] ]

                # Hash the numbering. Insertion order has been preserved in the positions sort.
                residue_at = dict( numbered[i][j][0] )
                line.extend( residue_at.get(p,'-') for p in positions )

                assert len( line ) == len( fields )
                print(','.join( line ), file=out)
296
+
297
+
298
+
299
+ ## Parsing and recognising domain hits from hmmscan ##
300
+ def _domains_are_same(dom1, dom2):
301
+ """
302
+ Check to see if the domains are overlapping.
303
+ @param dom1:
304
+ @param dom2:
305
+
306
+ @return: True or False
307
+ """
308
+ dom1, dom2 = sorted( [dom1, dom2], key=lambda x: x.query_start )
309
+ if dom2.query_start >= dom1.query_end:
310
+ return False
311
+ return True
312
+
313
+
314
def _parse_hmmer_query(query, bit_score_threshold=80, hmmer_species=None):
    """
    Pick the best hit for each domain found in one hmmer query.

    @param query: hmmer query object from Biopython
    @param bit_score_threshold: the threshold for which to consider a hit a hit.
    @param hmmer_species: optional collection of species prefixes; when given, hits whose id starts with
                          one of them are preferred, falling back to all hits if none reach the threshold.

    The function will identify multiple domains if they have been found and provide the details for the best alignment for each domain.
    This allows the ability to identify single chain fvs and engineered antibody sequences as well as the capability in the future for identifying constant domains.

    @return: a (hit_table, state_vectors, top_descriptions) tuple. hit_table starts with a header row followed by
             one row per hit over the threshold; state_vectors and top_descriptions have one entry per domain,
             ordered by where the domain starts in the sequence.
    """
    columns = ['id', 'description', 'evalue', 'bitscore', 'bias',
               'query_start', 'query_end']
    hit_table = [ columns ]

    # Find the best hit for each domain in the sequence.
    top_descriptions, domains, state_vectors = [], [], []

    if query.hsps: # We have some hits
        # If we have specified a species, check to see we have hits for that species
        # Otherwise revert back to using any species
        if hmmer_species:
            # A hit is added once for every species prefix it matches
            hit_correct_species = [ hsp
                                    for hsp in query.hsps
                                    if hsp.bitscore >= bit_score_threshold
                                    for species in hmmer_species
                                    if hsp.hit_id.startswith(species) ]
            if hit_correct_species:
                hsp_list = hit_correct_species
            else:
                print("Limiting hmmer search to species %s was requested but hits did not achieve a high enough bitscore. Reverting to using any species" %(hmmer_species))
                hsp_list = query.hsps
        else:
            hsp_list = query.hsps

        # Iterate over the matches of the domains in order of their e-value (most significant first)
        for hsp in sorted(hsp_list, key=lambda x: x.evalue):
            if not hsp.bitscore >= bit_score_threshold: # Only look at those with hits that are over the threshold bit-score.
                continue
            # Check to see if we already have seen the domain
            new = not any( _domains_are_same( known, hsp ) for known in domains )
            row = [ hsp.hit_id, hsp.hit_description, hsp.evalue, hsp.bitscore, hsp.bias, hsp.query_start, hsp.query_end ]
            hit_table.append( row )
            if new: # It is a new domain and this is the best hit. Add it for further processing.
                domains.append( hsp )
                top_descriptions.append( dict( zip(columns, row) ) )

        # Reorder the domains according to the order they appear in the sequence.
        ordering = sorted( range(len(domains)), key=lambda idx: domains[idx].query_start )
        domains = [ domains[idx] for idx in ordering ]
        top_descriptions = [ top_descriptions[idx] for idx in ordering ]

    ndomains = len( domains )
    # If any significant hits were identified parse and align them to the reference state.
    for i, (domain, description) in enumerate( zip(domains, top_descriptions) ):
        domain.order = i
        species, chain = description["id"].split("_")
        state_vectors.append( _hmm_alignment_to_states(domain, ndomains, query.seq_len) ) # Alignment to the reference states.
        description[ "species"] = species # Reparse
        description[ "chain_type"] = chain
        description[ "query_start"] = state_vectors[-1][0][-1] # Make sure the query_start agree if it was changed

    return hit_table, state_vectors, top_descriptions
378
+
379
+
380
def _hmm_alignment_to_states(hsp, n, seq_length):
    """
    Take a hit hsp and turn the alignment into a state vector with sequence indices

    @param hsp: the hit; its "RF" and "PP" alignment annotations, hit/query start and end, hit_id and
                order attribute are read
    @param n: the number of domains identified in the query
    @param seq_length: the length of the query sequence

    @return: a list of ((hmm_state, state_type), sequence_index) entries, where state_type is "m" (match),
             "i" (insert) or "d" (delete) and sequence_index is None for delete states
    """
    # Extract the strings for the reference states and the posterior probability strings
    reference_string = hsp.aln_annotation["RF"]
    state_string = hsp.aln_annotation["PP"]

    assert len(reference_string) == len(state_string), "Aligned reference and state strings had different lengths. Don't know how to handle"

    # Extract the start and end points of the hmm states and the sequence
    # These are python indices i.e list[ start:end ] and therefore start will be one less than in the text file
    hmm_start, hmm_end = hsp.hit_start, hsp.hit_end
    seq_start, seq_end = hsp.query_start, hsp.query_end

    # Extract the full length of the HMM hit
    species, ctype = hsp.hit_id.split('_')
    hmm_length = get_hmm_length( species, ctype )

    # Handle cases where there are n terminal modifications.
    # In most cases the user is going to want these included in the numbered domain even though they are not 'antibody like' and
    # not matched to the germline. Only allow up to a maximum of 5 unmatched states at the start of the domain
    # Adds a bug here if there is a very short linker between a scfv domains with a modified n-term second domain
    # Thus this is only done for the first identified domain ( hence order attribute on hsp )
    if hsp.order == 0 and hmm_start and hmm_start < 5:
        pad = hmm_start
        if hmm_start > seq_start:
            pad = min( seq_start, hmm_start - seq_start )
        state_string = '8'*pad + state_string
        reference_string = 'x'*pad + reference_string
        seq_start -= pad
        hmm_start -= pad

    # Handle cases where the alignment should be extended to the end of the j-element
    # This occurs when there a c-terminal modifications of the variable domain that are significantly different to germline
    # Extension is only made when half of framework 4 has been recognised and there is only one domain recognised.
    if n == 1 and seq_end < seq_length and (123 < hmm_end < hmm_length): # Extend forwards
        pad = min( hmm_length - hmm_end, seq_length - seq_end )
        state_string += '8'*pad
        reference_string += 'x'*pad
        seq_end += pad
        hmm_end += pad

    # Generate lists for the states and the sequence indices that are included in this alignment
    hmm_states = all_reference_states[ hmm_start : hmm_end ]
    sequence_indices = list(range(seq_start, seq_end))
    h = s = 0 # initialise the current index in the hmm and the sequence

    state_vector = []
    # Walk the reference and state strings in lockstep (they have equal length)
    for reference_char, state_char in zip(reference_string, state_string):
        if state_char == ".": # deleted relative to reference: delete state, no sequence residue
            state_type, sequence_index = "d", None
        else:
            state_type = "m" if reference_char == "x" else "i" # match state or insert state
            sequence_index = sequence_indices[s]

        # Store the alignment as the state identifier (uncorrected IMGT annotation) and the index of the sequence
        state_vector.append( ((hmm_states[h], state_type), sequence_index ) )

        # Updates to the indices: match and delete consume an hmm state, match and insert consume a residue
        if state_type != "i":
            h += 1
        if state_type != "d":
            s += 1

    return state_vector
461
+
462
+
463
def parse_hmmer_output(filedescriptor="", bit_score_threshold=80, hmmer_species=None):
    """
    Parse the output of HMMscan and return top alignment and the score table for each input sequence.

    @param filedescriptor: either a path to the hmmscan output file (str or os.PathLike) or an open
                           integer file descriptor, which is closed once parsing is finished
    @param bit_score_threshold: passed on to _parse_hmmer_query
    @param hmmer_species: passed on to _parse_hmmer_query

    @return: a list with the result of _parse_hmmer_query for each query in the file
    @raise TypeError: if filedescriptor is neither a path nor an integer file descriptor
    """
    if isinstance(filedescriptor, int) and not isinstance(filedescriptor, bool):
        openfile = os.fdopen
    elif isinstance(filedescriptor, (str, os.PathLike)):
        openfile = open
    else:
        # Fail with a clear message instead of an unbound-variable error further down
        raise TypeError("filedescriptor must be a file path or an integer file descriptor, got %s"
                        % type(filedescriptor).__name__)

    with openfile(filedescriptor) as inputfile:
        results = [ _parse_hmmer_query(query, bit_score_threshold=bit_score_threshold, hmmer_species=hmmer_species)
                    for query in HMMERParser( inputfile ) ]

    return results
479
+
480
+
481
def run_hmmer(sequence_list,hmm_database="ALL",hmmerpath="", ncpu=None, bit_score_threshold=80, hmmer_species=None):
    """
    Run the sequences in sequence list against a precompiled hmm_database.

    Those sequence that have a significant hit with a bit score over a threshold will
    be recognised and an alignment given. The alignment will be used to number the
    sequence.

    @param sequence_list: a list of (name, sequence) tuples. Both are strings
    @param hmm_database: The hmm database to use. Currently, all hmms are in the ALL database.
                         The code to develop new models is in build_pipeline in the git repo.
    @param hmmerpath: The path to hmmer binaries if not in the path
    @param ncpu: The number of cpu's to allow hmmer to use.
    @param bit_score_threshold: passed on to parse_hmmer_output
    @param hmmer_species: passed on to parse_hmmer_output

    @return: the parsed hmmscan results, as returned by parse_hmmer_output
    @raise HMMscanError: if hmmscan writes anything to stderr
    """

    # Check that hmm_database is available
    assert hmm_database in ["ALL"], "Unknown HMM database %s"%hmm_database
    HMM = os.path.join( HMM_path, "%s.hmm"%hmm_database )

    hmmscan = os.path.join(hmmerpath, "hmmscan") if hmmerpath else "hmmscan"

    # Create a fasta file for all the sequences. Label them with their sequence index
    # This will go to a temp file
    fasta_filehandle, fasta_filename = tempfile.mkstemp( ".fasta", text=True )
    output_filename = None
    try:
        with os.fdopen(fasta_filehandle, 'w') as outfile:
            write_fasta(sequence_list, outfile)

        # hmmscan writes to the output file by name (-o), so the descriptor returned by mkstemp is
        # closed straight away. This way it cannot leak if hmmscan fails to start or reports an error.
        output_filehandle, output_filename = tempfile.mkstemp( ".txt", text=True )
        os.close(output_filehandle)

        # Run hmmer as a subprocess
        command = [ hmmscan, "-o", output_filename ]
        if ncpu is not None:
            command += [ "--cpu", str(ncpu) ]
        command += [ HMM, fasta_filename ]

        process = Popen( command, stdout=PIPE, stderr=PIPE )
        _, pr_stderr = process.communicate()

        if pr_stderr:
            raise HMMscanError(pr_stderr)
        results = parse_hmmer_output(output_filename, bit_score_threshold=bit_score_threshold, hmmer_species=hmmer_species)

    finally:
        # clear up
        os.remove(fasta_filename)
        if output_filename is not None:
            os.remove(output_filename)

    return results
536
+
537
def get_hmm_length(species, ctype):
    '''
    Return the length of the hmm for a given species and chain type.

    The length is measured on the first J germline sequence stored for the
    species/chain type, after stripping trailing '-' characters. It tells us
    how many non-insertion positions there could possibly be in a domain
    (127 or 128 positions under imgt).

    @param species: Species key into all_germlines['J'][ctype]
    @param ctype: Chain type key into all_germlines['J']

    @return: The hmm length, or 128 when no J germlines are stored for the species/chain type.
    '''
    try:
        j_germlines = all_germlines['J'][ctype][species]
    except KeyError:
        # Unknown chain type or species: fall back to the full imgt length.
        return 128
    first_germline = list(j_germlines.values())[0]
    return len(first_germline.rstrip('-'))
546
+
547
+
548
def number_sequence_from_alignment(state_vector, sequence, scheme="imgt", chain_type=None):
    """
    Given you have an alignment. Give back the numbering

    @param state_vector: List of states from the hmm. Effectively these are imgt columns but CDR3 has not been redone.
    @param sequence: The original sequence string or list.
    @param scheme: The numbering scheme to apply (case-insensitive). One of imgt, chothia, kabat, martin, aho, wolfguy.
    @param chain_type: The type of chain to apply numbering for. Some schemes do not require this (IMGT). Others (e.g. Chothia/Wolfguy) do.

    @return: A list of numbering identifier / amino acids tuples over the domain that has been numbered. The indices of the start (inclusive) and end point (exclusive) in the sequence for the numbering

    @raise AssertionError: If the scheme is unknown or is not implemented for the given chain type
                           (including when chain_type is None for a scheme that requires it).
    """
    scheme = scheme.lower()

    # Exact membership test. The previous `chain_type in "KL"` was a substring test:
    # it raised TypeError for chain_type=None and accepted "" and "KL" as light chains.
    is_heavy = chain_type == "H"
    is_light = chain_type in ("K", "L")

    if scheme == "imgt":
        return number_imgt(state_vector, sequence)
    elif scheme == "aho":
        # requires the chain type to heuristically put the CDR1 gap in position.
        return number_aho(state_vector, sequence, chain_type)
    elif scheme == "chothia":
        if is_heavy:
            return number_chothia_heavy(state_vector, sequence)
        elif is_light:
            return number_chothia_light(state_vector, sequence)
    elif scheme == "kabat":
        if is_heavy:
            return number_kabat_heavy(state_vector, sequence)
        elif is_light:
            return number_kabat_light(state_vector, sequence)
    elif scheme == "martin":
        if is_heavy:
            return number_martin_heavy(state_vector, sequence)
        elif is_light:
            return number_martin_light(state_vector, sequence)
    elif scheme == "wolfguy":
        if is_heavy:
            return number_wolfguy_heavy(state_vector, sequence)
        elif is_light:
            return number_wolfguy_light(state_vector, sequence)

    # Every unsupported scheme / chain type combination ends up here.
    raise AssertionError("Unimplemented numbering scheme %s for chain %s"%( scheme, chain_type))
594
+
595
def number_sequences_from_alignment(sequences, alignments, scheme="imgt", allow=set(["H","K","L","A","B","G","D"]),
                                    assign_germline=False, allowed_species=None):
    '''
    Apply a numbering scheme to a list of sequences using their alignments from run_hmmer.

    @param sequences: A list of (name, sequence) entries.
    @param alignments: For each sequence a (hit_table, state_vectors, details) triple, where state_vectors
                       and details hold one entry per recognised domain.
    @param scheme: The numbering scheme to apply.
    @param allow: The chain types that should be numbered. Domains of other chain types are not numbered.
    @param assign_germline: If True, store the result of run_germline_assignment under details["germlines"].
    @param allowed_species: Passed through to run_germline_assignment.

    @return: Three lists in input order: numbered, alignment_details and hit_tables. The first two hold None
             for a sequence in which no domain was numbered.
    '''
    numbered, alignment_details, hit_tables = [], [], []

    for seq_index, entry in enumerate(sequences):
        # There may be several domains per sequence (e.g. single chain fvs).
        hit_table, state_vectors, all_details = alignments[seq_index]

        # Typically only one domain is recognised with the current hmms available.
        domain_numberings, domain_details = [], []
        for domain_index, state_vector in enumerate(state_vectors):
            details = all_details[domain_index]
            details["scheme"] = scheme
            details["query_name"] = entry[0]

            # Only number things that are allowed. The details dict is still annotated above.
            if not state_vector or details["chain_type"] not in allow:
                continue

            try:
                # Do the numbering and validate (for development purposes)
                numbering = number_sequence_from_alignment(state_vector, entry[1],
                                                           scheme=scheme, chain_type=details["chain_type"])
                domain_numberings.append( validate_numbering(numbering, entry) )
                if assign_germline:
                    details["germlines"] = run_germline_assignment( state_vector, entry[1],
                                                                    details["chain_type"], allowed_species=allowed_species)
                domain_details.append( details )
            except AssertionError as e: # Handled errors are implemented as assertions.
                print(str(e), file=sys.stderr)
                raise e # Validation went wrong. Kept fatal, as during development.
            except Exception as e:
                print("Error: Something really went wrong that has not been handled", file=sys.stderr)
                print(str(e), file=sys.stderr)
                raise e

        if domain_numberings:
            numbered.append( domain_numberings )
            alignment_details.append( domain_details )
        else:
            numbered.append( None )
            alignment_details.append( None )
        hit_tables.append( hit_table )

    return numbered, alignment_details, hit_tables
645
+
646
def get_identity(state_sequence, germline_sequence):
    """
    Get the partially matched sequence identity between two aligned sequences.
    Partial in the sense that gaps can be in the state_sequence.

    Positions where the germline has a gap ("-") are ignored. The state
    sequence is compared case-insensitively against the germline residue.

    @param state_sequence: The query residues over the 128 match states.
    @param germline_sequence: The germline residues over the 128 match states.

    @return: The fraction of compared positions that are identical, or 0 if no position could be compared.
    """
    # Ensure that the sequences are the expected length
    assert len( state_sequence) == len(germline_sequence ) == 128

    compared = 0
    identical = 0
    for query_aa, germline_aa in zip(state_sequence, germline_sequence):
        if germline_aa == "-":
            continue
        compared += 1
        if query_aa.upper() == germline_aa:
            identical += 1

    if compared:
        return float(identical) / compared
    return 0
662
+
663
+
664
def run_germline_assignment(state_vector, sequence, chain_type, allowed_species=None ):
    """
    Find the closest sequence identity match.

    @param state_vector: List of ((state_id, state_type), sequence_index) pairs from the alignment.
    @param sequence: The sequence that was aligned.
    @param chain_type: The chain type whose germlines should be searched.
    @param allowed_species: Optional collection of species to restrict the search to. When None all_species is used.

    @return: A dictionary {'v_gene': [(species, gene), identity], 'j_gene': [(species, gene), identity]}.
             Entries stay [None, None] when no germline could be compared. An empty dictionary is
             returned when one of the allowed_species has no V germlines for this chain type.
    """
    genes={'v_gene': [None,None],
           'j_gene': [None,None],
          }

    # Extract the positions that correspond to match (germline) states.
    state_dict = dict( ((i, 'm'),None) for i in range(1,129))
    state_dict.update(dict(state_vector))
    state_sequence = "".join([ sequence[state_dict[(i, 'm')]] if state_dict[(i,'m')] is not None else "-" for i in range(1,129) ])

    if chain_type not in all_germlines["V"]:
        return genes
    v_germlines = all_germlines["V"][ chain_type ]

    if allowed_species is not None:
        if not all( sp in v_germlines for sp in allowed_species ): # Made non-fatal
            return {}
    else:
        allowed_species = all_species

    # Iterate over the v-germline sequences of the chain type of interest.
    # The maximum sequence identity is used to assign the germline
    seq_ids = {}
    for species in allowed_species:
        if species not in v_germlines:
            continue
        for gene, germline_sequence in v_germlines[ species ].items():
            seq_ids[ (species, gene) ] = get_identity( state_sequence , germline_sequence )

    # max() raises ValueError on an empty collection (e.g. allowed_species == []),
    # so report "nothing assigned" instead of crashing.
    if not seq_ids:
        return genes
    genes['v_gene' ][0] = max( seq_ids, key=seq_ids.get )
    genes['v_gene' ][1] = seq_ids[ genes['v_gene' ][0] ]

    # Use the assigned species for the v-gene for the j-gene.
    # This assumption may affect exotically engineered abs but in general is fair.
    species = genes['v_gene' ][0][0]
    if chain_type in all_germlines["J"]:
        if species in all_germlines["J"][chain_type]:
            seq_ids = {}
            for gene, germline_sequence in all_germlines["J"][ chain_type ][ species ].items():
                seq_ids[ (species, gene) ] = get_identity( state_sequence , germline_sequence )
            if seq_ids: # Same empty-collection guard as for the v-gene.
                genes['j_gene' ][0] = max( seq_ids, key=seq_ids.get )
                genes['j_gene' ][1] = seq_ids[ genes['j_gene' ][0] ]

    return genes
706
+
707
def check_for_j( sequences, alignments, scheme ):
    '''
    As the length of CDR3 gets long (over 30ish) an alignment that does not include the J region becomes more favourable.
    This leads to really long CDR3s not being numberable.

    To overcome this problem, when no J region is detected we try without the v region.

    The alignments are modified in-place; nothing is returned.

    @param sequences: A list of (name, sequence) entries.
    @param alignments: The alignments from run_hmmer, in the same order as sequences.
    @param scheme: Not used by this function.
    '''
    for i in range( len( sequences ) ):
        # Only do for single domain chains.
        if len(alignments[i][1]) != 1:
            continue

        ali = alignments[i][1][0]

        # Find the last match position.
        last_state = ali[-1][0][0]
        last_si = ali[-1][1]

        # A J region has already been identified.
        if not last_state < 120:
            continue
        # Not a considerable amount of sequence left, so no reason to suspect a long CDR3.
        if not last_si + 30 < len( sequences[i][1] ):
            continue

        # Find the position of the conserved cysteine (imgt 104).
        cys_si = dict( ali ).get( (104,'m'), None )
        if cys_si is None:
            continue

        # Find the corresponding index in the alignment.
        cys_ai = ali.index( ((104, 'm'), cys_si) )

        # Try to identify a J region in the remaining sequence after the 104. A low bit score threshold is used.
        remainder = sequences[i][1][cys_si+1:]
        _, re_states, re_details = run_hmmer( [(sequences[i][0], remainder)], bit_score_threshold=10 )[0]

        # Check if a J region was detected in the remaining sequence.
        if not (re_states and re_states[0][-1][0][0] >= 126 and re_states[0][0][0][0] <= 117):
            continue

        # Sandwich the presumed CDR3 region between the V and J regions.
        v_region = ali[:cys_ai+1]
        j_region = [ (state, index+cys_si+1) for state, index in re_states[0] if state[0] >= 117 ]

        # Residues between the cysteine and the J region take match states 105..115;
        # anything beyond that is an insertion on state 116.
        cdr_region = [
            ( ((105 + offset, 'm') if 105 + offset < 116 else (116, 'i')), si )
            for offset, si in enumerate( range( cys_si+1, j_region[0][1] ) )
        ]

        # Update the alignment entry.
        alignments[i][1][0] = v_region + cdr_region + j_region
        alignments[i][2][0]['query_end'] = j_region[-1][1] + 1
757
+
758
+
759
+
760
+ ##################################
761
+ # High level numbering functions #
762
+ ##################################
763
+
764
+ # Main function for ANARCI
765
+ # Name conflict with function, module and package is kept for legacy unless issues are reported in future.
766
def anarci(sequences, scheme="imgt", database="ALL", output=False, outfile=None, csv=False, allow=set(["H","K","L","A","B","G","D"]),
           hmmerpath="", ncpu=None, assign_germline=False, allowed_species=None, bit_score_threshold=80):
    """
    The main function for anarci. Identify antibody and TCR domains, number them and annotate their germline and species.

    It is advised to use one of the wrapper functions:
        o run_anarci   - fasta file or sequence list in. Automated multiprocessing for large jobs. Sequences, numbering, details
                         and hit tables out.
        o number       - single sequence in, numbering out


    @param sequences: A list or tuple of (Id, Sequence) pairs
                              e.g. [ ("seq1","EVQLQQSGAEVVRSG ..."),
                                     ("seq2","DIVMTQSQKFMSTSV ...") ]
    @param scheme:    The numbering scheme that should be applied. Choose from imgt, chothia, kabat or martin
                      (number_sequence_from_alignment also implements aho and wolfguy).
    @param output:    Boolean flag to say whether the result should be output.
    @param outfile:   The name of the file to output to. If output is True and outfile is None then output is printed
                      to stdout.
    @param csv:       Boolean flag to say whether the csv output alignment format or the vertical anarci format should be used.
                      An outfile is required when csv is True.
    @param allow:     A set containing the chain types that should be recognised. If chothia, kabat or martin is used
                      as the scheme, anarci will ignore tcr chains. Choose a subset of ["H","K","L","A","B","G","D"]
    @param assign_germline: Using highest sequence identity assign the germline to the chain. Can be more accurate at identifying
                      species than the best HMM hit alone. (Bool)
    @param allowed_species: If assign_germline is true, limit the species that can be assigned to a limited set. Useful when the
                      animal species is known or when performing closest germline experiments. Choose a subset of ['human',
                      'mouse','rat','rabbit','rhesus','pig','alpaca'].


    @param bit_score_threshold: The threshold score from HMMER at which an alignment should be numbered. Lowering the threshold
                      means domain recognition is more permissive and can be useful for numbering heavily engineered molecules.
                      However, too low and false positive recognition of other ig-like molecules will occur.
    @param hmmerpath: The path to hmmscan. If left unspecified then the PATH will be searched.
    @param ncpu:      The number of cpu's that hmmer should be allowed to use. If not specified then the hmmscan
                      default is used. N.B. hmmscan must be compiled with multithreading enabled for this option to have effect.
                      Please consider using the run_anarci function for native multiprocessing with anarci.
    @param database:  The HMMER database that should be used. Normally not changed unless a custom db is created.


    @return: Three lists. Numbered, Alignment_details and Hit_tables.
             Each list is in the same order as the input sequences list.
             A description of each entry in the three lists is as followed.
               o Numbered: will be None if no domain was found for that sequence or a list of domains with their
                           numbering, start and finish indices.
               o Alignment_details: will be None if no domain was found for that sequence or a dictionary for each
                           domain identified containing the details of the alignment (chain type, e-value, species etc).
               o Hit_tables: None if no domain was found for that sequence or a nested list for each domain containing
                           the hit table from hmmscan.

    @raise AssertionError: If the scheme is not recognised, or if csv output is requested without a usable outfile.
    """

    # Validate the input scheme
    try:
        scheme = scheme_short_to_long[scheme.lower()]
    except KeyError:
        raise AssertionError("Unrecognised or unimplemented scheme: %s"%scheme)

    # Check we have arguments for output before doing work.
    if csv:
        assert outfile, 'If csv output is True then an outfile must be specified'
        _path, _ = os.path.split(outfile)
        assert (not _path) or os.path.exists(_path), 'Output directory %s does not exist'%_path

    # Perform the alignments of the sequences to the hmm database
    alignments = run_hmmer(sequences, hmm_database=database, hmmerpath=hmmerpath, ncpu=ncpu,
                           bit_score_threshold=bit_score_threshold, hmmer_species=allowed_species )

    # Check the numbering for likely very long CDR3s that will have been missed by the first pass.
    # Modify alignments in-place
    check_for_j( sequences, alignments, scheme )

    # Apply the desired numbering scheme to all sequences
    numbered, alignment_details, hit_tables = number_sequences_from_alignment(sequences, alignments, scheme=scheme, allow=allow,
                                                                              assign_germline=assign_germline,
                                                                              allowed_species=allowed_species)

    # Output if necessary
    if output:
        if csv:
            # Previously passed the undefined name `details`, which raised NameError.
            csv_output(sequences, numbered, alignment_details, outfile)
        elif outfile:
            # The context manager closes the file even if writing fails.
            with open(outfile, 'w') as outto:
                anarci_output(numbered, sequences, alignment_details, outto)
        else:
            anarci_output(numbered, sequences, alignment_details, sys.stdout)

    return numbered, alignment_details, hit_tables
855
+
856
+ # Wrapper to run anarci using multiple processes and automate fasta file reading.
857
def run_anarci( seq, ncpu=1, **kwargs):
    '''
    Run the anarci numbering protocol for single or multiple sequences.

    @param seq:       The input. One of:
                        o A list or tuple of (Id, Sequence) pairs
                              e.g. [ ("seq1","EVQLQQSGAEVVRSG ..."),
                                     ("seq2","DIVMTQSQKFMSTSV ...") ]
                        o The path of a fasta file
                        o A single sequence string
    @param ncpu:      The number of worker processes to split the sequences over. Values below 1 are treated as 1 and
                      a single sequence string always uses 1. hmmscan itself is always run with ncpu=1 here.
    @param scheme:    The numbering scheme that should be applied. Choose from imgt, chothia, kabat or martin
    @param output:    Boolean flag to say whether the result should be output.
    @param outfile:   The name of the file to output to. If output is True and outfile is None then output is printed
                      to stdout.
    @param csv:       Boolean flag to say whether the csv output format should be used. Requires an outfile.
    @param allow:     A set containing the chain types that should be recognised. If chothia, kabat or martin is used
                      as the scheme, anarci will ignore tcr chains. Choose a subset of ["H","K","L","A","B","G","D"]
    @param assign_germline: Using highest sequence identity assign the germline to the chain. Can be more accurate at identifying
                      species than the best HMM hit alone. (Bool)
    @param allowed_species: If assign_germline is true, limit the species that can be assigned to a limited set. Useful when the
                      animal species is known or when performing closest germline experiments. Choose a subset of ['human',
                      'mouse','rat','rabbit','rhesus','pig','alpaca'].

    @param bit_score_threshold: The threshold score from HMMER at which an alignment should be numbered. Lowering the threshold
                      means domain recognition is more permissive and can be useful for numbering heavily engineered molecules.
                      However, too low and false positive recognition of other ig-like molecules will occur.
    @param hmmerpath: The path to hmmscan. If left unspecified then the PATH will be searched.
    @param database:  The HMMER database that should be used. Normally not changed unless a custom db is created.

    @return: Four lists. Sequences, Numbered, Alignment_details and Hit_tables.
             Each list is in the same order.
             A description of each entry in the four lists is as followed.
               o Sequences: The list of sequences formatted as [(Id,sequence), ...].
               o Numbered: will be None if no domain was found for that sequence or a list of domains with their
                           numbering, start and finish indices.
               o Alignment_details: will be None if no domain was found for that sequence or a dictionary for each
                           domain identified containing the details of the alignment (chain type, e-value, species etc).
               o Hit_tables: None if no domain was found for that sequence or a nested list for each domain containing
                           the hit table from hmmscan.

    @raise AssertionError: If the input format is not supported or csv output is requested without a usable outfile.
    '''
    # Parse the input sequence or fasta file.
    if isinstance(seq, (list, tuple)): # A list (or tuple) of (name,sequence) sequences
        assert all( len(_) == 2 for _ in seq ), "If list or tuple supplied as input format must be [ ('ID1','seq1'), ('ID2', 'seq2'), ... ]"
        sequences = seq
        ncpu = int(max(1, ncpu )) # Previously only sanitised for fasta input; ncpu=0 divided by zero below.
    elif isinstance(seq, (str, bytes, os.PathLike)) and os.path.isfile( seq ): # Fasta file.
        # Read the sequences. All are read into memory currently...
        sequences = read_fasta( seq )
        ncpu = int(max(1, ncpu ))
    elif isinstance(seq, str): # Single sequence
        validate_sequence( seq )
        ncpu=1
        sequences = [ ["Input sequence", seq ]]
    else:
        # Previously fell through and failed later with an unbound `sequences`.
        raise AssertionError("Input must be a list or tuple of (ID, sequence) pairs, a fasta file path or a sequence string")

    # Handle the arguments to anarci.
    output = kwargs.get('output', False )
    outfile = kwargs.get('outfile', False )
    csv = kwargs.get( 'csv', False )
    if csv: # Check output arguments before doing work.
        assert outfile, 'If csv output is True then an outfile must be specified'
        _path, _ = os.path.split(outfile)
        assert (not _path) or os.path.exists(_path), 'Output directory %s does not exist'%_path

    kwargs['ncpu'] = 1 # Set hmmscan ncpu to 1. HMMER has to be compiled appropriately for this to have an effect.
    kwargs['output'] = False # Overide and write the compiled results here.

    anarci_partial = partial( anarci, **kwargs )
    chunksize = math.ceil( float( len(sequences) )/ncpu )

    # Run the anarci function using a pool of workers. Using the map_async to get over the KeyboardInterrupt bug in python2.7
    if ncpu > 1:
        pool = Pool( ncpu )
        try:
            results = pool.map_async( anarci_partial, grouper( chunksize, sequences ) ).get()
        finally:
            pool.close() # Also stop accepting work when a worker raised.
    else:
        results = list(map( anarci_partial, grouper( chunksize, sequences ) ))

    # Reformat the results to flat lists (linear time, unlike repeated list addition).
    numbered = [ entry for chunk in results for entry in chunk[0] ]
    alignment_details = [ entry for chunk in results for entry in chunk[1] ]
    hit_tables = [ entry for chunk in results for entry in chunk[2] ]

    # Output if necessary
    if output:
        if csv:
            csv_output(sequences, numbered, alignment_details, outfile)
        elif outfile:
            # The context manager closes the file even if writing fails.
            with open(outfile, 'w') as outto:
                anarci_output(numbered, sequences, alignment_details, outto)
        else:
            anarci_output(numbered, sequences, alignment_details, sys.stdout)

    # Return the results
    return sequences, numbered, alignment_details, hit_tables
952
+
953
+
954
+
955
+ # Wrapper function for simple sequence in numbering and chain type out behaviour.
956
def number(sequence, scheme="imgt", database="ALL", allow=set(["H","K","L","A","B","G","D"])):
    """
    Given a sequence string, use anarci to number it using the scheme of choice.
    Only the first domain will be recognised and numbered

    For multiple sequences it is advised to use run_anarci instead of iterative use of this function.

    @param sequence: An amino acid sequence string
    @param scheme: The numbering scheme that should be applied. Choose from imgt, chothia, kabat or martin
    @param database: The HMMER database that should be used. Normally not changed unless a custom db is created.
    @param allow: A set containing the chain types that should be recognised. If chothia, kabat or martin is used
                  as the scheme, anarci will ignore tcr chains.

    @return: If the sequence can be numbered, a list containing the numbering and sequence; and the chain type.
             Otherwise both are False. Sequences shorter than 70 residues are never numbered.

    @raise AssertionError: If the scheme is not recognised.
    """

    try:
        validate_sequence( sequence )
        scheme = scheme_short_to_long[scheme.lower()]
    except KeyError:
        # Message made consistent with anarci() (was "Unrecognised to unimplemented").
        raise AssertionError("Unrecognised or unimplemented scheme: %s"%scheme)

    if len(sequence) < 70: # Length check. ANARCI can number fragments of chains well. Encourage full domain numbering.
        return False, False

    try:
        numbered, alignment_details, _ = anarci( [("sequence_0", sequence)], scheme=scheme, database=database, output=False, allow=allow )
    except AssertionError: # Catch where the user has tried to number a TCR with an antibody scheme
        return False, False

    # We return the numbering list and the chain type where kappa and lambda chains are both "L" for light
    if numbered[0]:
        return numbered[0][0][0], chain_type_to_class[alignment_details[0][0]["chain_type"]]
    else:
        return False, False
994
+
995
if __name__ == "__main__":
    # Test and example usage of the anarci function.
    # Inputs: a heavy chain, a light chain, a single chain fv (two domains) and a non-antibody protein.
    sequences = [ ("12e8:H","EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLEWIGWIDPEIGDTEYVPKFQGKATMTADTSSNTAYLQLSSLTSEDTAVYYCNAGHDYDRGRFPYWGQGTLVTVSAAKTTPPSVYPLAP"),
                  ("12e8:L","DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKLMIYSASNRYTGVPDRFTGSGSGTDFTLTISNMQSEDLADYFCQQYSSYPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASV"),
                  ("scfv:A","DIQMTQSPSSLSASVGDRVTITCRTSGNIHNYLTWYQQKPGKAPQLLIYNAKTLADGVPSRFSGSGSGTQFTLTISSLQPEDFANYYCQHFWSLPFTFGQGTKVEIKRTGGGGSGGGGSGGGGSGGGGSEVQLVESGGGLVQPGGSLRLSCAASGFDFSRYDMSWVRQAPGKRLEWVAYISSGGGSTYFPDTVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARQNKKLTWFDYWGQGTLVTVSSHHHHHH"),
                  ("lysozyme:A","KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL")]

    results = anarci(sequences, scheme="imgt", output=True)
    numbering, alignment_details, hit_tables = results

    # One numbering entry per input sequence, in input order.
    expect_one_VH_domain_numbering, expect_one_VL_domain_numbering, expect_VH_then_VL_numbering, expect_None = numbering
    assert len(expect_one_VH_domain_numbering) == 1
    assert len(expect_one_VL_domain_numbering) == 1
    assert len(expect_VH_then_VL_numbering) == 2
    assert expect_None is None # No domain is expected for the non-antibody sequence.
1010
+
1011
+
1012
+
1013
+
anarci/dat/.DS_Store ADDED
Binary file (6.15 kB). View file
 
anarci/dat/HMMs/ALL.hmm ADDED
The diff for this file is too large to render. See raw diff
 
anarci/dat/HMMs/ALL.hmm.h3f ADDED
Binary file (449 kB). View file
 
anarci/dat/HMMs/ALL.hmm.h3i ADDED
Binary file (1.12 kB). View file
 
anarci/dat/HMMs/ALL.hmm.h3m ADDED
Binary file (729 kB). View file
 
anarci/dat/HMMs/ALL.hmm.h3p ADDED
Binary file (843 kB). View file
 
anarci/germlines.py ADDED
The diff for this file is too large to render. See raw diff
 
anarci/schemes.py ADDED
@@ -0,0 +1,1691 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ANARCI - Antibody Numbering and Antigen Receptor ClassIfication
2
+ # Copyright (C) 2016 Oxford Protein Informatics Group (OPIG)
3
+ #
4
+ # This program is free software: you can redistribute it and/or modify
5
+ # it under the terms of the GNU General Public License as published by
6
+ # the Free Software Foundation, either version 3 of the License, or
7
+ # (at your option) any later version.
8
+ #
9
+ # This program is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ # GNU General Public License for more details.#
13
+ #
14
+ # You should have received a copy of the GNU General Public License
15
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
+
17
+ '''
18
+ Module containing functions to convert hmm alignment to a numbering scheme.
19
+
20
+ Currently implemented
21
+
22
+ For IG's
23
+ IMGT
24
+ Chothia
25
+ Kabat
26
+ Martin (Extended Chothia)
27
+ Aho
28
+ Wolfguy
29
+
30
+ For TR's
31
+ IMGT
32
+ (Aho)
33
+
34
+ ---------------------------------------------------------------------------------------------------------------------
35
+ Functions are written to a template:
36
+
37
+ There are 128 match states in the HMMs (these are the IMGT states). The alignment to these states must be converted to
38
+ correspond to the scheme of choice.
39
+
40
+ We define:
41
+ - a state string consisting of 'X' and 'I' where:
42
+ X means that for the state there is an equivalent position in the numbering scheme.
43
+ I means that for the state there is not an equivalent position in the numbering scheme. It should therefore be
44
+ considered as an insertion in the scheme.
45
+
46
+ - a region string consisting of characters (integers in the currently implemented schemes). Each character
47
+ corresponds to a contiguous region. Therefore each state can be assigned a region according to the scheme.
48
+
49
+ - a mapping between region characters and region indices as a dictionary. e.g. the first region character maps
50
+ to 0, second to 1 ...
51
+
52
+ - a dictionary containing the difference between state number (imgt) and scheme number at the *beginning* of
53
+ each region using the region indices as keys and the difference as values.
54
+
55
+ - the number of regions defined
56
+
57
+ - a list for which delete states should not be included in the numbering (typically those for the cdrs). This
58
+ will allow the length of the region to be the number of residues found instead of the number of possible states plus
59
+ insertions.
60
+
61
+
62
+ This all goes into the _number_regions function along with the sequence and the state_vector (the alignment from the
63
+ HMM).
64
+
65
+ _number_regions will then divide the aligned part of the sequence into as many regions as defined above. Within each
66
+ region it will give a numbering according to the input parameters. A list of lists will be returned containing the
67
+ numbered sequence for each region.
68
+
69
+ Some of the regions will not be numbered correctly according to the scheme. For example the insertions for the CDRs
70
+ will not necessarily be on the correct residue. For each different scheme these regions are then modified (see code
71
+ for implementation)
72
+
73
+ Finally the full numbered sequence is compiled and returned to the calling function.
74
+ ---------------------------------------------------------------------------------------------------------------------
75
+
76
+ Other schemes can be implemented following the template above.
77
+
78
+
79
+ '''
80
+
81
# Alphabet used for insertion codes: the single letters A-Z, then the doubled
# letters AA-ZZ, and last (index -1) a blank space meaning "no insertion".
alphabet = (
    list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    + [letter * 2 for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
    + [" "]
)
83
+
84
+ # Blosum62 matrix. Used in some annotation methods to recognise pre-defined motifs
85
+ blosum62 = {('B', 'N'): 3, ('W', 'L'): -2, ('G', 'G'): 6, ('X', 'S'): 0, ('X', 'D'): -1, ('K', 'G'): -2, ('S', 'E'): 0, ('X', 'M'): -1, ('Y', 'E'): -2, ('W', 'R'): -3, ('I', 'R'): -3, ('X', 'Z'): -1, ('H', 'E'): 0, ('V', 'M'): 1, ('N', 'R'): 0, ('I', 'D'): -3, ('F', 'D'): -3, ('W', 'C'): -2, ('N', 'A'): -2, ('W', 'Q'): -2, ('L', 'Q'): -2, ('S', 'N'): 1, ('Z', 'K'): 1, ('V', 'N'): -3, ('Q', 'N'): 0, ('M', 'K'): -1, ('V', 'H'): -3, ('G', 'E'): -2, ('S', 'L'): -2, ('P', 'R'): -2, ('D', 'A'): -2, ('S', 'C'): -1, ('E', 'D'): 2, ('Y', 'G'): -3, ('W', 'P'): -4, ('X', 'X'): -1, ('Z', 'L'): -3, ('Q', 'A'): -1, ('V', 'Y'): -1, ('W', 'A'): -3, ('G', 'D'): -1, ('X', 'P'): -2, ('K', 'D'): -1, ('T', 'N'): 0, ('Y', 'F'): 3, ('W', 'W'): 11, ('Z', 'M'): -1, ('L', 'D'): -4, ('M', 'R'): -1, ('Y', 'K'): -2, ('F', 'E'): -3, ('M', 'E'): -2, ('S', 'S'): 4, ('X', 'C'): -2, ('Y', 'L'): -1, ('H', 'R'): 0, ('P', 'P'): 7, ('K', 'C'): -3, ('S', 'A'): 1, ('P', 'I'): -3, ('Q', 'Q'): 5, ('L', 'I'): 2, ('P', 'F'): -4, ('B', 'A'): -2, ('Z', 'N'): 0, ('M', 'Q'): 0, ('V', 'I'): 3, ('Q', 'C'): -3, ('I', 'H'): -3, ('Z', 'D'): 1, ('Z', 'P'): -1, ('Y', 'W'): 2, ('T', 'G'): -2, ('B', 'P'): -2, ('P', 'A'): -1, ('C', 'D'): -3, ('Y', 'H'): 2, ('X', 'V'): -1, ('B', 'B'): 4, ('Z', 'F'): -3, ('M', 'L'): 2, ('F', 'G'): -3, ('S', 'M'): -1, ('M', 'G'): -3, ('Z', 'Q'): 3, ('S', 'Q'): 0, ('X', 'A'): 0, ('V', 'T'): 0, ('W', 'F'): 1, ('S', 'H'): -1, ('X', 'N'): -1, ('B', 'Q'): 0, ('K', 'A'): -1, ('I', 'Q'): -3, ('X', 'W'): -2, ('N', 'N'): 6, ('W', 'T'): -2, ('P', 'D'): -1, ('B', 'C'): -3, ('I', 'C'): -1, ('V', 'K'): -2, ('X', 'Y'): -1, ('K', 'R'): 2, ('Z', 'R'): 0, ('W', 'E'): -3, ('T', 'E'): -1, ('B', 'R'): -1, ('L', 'R'): -2, ('Q', 'R'): 1, ('X', 'F'): -1, ('T', 'S'): 1, ('B', 'D'): 4, ('Z', 'A'): -1, ('M', 'N'): -2, ('V', 'D'): -3, ('F', 'A'): -2, ('X', 'E'): -1, ('F', 'H'): -1, ('M', 'A'): -1, ('K', 'Q'): 1, ('Z', 'S'): 0, ('X', 'G'): -1, ('V', 'V'): 4, ('W', 'D'): -4, ('X', 'H'): -1, ('S', 'F'): -2, ('X', 'L'): 
-1, ('B', 'S'): 0, ('S', 'G'): 0, ('P', 'M'): -2, ('Y', 'M'): -1, ('H', 'D'): -1, ('B', 'E'): 1, ('Z', 'B'): 1, ('I', 'E'): -3, ('V', 'E'): -2, ('X', 'T'): 0, ('X', 'R'): -1, ('R', 'R'): 5, ('Z', 'T'): -1, ('Y', 'D'): -3, ('V', 'W'): -3, ('F', 'L'): 0, ('T', 'C'): -1, ('X', 'Q'): -1, ('B', 'T'): -1, ('K', 'N'): 0, ('T', 'H'): -2, ('Y', 'I'): -1, ('F', 'Q'): -3, ('T', 'I'): -1, ('T', 'Q'): -1, ('P', 'L'): -3, ('R', 'A'): -1, ('B', 'F'): -3, ('Z', 'C'): -3, ('M', 'H'): -2, ('V', 'F'): -1, ('F', 'C'): -2, ('L', 'L'): 4, ('M', 'C'): -1, ('C', 'R'): -3, ('D', 'D'): 6, ('E', 'R'): 0, ('V', 'P'): -2, ('S', 'D'): 0, ('E', 'E'): 5, ('W', 'G'): -2, ('P', 'C'): -3, ('F', 'R'): -3, ('B', 'G'): -1, ('C', 'C'): 9, ('I', 'G'): -4, ('V', 'G'): -3, ('W', 'K'): -3, ('G', 'N'): 0, ('I', 'N'): -3, ('Z', 'V'): -2, ('A', 'A'): 4, ('V', 'Q'): -2, ('F', 'K'): -3, ('T', 'A'): 0, ('B', 'V'): -3, ('K', 'L'): -2, ('L', 'N'): -3, ('Y', 'N'): -2, ('F', 'F'): 6, ('L', 'G'): -4, ('B', 'H'): 0, ('Z', 'E'): 4, ('Q', 'D'): 0, ('X', 'B'): -1, ('Z', 'W'): -3, ('S', 'K'): 0, ('X', 'K'): -1, ('V', 'R'): -3, ('K', 'E'): 1, ('I', 'A'): -1, ('P', 'H'): -2, ('B', 'W'): -4, ('K', 'K'): 5, ('H', 'C'): -3, ('E', 'N'): 0, ('Y', 'Q'): -1, ('H', 'H'): 8, ('B', 'I'): -3, ('C', 'A'): 0, ('I', 'I'): 4, ('V', 'A'): 0, ('W', 'I'): -3, ('T', 'F'): -2, ('V', 'S'): -2, ('T', 'T'): 5, ('F', 'M'): 0, ('L', 'E'): -3, ('M', 'M'): 5, ('Z', 'G'): -2, ('D', 'R'): -2, ('M', 'D'): -3, ('W', 'H'): -2, ('G', 'C'): -3, ('S', 'R'): -1, ('S', 'I'): -2, ('P', 'Q'): -1, ('Y', 'A'): -2, ('X', 'I'): -1, ('E', 'A'): -1, ('B', 'Y'): -3, ('K', 'I'): -3, ('H', 'A'): -2, ('P', 'G'): -2, ('F', 'N'): -3, ('H', 'N'): 1, ('B', 'K'): 0, ('V', 'C'): -1, ('T', 'L'): -1, ('P', 'K'): -1, ('W', 'S'): -3, ('T', 'D'): -1, ('T', 'M'): -1, ('P', 'N'): -2, ('K', 'H'): -1, ('T', 'R'): -1, ('Y', 'R'): -2, ('L', 'C'): -1, ('B', 'L'): -4, ('Z', 'Y'): -2, ('W', 'N'): -4, ('G', 'A'): 0, ('S', 'P'): -1, ('E', 'Q'): 2, ('C', 'N'): -3, ('H', 'Q'): 0, ('D', 'N'): 1, 
('Y', 'C'): -2, ('L', 'H'): -3, ('E', 'C'): -4, ('Z', 'H'): 0, ('H', 'G'): -2, ('P', 'E'): -1, ('Y', 'S'): -2, ('G', 'R'): -2, ('B', 'M'): -3, ('Z', 'Z'): 4, ('W', 'M'): -1, ('Y', 'T'): -2, ('Y', 'P'): -3, ('Y', 'Y'): 7, ('T', 'K'): -1, ('Z', 'I'): -3, ('T', 'P'): -1, ('V', 'L'): 1, ('F', 'I'): 0, ('G', 'Q'): -2, ('L', 'A'): -1, ('M', 'I'): 1}
86
+
87
+
88
def _smooth_buffer(state_buffer, reg, enforced_patterns):
    """Return the buffered states re-annotated so insertions sit where the scheme expects.

    @param state_buffer: List of ((state_id, state_type), sequence_index) entries collected for one window.
    @param reg: Window identifier. -1 is the N-terminal part of FW1 (states < 23); 0-5 index enforced_patterns.
    @param enforced_patterns: The per-window target patterns defined in smooth_insertions.

    @return: A list with one entry per residue to emit. The input buffer itself is returned when no
             adjustment is made.
    """
    # Count the insertion states in the buffer; with none there is nothing to correct.
    nins = sum(1 for s in state_buffer if s[0][1] == 'i')
    if nins == 0:
        return state_buffer

    if reg == -1:
        # FW1: only adjust if there are the same or more N terminal deletions than insertions.
        nt_dels = state_buffer[0][0][0] - 1  # States missing before the first buffered state
        for (_id, _type), _si in state_buffer:  # Explicit leading deletion states
            if _type == 'd' or _si is None:
                nt_dels += 1
            else:  # First residue found
                break
        if nt_dels < nins:
            # The insertions may be incorrect but it is unknown what to do. Let the alignment place them.
            return state_buffer

        # Preserve the deleted states structure by using the same match annotations
        new_states = [s for s, _ in state_buffer if s[1] == 'm']
        first_match = new_states[0][0]

        # Remove the deletions so that only residue positions are included
        residues = [s for s in state_buffer if s[0][1] != 'd']

        # Extend N terminal states backwards from the first match state
        n_add = len(residues) - len(new_states)
        assert n_add >= 0, 'Implementation logic error'  # Should be adding a positive number of positions
        new_states = [(n, 'm') for n in range(first_match - n_add, first_match)] + new_states
        assert len(new_states) == len(residues), 'Implementation logic error'  # Should have the same length
    else:
        # Remove any deletions in the buffer. Unlikely to happen but do anyway
        residues = [s for s in state_buffer if s[0][1] != 'd']
        pattern = enforced_patterns[reg]
        surplus = max(0, len(residues) - 3)

        # Define the new states from the enforced pattern and the length of the buffer
        if reg % 2:  # nterm fw: surplus residues are placed on the leading pattern entry
            new_states = [pattern[0]] * surplus + pattern[max(4 - len(residues), 1):]
        else:  # cterm fw: surplus residues repeat the third pattern entry
            new_states = pattern[:3] + [pattern[2]] * surplus

    # Assign the new states preserving the order of the sequence.
    return [(state, entry[1]) for state, entry in zip(new_states, residues)]


def smooth_insertions(state_vector):
    '''
    The function aims to correct to the expected imgt alignment. Renumbering functions then translate from the
    imgt scheme to the appropriate scheme.

    Handle insertions made by HMMER that we suspect may be in the wrong position.
    Edge cases include:
        - Insertions at the C terminal of fw1, fw2 and fw3 regions. Can occur when 'conserved' residues have been
          mutated and the same amino acid appears in the following CDR (e.g. mutate cysteine at 104 but the CDR3
          has one or more cysteines)
        - Same as above possible (but not observed in structure seqs) for N terminal of fw2, fw3 and fw4... TODO
        - Heavily mutated N terminal regions that are partially recognised (e.g. 3gk8 chain H). Insertions should
          not be allowed before N terminal deletions have been used. Preserve deletion locations that are not
          N terminal (e.g. 10 in IMGT H) if the gap has been placed by the alignment.

    @param state_vector: Iterable of ((state_id, state_type), sequence_index) entries, where state_type is
                         'm', 'i' or 'd' and sequence_index may be None.

    @return: A new list of entries in the same format. States that are still buffered when the input ends
             (the vector stops inside one of the smoothing windows) are appended unchanged so that no residue
             is lost.
    '''
    # Small overhead doing these corrections but worth it for reducing edge cases.

    # Enforce insertion patterns as below. The CDRs are renumbered in each case so that insertions are placed
    # according to the scheme.
    # Any insertions at the end and beginning of framework regions are moved into the CDR region for renumbering.
    enforced_patterns = [[(25, 'm'), (26, 'm'), (27, 'm'), (28, 'i')],
                         [(38, 'i'), (38, 'm'), (39, 'm'), (40, 'm')],
                         [(54, 'm'), (55, 'm'), (56, 'm'), (57, 'i')],
                         [(65, 'i'), (65, 'm'), (66, 'm'), (67, 'm')],
                         [(103, 'm'), (104, 'm'), (105, 'm'), (106, 'i')],
                         [(117, 'i'), (117, 'm'), (118, 'm'), (119, 'm')]]

    state_buffer = []
    sv = []
    reg = None
    for (state_id, state_type), si in state_vector:
        entry = ((state_id, state_type), si)

        # Work out which smoothing window (if any) this state belongs to.
        if state_id < 23:  # Everything before the cysteine at 23.
            window = -1
        elif 25 <= state_id < 28:
            window = 0
        elif 37 < state_id <= 40:
            window = 1
        elif 54 <= state_id < 57:
            window = 2
        elif 64 < state_id <= 67:
            window = 3
        elif 103 <= state_id < 106:
            window = 4
        elif 116 < state_id <= 119:
            window = 5
        else:
            window = None

        if window is not None:  # Add to the buffer
            state_buffer.append(entry)
            reg = window
            continue

        if state_buffer:  # Leaving a window: emit the (possibly corrected) buffer and reset it
            sv.extend(_smooth_buffer(state_buffer, reg, enforced_patterns))
            state_buffer = []

        # Add the current state
        sv.append(entry)

    # The vector ended inside a window. There is no following state to anchor an enforced pattern on, so
    # keep the alignment as given instead of silently dropping these states.
    sv.extend(state_buffer)

    return sv
206
+
207
+
208
+ # General function to give annotations for regions that have direct mappings onto the hmm alignment (imgt states)
209
def _number_regions(sequence, state_vector, state_string , region_string, region_index_dict, rels, n_regions, exclude_deletions):
    """
    Number an aligned sequence and split the numbered residues into the regions of a scheme.

    @param sequence: The sequence string
    @param state_vector: The list of ((state_id, state_type), sequence_index) states from the aligned hmm.
                         It is passed through smooth_insertions before being used.
    @param state_string: One character per imgt state: 'X' when the state exists in the scheme, 'I' when it has
                         to be treated as an insertion.
    @param region_string: One character per imgt state naming the region that state belongs to in this scheme.
    @param region_index_dict: Maps the characters of region_string onto region indices.
    @param rels: Offset between imgt state number and scheme number at the *start* of each region, keyed by
                 region index. N.B. this dictionary is updated in place while numbering.
    @param n_regions: The number of regions
    @param exclude_deletions: Region indices in which the insertion counter may wrap around instead of failing
                              (typically the CDRs, which the scheme function re-annotates afterwards).

    @return: (regions, start_index, end_index). regions is a list of n_regions lists of
             ((number, insertion_code), residue) entries; start_index and end_index are the first and last
             sequence indices that were numbered (None when nothing was numbered).

    Raises AssertionError when more insertion codes are needed than the scheme can handle.
    """

    state_vector = smooth_insertions( state_vector )

    numbered = [ [] for _ in range(n_regions) ]

    # Bookkeeping: insertion-code index (-1 selects the blank code), the last state seen and the
    # span of the sequence that has been numbered so far.
    ins_idx = -1
    last_state_id = 1
    last_state_type = 'd'
    start_index = None
    end_index = None
    region = None

    for (state_id, state_type), seq_idx in state_vector:

        # A region may never start on an insertion: insertions stay in the region of the state before them.
        if region is None or state_type != "i":
            region = region_index_dict[region_string[state_id-1]]

        if state_type not in ("m", "i"):
            # Deletion state
            last_state_type = state_type
            if state_string[state_id-1] == "I":
                # The position does not exist in the scheme: only the offset changes.
                rels[region] -= 1
                continue
            ins_idx = -1
            last_state_id = state_id
        else:
            if state_type == "m":
                if state_string[state_id-1] != "I":
                    # Independent position in the scheme: insertion codes start afresh.
                    ins_idx = -1
                else:
                    # Scheme treats this imgt state as an insertion, unless a deletion came right
                    # before it, in which case it takes over the real position.
                    if last_state_type != 'd':
                        ins_idx += 1
                    rels[region] -= 1
                number = state_id + rels[region]
                last_state_id = state_id
            else:
                # HMM insertion: next insertion code on the previous state's number
                ins_idx += 1
                number = last_state_id + rels[region]

            numbered[region].append( ( (number, alphabet[ins_idx]), sequence[seq_idx] ) )
            if start_index is None:
                start_index = seq_idx
            end_index = seq_idx
            last_state_type = state_type

        # In regions that get re-annotated anyway the insertion code is meaningless, so let it wrap around.
        if ins_idx >= 25 and region in exclude_deletions:
            ins_idx = 0

        assert ins_idx < 25, "Too many insertions for numbering scheme to handle" # We ran out of letters.

    return numbered, start_index, end_index
297
+
298
+
299
+ # Functions to perform the numbering and the corrections for each of the implemented schemes.
300
+ # These have been written fairly verbosely so that the template of how to generate a function for a new scheme is more clear.
301
+ # They have two stages: Perform the mapping between imgt and the scheme; Renumber those regions that do not map nicely onto imgt (e.g. CDR insertions)
302
+
303
+
304
+
305
+ ########
306
+ # IMGT #
307
+ ########
308
+ # - Renumbering of the CDR 1 and 2 regions in IMGT has now been implemented to ensure consistency with the gapping rules of the
309
+ # scheme. Previously gaps were defined using the HMM alignment as the underlying model was already based on the IMGT scheme. This
310
+ # worked well in original test cases but appears to give inaccurate annotations in a significant number of cases in NGS size
311
+ # sequence sets. We therefore now explicitly renumber the CDR 1 and 2 as with all the other schemes.
312
+
313
def number_imgt(state_vector, sequence):
    """
    Apply the IMGT numbering scheme for heavy or light chains

    The 128 HMM states map one-to-one onto IMGT positions, so the state string is all 'X'. The region string
    splits the states into seven regions (N.B. these do not match up with any particular definition of CDR):
        1. All positions before CDR1
        2. CDR1 positions
        3. Positions between CDR1/2
        4. CDR2 positions
        5. Positions between CDR2/3
        6. CDR positions 105 (inc) to 118 (exc)
        7. Positions after CDR3

    The three CDR regions are renumbered with get_imgt_cdr rather than taken from the HMM alignment.

    @param state_vector: The list of states from the aligned hmm
    @param sequence: The sequence string

    @return: (numbering, start_index, end_index). numbering is an empty list when CDR3 holds more than 117
             residues; otherwise it is the result of gap_missing applied to the per-region numbering.
    """

    # 'X' means the imgt position exists in the scheme. 'I' means that it should be treated as an insertion of the previous number
    state_string =  'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

    # Regions that should be treated separately in putting the numbering together
    region_string = '11111111111111111111111111222222222222333333333333333334444444444555555555555555555555555555555555555555666666666666677777777777'

    # Region characters "1".."7" map onto region indices 0..6
    region_index_dict = { str(index + 1): index for index in range(7) }

    # IMGT numbers equal the state numbers, so every region starts with a zero offset.
    rels = dict.fromkeys(range(8), 0)

    n_regions = 7

    exclude_deletions = [1,3,5]

    _regions, startindex, endindex = _number_regions(sequence, state_vector, state_string , region_string, region_index_dict, rels, n_regions, exclude_deletions)

    def _residues(region):
        # The residues of a region as one string, without gap characters
        return "".join([ entry[1] for entry in region if entry[1] != "-" ])

    def _lay_out(residues, maxlength, start, end):
        # Pair the residues with the annotations from get_imgt_cdr; unused positions become numbered gaps
        laid_out = []
        taken = 0
        last_number = start - 1
        for ann in get_imgt_cdr(len(residues), maxlength, start, end):
            if ann is None:
                last_number += 1
                laid_out.append( ((last_number, ' '), '-') )
            else:
                laid_out.append( (ann, residues[taken]) )
                last_number = ann[0]
                taken += 1
        return laid_out

    ###############
    # Renumbering #
    ###############

    # The alignment from HMMER should be correct for CDRs 1 and 2. Testing has shown this is not always the
    # case, so they are renumbered explicitly as with the other schemes.

    # CDR1 has a range from 27 (inc.) to 39 (exc.) and has a theoretical maximum length of 12.
    cdr1 = _lay_out(_residues(_regions[1]), 12, 27, 39)

    # CDR2 has a range from 56 (inc.) to 66 (exc.) and has a theoretical length of 10.
    cdr2 = _lay_out(_residues(_regions[3]), 10, 56, 66)

    # FW3. The HMM is allowed to place insertions; the scheme does not specify where they should go.

    # CDR3 has a range from 105 (inc.) to 118 (exc.). Insertions are placed on 112 and 111 symmetrically.
    cdr3seq = _residues(_regions[5])
    if len(cdr3seq) > 117:
        return [], startindex, endindex # Too many insertions. Do not apply numbering.
    cdr3 = _lay_out(cdr3seq, 13, 105, 118)

    _numbering = [ _regions[0], # Fw1
                   cdr1,        # CDR1
                   _regions[2], # Fw2
                   cdr2,        # CDR2
                   _regions[4], # Fw3
                   cdr3,        # CDR3
                   _regions[6], # Fw4
                 ]

    # Return the full vector and the start and end indices of the numbered region of the sequence
    return gap_missing( _numbering ), startindex, endindex
440
+
441
def get_imgt_cdr(length, maxlength, start, end):
    """
    Symmetrically number a CDR loop (e.g. CDRL1/CDRH2 for IMGT)

    Positions are handed out alternately from the front (counting up from start) and from the back
    (counting down from end - 1). Residues beyond maxlength receive insertion codes on the two numbers
    either side of the centre.

    @param length: Define the length of target CDR
    @param maxlength: Define the theoretical limit (e.g. L1 = 12 for the IMGT scheme)
    @param start, end: Start and end position numbers

    @return: A list of max(length, maxlength) entries, each either a (number, insertion_code) tuple or
             None for a position that stays empty.
    """
    slots = [None] * max(length, maxlength)
    if length == 0:
        return slots
    if length == 1:
        slots[0] = (start, ' ')
        return slots

    # Insertion letters without the trailing blank, forwards and backwards
    letters = alphabet[:-1]
    letters_reversed = letters[::-1]

    # Fill the regular positions from both ends, starting at the front
    head, tail = 0, -1
    for step in range(min(length, maxlength)):
        if step % 2 == 0:
            slots[head] = (start + head, " ")
            head += 1
        else:
            slots[tail] = (end + tail, " ")
            tail -= 1

    # Whatever is still empty sits around the centre point
    unfilled = [ index for index, value in enumerate(slots) if value is None ]
    if not unfilled:
        return slots

    number_left = slots[unfilled[0]-1][0]    # Number right before the first empty slot
    number_right = slots[unfilled[-1]+1][0]  # Number right after the last empty slot

    # The front half takes the extra position when maxlength is odd
    backfactor = maxlength // 2
    frontfactor = backfactor + maxlength % 2

    # Add insertions around the centre point, starting on the right-hand number
    for step in range(max(0, length - maxlength)):
        if step % 2 == 0:
            slots[tail] = (number_right, letters_reversed[tail + backfactor])
            tail -= 1
        else:
            slots[head] = (number_left, letters[head - frontfactor])
            head += 1

    return slots
494
+
495
+
496
+ #######
497
+ # Aho #
498
+ #######
499
+ # Heuristic regapping based on the AHo specification as detailed on AAAAA website. Gap order depends on the chain type
500
+ def number_aho(state_vector, sequence, chain_type):
501
+ """
502
+ Apply the Aho numbering scheme
503
+
504
+ Rules should be implemented using two strings - the state string and the region string.
505
+
506
+ There are 128 states in the HMMs. Treat X as a direct match in IMGT scheme, I is an insertion. (All X's for IMGT)
507
+
508
+ XXXXXXX XXX XXXXXXXXXXXXXX XXXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXX
509
+ AAAAAAA BBB CCCCCCCCCCCCCC DDDDDDDDDDDDDDDD EEEEEEEEEEEEEEE FFFFFFFFFFFFFFFFFFFF HHHHHHHHHHHHHHHH IIIIIIIIIIIII JJJJJJJJJJJJJ KKKKKKKKKKK
510
+
511
+
512
+ Regions - (N.B These do not match up with any particular definition of CDR)
513
+ A. EMPTY (now included in B)
514
+ B. 1-10 inclusive. Indel occurs at 8
515
+ C. 11-24 inclusive.
516
+ D. 25-42 inclusive (deletion surround 28) 32-42 inclusive (deletions surround 36)
517
+ E. 43-57 inclusive
518
+ F. 58-77 inclusive (deletions surround 63). Alpha chains have deletions at 74,75
519
+ G. EMPTY (now included in H)
520
+ H. 78-93 inclusive gaps on 86 then 85, insertions on 85 linearly
521
+ I. 94-106 inclusive
522
+ J. 107-138 inclusive gaps on 123 symetrically.
523
+ K. 139-149 inclusive.
524
+
525
+ """
526
+
527
+ # Set up the numbering
528
+
529
+ # State string - 'X' means the imgt position exists in the scheme. 'I' means that it should be treated as an insertion of the previous number
530
+ state_string = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
531
+
532
+ # Region string - regions that should be treated separately in putting the numbering together
533
+ region_string = 'BBBBBBBBBBCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFFFFFHHHHHHHHHHHHHHHHIIIIIIIIIIIIIJJJJJJJJJJJJJKKKKKKKKKKK'
534
+ # 1 2 3 4 5 7 8 9 10
535
+
536
+
537
+ region_index_dict = dict( list(zip( "ABCDEFGHIJK", list(range(11)) )) )
538
+
539
+ # Define how the scheme's numbering differs from IMGT at the start of each region.
540
+ # This is updated in the loop below
541
+ rels = {0:0,
542
+ 1:0,
543
+ 2:0,
544
+ 3:0,
545
+ 4:2,
546
+ 5:2,
547
+ 6:2,
548
+ 7:2,
549
+ 8:2,
550
+ 9:2,
551
+ 10:21}
552
+
553
+ n_regions = 11
554
+
555
+ exclude_deletions = [1,3,4,5,7,9]
556
+
557
+ _regions, startindex, endindex = _number_regions(sequence, state_vector, state_string , region_string, region_index_dict, rels, n_regions, exclude_deletions)
558
+
559
+ ###############
560
+ # Renumbering #
561
+ ###############
562
+
563
+ _numbering = [ _regions[0], _regions[1], _regions[2],[], _regions[4], [], _regions[6], [], _regions[8],_regions[9],_regions[10] ]
564
+
565
+ ##################################
566
+ # Move the indel in fw 1 onto 8 #
567
+ ##################################
568
+
569
+ # Place indels on 8
570
+ # Find the first recognised residue and change the expected length of the stretch given the starting point.
571
+ # This prevents n terminal deletions being placed at 8 incorrectly.
572
+ length = len( _regions[1] )
573
+ if length > 0:
574
+ start = _regions[1][0][0][0]
575
+ stretch_len = 10 - (start -1)
576
+ if length > stretch_len: # Insertions are present. Place on 8
577
+ annotations = [ (_," ") for _ in range(start,9) ] + [ (8,alphabet[_]) for _ in range( length - stretch_len ) ] + [(9," "),(10," ")]
578
+ else:
579
+ ordered_deletions = [(8," ")] + [(_," ") for _ in range(start, 11) if _ != 8]
580
+ annotations = sorted( ordered_deletions[max(stretch_len-length, 0):] )
581
+ _numbering[1] = [ (annotations[i], _regions[1][i][1]) for i in range(length) ]
582
+
583
+ #########
584
+ # CDR 1 # - divided in two parts in the Aho scheme.
585
+ ######### - gaps at 28 depending on the chain type.
586
+
587
+ # "VH domains, as well as the majority of the VA domains, have a one-residue gap in position 28, VK and VB domains a two-residue
588
+ # gap in position 27 and 28."
589
+
590
+ # We use the link below as the reference for the scheme.
591
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Numbering/Alignment.html
592
+
593
+ # Some of the header lines in these images are offset by one (VH)! The gaps really are centered at 28 and 36
594
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VK.html
595
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VL.html
596
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VH.html
597
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VA.html
598
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VB.html
599
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VG.html
600
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VD.html
601
+
602
+ # We gap the CDR1 in a heuristic way using the gaps.
603
+ # This means that CDR1 gapping will not always be correct. For example if one grafts a Kappa CDR1 loop onto a Lambda framework
604
+ # the gapping patter might now be incorrect.
605
+ # Not a fan of being so prescriptive.
606
+
607
+ # The CDR1 region included here ranges from AHo 25 to AHo 42 inclusive
608
+
609
+ # The order in which the two loops are gapped is dependent on the chain type (see alignments in URLs above).
610
+ # Not all lengths are defined as not all lengths were crystallised in 2001 (or today). Where no example of the length was
611
+ # available the rule followed is to continue gapping the C terminal 'loop', then the N terminal 'loop', then 31 then the fw.
612
+ # In all cases I have commented where the gapping is undefined. Note that for alpha chains the gapping rules are inconsistent.
613
+
614
+ _L = 28,36,35,37,34,38,27,29,33,39,32,40,26,30,25,31,41,42
615
+ # |-> undefined by AHo. Gapping C terminal loop then N terminal then 31, then fw.
616
+ _K = 28,27,36,35,37,34,38,33,39,32,40,29,26,30,25,31,41,42
617
+ # |-> undefined by AHo. Gapping C terminal loop then N terminal then fw.
618
+ _H = 28,36,35,37,34,38,27,33,39,32,40,29,26,30,25,31,41,42
619
+ # |-> undefined by AHo. Gapping C terminal loop then N terminal then fw.
620
+ # N.B. The header on the alignment image for PDB_VH is offset by 1!
621
+ _A = 28,36,35,37,34,38,33,39,27,32,40,29,26,30,25,31,41,42
622
+ # |-> undefined by AHo. Gapping C terminal loop then N terminal then fw.
623
+ # N.B The gapping is inconsistent for alpha chains. I follow the paper's statement that most VA have
624
+ # one gap at 28 and remove 28 and 27 before removing 40.
625
+ _B = 28,36,35,37,34,38,33,39,27,32,40,29,26,30,25,31,41,42
626
+ # |-> undefined by AHo. Gapping C terminal loop then N terminal then 31, then fw.
627
+ _D = 28,36,35,37,34,38,27,33,39,32,40,29,26,30,25,31,41,42
628
+ # |-> undefined by AHo. Gapping C terminal loop then N terminal then 31, then fw.
629
+ # N.B only two sequence patterns available.
630
+ _G = 28,36,35,37,34,38,27,33,39,32,40,29,26,30,25,31,41,42
631
+ # |-> undefined by AHo. Gapping C terminal loop then N terminal then 31, then fw.
632
+ # N.B only one sequence patterns available. Delta copied.
633
+
634
+ ordered_deletions = { 'L':_L,'K':_K, 'H':_H, 'A':_A, 'B':_B, 'D':_D, 'G':_G }
635
+
636
+ length = len( _regions[3] )
637
+
638
+ annotations = [ (i, ' ') for i in sorted( ordered_deletions[chain_type][ max(18-length, 0): ] ) ]
639
+
640
+ # Insertions are not described in the AHo scheme but must be included as there is a significant number of CDRH1s that are
641
+ # longer than the number of positions.
642
+ insertions = max( length-18 , 0 )
643
+ if insertions > 26:
644
+ return [], startindex, endindex # Too many insertions. Do not apply numbering.
645
+ elif insertions > 0:
646
+ # They are placed on residue 36 alphabetically.
647
+ insertat = annotations.index( (36, ' ') )+1 # Always 12
648
+ assert insertat == 12, 'AHo numbering failed'
649
+ annotations = annotations[:insertat] + [ (36, alphabet[a]) for a in range( insertions ) ] + annotations[insertat:]
650
+
651
+ _numbering[3] = [ (annotations[i], _regions[3][i][1]) for i in range(length) ]
652
+
653
+ #########
654
+ # CDR 2 #
655
+ #########
656
+ # Gaps are placed symetically at 63.
657
+ # For VA a second gap is placed at 74 and 75 according to the text in the paper. However, all the reference sequences show a
658
+ # gap at 73 and 74 see:
659
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VA.html
660
+ # and
661
+ # https://www.bioc.uzh.ch/plueckthun/antibody/Numbering/Alignment.html
662
+ # Either I am mis-interpreting the text in the paper or there is something a little inconsistent here...
663
+ # Given that *all* the numbered examples show the VA gap at 73 and 74 on the AAAAA website I have decided to implement this.
664
+ #
665
+
666
+ # This region describes 58 to 77 inclusive
667
+
668
+ if chain_type == 'A':
669
+ ordered_deletions = [74,73,63,62,64,61,65,60,66,59,67,58,68,69,70,71,72,75,76,77]
670
+ else:
671
+ ordered_deletions = [63,62,64,61,65,60,66,59,67,58,68,69,70,71,72,73,74,75,76,77]
672
+
673
+ length = len(_regions[5])
674
+
675
+ annotations = [ (i, ' ') for i in sorted( ordered_deletions[ max(20-length, 0): ] ) ]
676
+
677
+ # Insertions are not described in the AHo scheme but must be included.
678
+ insertions = max( length-20 , 0 )
679
+ if insertions > 26:
680
+ return [], startindex, endindex # Too many insertions. Do not apply numbering.
681
+ elif insertions > 0:
682
+ # They are placed on residue 63 alphabetically.
683
+ insertat = annotations.index( (63, ' ') )+1 # Always 6
684
+ assert insertat == 6, 'AHo numbering failed'
685
+ annotations = annotations[:insertat] + [ (63, alphabet[a]) for a in range( insertions ) ] + annotations[insertat:]
686
+
687
+ _numbering[5] = [ (annotations[i], _regions[5][i][1]) for i in range(length) ]
688
+
689
+ #########
690
+ # FW3 ############################################
691
+ # Move deletions onto 86 then 85. Insertions on 85 #
692
+ ####################################################
693
+ ordered_deletions = [86,85,87,84,88,83,89,82,90,81,91,80,92,79,93,78]
694
+ length=len( _regions[7] )
695
+
696
+ annotations = [ (i, ' ') for i in sorted( ordered_deletions[ max(16-length, 0): ] ) ]
697
+
698
+ # Insertions are not described in the AHo scheme but must be included.
699
+ insertions = max( length-16 , 0 )
700
+ if insertions > 26:
701
+ return [], startindex, endindex # Too many insertions. Do not apply numbering.
702
+ elif insertions > 0:
703
+ # They are placed on residue 85 alphabetically.
704
+ insertat = annotations.index( (85, ' ') )+1 # Always 8
705
+ assert insertat == 8, 'AHo numbering failed'
706
+ annotations = annotations[:insertat] + [ (85, alphabet[a]) for a in range( insertions ) ] + annotations[insertat:]
707
+
708
+ _numbering[7] = [ (annotations[i], _regions[7][i][1]) for i in range(length) ]
709
+
710
+
711
+ #########
712
+ # CDR 3 #
713
+ #########
714
+ # Deletions on 123.
715
+ # Point of the Aho scheme is that they have accounted for all possible positions.
716
+ # Assumption is that no more insertions will occur....
717
+ # We'll put insertions on 123 linearly.(i.e.ABCDEF...) if they ever do.
718
+
719
+ ordered_deletions = [123,124,122,125,121,126,120,127,119,128,118,129,117,130,116,131,115,132,114,133,113,134,112,135,111,
720
+ 136,110,137,109,138,108,107]
721
+
722
+ length=len( _regions[9] )
723
+
724
+ annotations = [ (i, ' ') for i in sorted( ordered_deletions[ max(32-length, 0): ] ) ]
725
+
726
+ # Insertions are not described in the AHo scheme but must be included.
727
+ insertions = max( length-32 , 0 )
728
+ if insertions > 26:
729
+ return [], startindex, endindex # Too many insertions. Do not apply numbering.
730
+ elif insertions > 0:
731
+ # They are placed on residue 123 alphabetically.
732
+ insertat = annotations.index( (123, ' ') )+1 # Always 17
733
+ assert insertat == 17, 'AHo numbering failed'
734
+ annotations = annotations[:insertat] + [ (123, alphabet[a]) for a in range( insertions ) ] + annotations[insertat:]
735
+
736
+ _numbering[9] = [ (annotations[i], _regions[9][i][1]) for i in range(length) ]
737
+
738
+ # AHo includes one extra position than IMGT in what it considers the variable domain for light chains.
739
+ #If the last state is 148 and there is at least one more residue left, then add the residue to the numbering.
740
+ numbering = gap_missing( _numbering )
741
+ if len(numbering) > 0:
742
+ if numbering[-1][0] == (148, ' ') and numbering[-1][1] != '-' and endindex+1 < len(sequence):
743
+ numbering.append( ( (149, ' '), sequence[endindex+1]) )
744
+ endindex +=1
745
+
746
+ return numbering, startindex, endindex
747
+
748
+
749
+ ###########
750
+ # Chothia #
751
+ ###########
752
+
753
+ # Heavy chains
754
def number_chothia_heavy(state_vector, sequence):
    """
    Apply the Chothia numbering scheme to a heavy chain.

    The IMGT-based HMM states are first translated into eight regions by
    ``_number_regions`` using a state string ('X' = position exists in the
    scheme, 'I' = insertion on the previous number) and a region string.
    The regions (0-based index) are then treated as follows:

        0 - FW1 start: HMM-detected insertions are moved onto position 6
        1 - taken from the alignment unchanged
        2 - CDRH1: insertions are placed on 31
        3 - taken from the alignment unchanged
        4 - CDRH2: insertions are placed on 52
        5 - FW3: taken from the alignment unchanged (insertions on 82 and any
            further ones are left where the alignment put them)
        6 - CDRH3: labels come from ``get_cdr3_annotations``
        7 - taken from the alignment unchanged

    :param state_vector: HMM state vector, forwarded unchanged to ``_number_regions``
    :param sequence: the sequence being numbered, forwarded to ``_number_regions``
    :return: tuple ``(numbering, startindex, endindex)``; ``numbering`` is the
             empty list when the CDRH3 region holds more than 36 residues
    """

    def relabel(labels, residues):
        # Give the k-th residue of the region the k-th label, keeping its amino acid.
        return [(labels[k], entry[1]) for k, entry in enumerate(residues)]

    # 'X': the IMGT position exists in Chothia. 'I': insertion on the previous number.
    state_string = 'XXXXXXXXXIXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXXXXXXXXXXIXIIXXXXXXXXXXXIXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXX'

    # The digit at each state names the region that state is assigned to.
    region_string = '11111111112222222222222333333333333333444444444444444455555555555666666666666666666666666666666666666666777777777777788888888888'

    region_index_dict = {str(idx + 1): idx for idx in range(8)}

    # Offset of the scheme's numbering relative to IMGT at the start of each region.
    rels = dict(enumerate([0, -1, -1, -5, -5, -8, -12, -15]))

    region_count = 8
    no_deletion_regions = [0, 2, 4, 6]  # deletions are not placed in these regions

    regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, region_count, no_deletion_regions)

    # Odd-indexed regions keep the alignment numbering; the others are filled in below.
    numbering = [[], regions[1], [], regions[3], [], regions[5], [], regions[7]]
    fw1, cdr1, cdr2, cdr3 = regions[0], regions[2], regions[4], regions[6]

    # --- Region index 0: every insertion found by the HMM goes onto position 6 ---
    n_inserted = sum(1 for entry in fw1 if entry[0][1] != " ")
    if n_inserted:
        # First number found by the HMM; the region does not have to start at 1.
        first = fw1[0][0][0]
        labels = [(pos, " ") for pos in range(first, 7)]
        labels.extend((6, alphabet[k]) for k in range(n_inserted))
        labels.extend([(7, " "), (8, " "), (9, " ")])
        numbering[0] = relabel(labels, fw1)
    else:
        numbering[0] = fw1

    # --- Region index 2 (CDR1): insertions on 31 ---
    # Eleven positions (23..31, 32, 33) are available; the region is pulled back
    # to the cysteine because heavily engineered CDR1s did not number well otherwise.
    cdr1_len = len(cdr1)
    cdr1_head = [(pos, " ") for pos in range(23, 32)]
    cdr1_tail = [(32, " "), (33, " ")]
    if cdr1_len > 11:
        labels = cdr1_head + [(31, alphabet[k]) for k in range(cdr1_len - 11)] + cdr1_tail
    else:
        labels = cdr1_head[:cdr1_len - 2] + cdr1_tail[:cdr1_len]
    numbering[2] = relabel(labels, cdr1)

    # --- Region index 4 (CDR2): positions 50..57, insertions on 52 ---
    # Deletion order is 52, 51, 50, then 53, 54, 55, 56, 57.
    cdr2_len = len(cdr2)
    labels = [(50, " "), (51, " "), (52, " ")][:max(0, cdr2_len - 5)]
    labels += [(52, alphabet[k]) for k in range(max(cdr2_len - 8, 0))]
    labels += [(53, " "), (54, " "), (55, " "), (56, " "), (57, " ")][max(0, 5 - cdr2_len):]
    numbering[4] = relabel(labels, cdr2)

    # --- Region index 6 (CDR3): labels supplied by get_cdr3_annotations ---
    cdr3_len = len(cdr3)
    if cdr3_len > 36:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    labels = get_cdr3_annotations(cdr3_len, scheme="chothia", chain_type="heavy")
    numbering[6] = relabel(labels, cdr3)

    # Full numbering plus the start and end indices of the numbered part of the sequence.
    return gap_missing(numbering), startindex, endindex
869
+
870
+ # Light chains
871
def number_chothia_light(state_vector, sequence):
    """
    Apply the Chothia numbering scheme to a light chain.

    The IMGT-based HMM states are first translated into seven regions by
    ``_number_regions`` using a state string ('X' = position exists in the
    scheme, 'I' = insertion on the previous number) and a region string.
    The regions (0-based index) are then treated as follows:

        0 - taken from the alignment unchanged
        1 - CDRL1: positions 24..34, insertions are placed on 30
        2 - taken from the alignment unchanged
        3 - CDRL2: insertions are placed on 52; shorter loops keep the alignment
        4 - FW3: insertions on 68, a single deletion removes 68, anything
            shorter keeps the alignment
        5 - CDRL3: labels come from ``get_cdr3_annotations``
        6 - taken from the alignment unchanged

    :param state_vector: HMM state vector, forwarded unchanged to ``_number_regions``
    :param sequence: the sequence being numbered, forwarded to ``_number_regions``
    :return: tuple ``(numbering, startindex, endindex)``; ``numbering`` is the
             empty list when the CDRL3 region holds more than 35 residues
    """

    def relabel(labels, residues):
        # Give the k-th residue of the region the k-th label, keeping its amino acid.
        return [(labels[k], entry[1]) for k, entry in enumerate(residues)]

    # 'X': the IMGT position exists in Chothia. 'I': insertion on the previous number.
    state_string = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIIIIXXXXXXXXXXXXXXXXXXXXXXIIIIIIIXXXXXXXXIXXXXXXXIIXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXX'

    # The digit at each state names the region that state is assigned to.
    region_string = '11111111111111111111111222222222222222223333333333333333444444444445555555555555555555555555555555555555666666666666677777777777'

    region_index_dict = {str(idx + 1): idx for idx in range(7)}

    # Offset of the scheme's numbering relative to IMGT at the start of each region.
    rels = dict(enumerate([0, 0, -6, -6, -13, -16, -20]))

    region_count = 7
    no_deletion_regions = [1, 3, 4, 5]

    regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, region_count, no_deletion_regions)

    # Regions 0, 2, 4 and 6 start out with the alignment numbering.
    numbering = [regions[0], [], regions[2], [], regions[4], [], regions[6]]
    cdr1, cdr2, fw3, cdr3 = regions[1], regions[3], regions[4], regions[5]

    # --- Region index 1 (CDR1): insertions on 30, deletions go forward from 31 ---
    # Eleven positions can be accounted for; anything beyond that is an insertion.
    cdr1_len = len(cdr1)
    labels = [(pos, " ") for pos in range(24, 31)][:cdr1_len]
    labels += [(30, alphabet[k]) for k in range(max(cdr1_len - 11, 0))]
    labels += [(31, " "), (32, " "), (33, " "), (34, " ")][max(0, 11 - cdr1_len):]
    numbering[1] = relabel(labels, cdr1)

    # --- Region index 3 (CDR2): insertions on 52 ---
    cdr2_len = len(cdr2)
    if cdr2_len > 4:
        labels = [(51, " "), (52, " ")]
        labels += [(52, alphabet[k]) for k in range(cdr2_len - 4)]
        labels += [(53, " "), (54, " ")]
        numbering[3] = relabel(labels, cdr2)
    else:
        # How to gap L2 in Chothia/Kabat/Martin is unclear, so the alignment decides.
        numbering[3] = cdr2

    # --- Region index 4 (FW3): insertions on 68, first deletion is 68 ---
    fw3_len = len(fw3)
    if fw3_len > 34:
        labels = [(pos, " ") for pos in range(55, 69)]
        labels += [(68, alphabet[k]) for k in range(fw3_len - 34)]
        labels += [(pos, " ") for pos in range(69, 89)]
        numbering[4] = relabel(labels, fw3)
    elif fw3_len == 33:
        # Exactly one residue missing: drop position 68.
        labels = [(pos, " ") for pos in range(55, 89) if pos != 68]
        numbering[4] = relabel(labels, fw3)
    else:
        # Full length or more than one deletion: the alignment places them.
        numbering[4] = fw3

    # --- Region index 5 (CDR3): labels supplied by get_cdr3_annotations ---
    cdr3_len = len(cdr3)
    if cdr3_len > 35:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    labels = get_cdr3_annotations(cdr3_len, scheme="chothia", chain_type="light")
    numbering[5] = relabel(labels, cdr3)

    # Full numbering plus the start and end indices of the numbered part of the sequence.
    return gap_missing(numbering), startindex, endindex
979
+
980
+
981
+ #########
982
+ # Kabat #
983
+ #########
984
+
985
+ # Heavy chains
986
def number_kabat_heavy(state_vector, sequence):
    """
    Apply the Kabat numbering scheme to a heavy chain.

    The IMGT-based HMM states are first translated into eight regions by
    ``_number_regions`` using a state string ('X' = position exists in the
    scheme, 'I' = insertion on the previous number) and a region string.
    The regions (0-based index) are then treated as follows:

        0 - FW1 start: HMM-detected insertions are moved onto position 6
        1 - taken from the alignment unchanged
        2 - CDRH1: positions 23..35, insertions are placed on 35 and
            deletions run backwards from 35
        3 - taken from the alignment unchanged
        4 - CDRH2: insertions are placed on 52
        5 - FW3: taken from the alignment unchanged (insertions on 82 and any
            further ones are left where the alignment put them)
        6 - CDRH3: labels come from ``get_cdr3_annotations``
        7 - taken from the alignment unchanged

    :param state_vector: HMM state vector, forwarded unchanged to ``_number_regions``
    :param sequence: the sequence being numbered, forwarded to ``_number_regions``
    :return: tuple ``(numbering, startindex, endindex)``; ``numbering`` is the
             empty list when the CDRH3 region holds more than 36 residues
    """

    def relabel(labels, residues):
        # Give the k-th residue of the region the k-th label, keeping its amino acid.
        return [(labels[k], entry[1]) for k, entry in enumerate(residues)]

    # 'X': the IMGT position exists in Kabat. 'I': insertion on the previous number.
    state_string = 'XXXXXXXXXIXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXXXXXXXXXXIXIIXXXXXXXXXXXIXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXX'

    # The digit at each state names the region that state is assigned to.
    region_string = '11111111112222222222222333333333333333334444444444444455555555555666666666666666666666666666666666666666777777777777788888888888'

    region_index_dict = {str(idx + 1): idx for idx in range(8)}

    # Offset of the scheme's numbering relative to IMGT at the start of each region.
    rels = dict(enumerate([0, -1, -1, -5, -5, -8, -12, -15]))

    region_count = 8
    no_deletion_regions = [2, 4, 6]

    regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, region_count, no_deletion_regions)

    # Odd-indexed regions keep the alignment numbering; the others are filled in below.
    numbering = [[], regions[1], [], regions[3], [], regions[5], [], regions[7]]
    fw1, cdr1, cdr2, cdr3 = regions[0], regions[2], regions[4], regions[6]

    # --- Region index 0: every insertion found by the HMM goes onto position 6 ---
    n_inserted = sum(1 for entry in fw1 if entry[0][1] != " ")
    if n_inserted:
        # First number found by the HMM; the region does not have to start at 1.
        first = fw1[0][0][0]
        labels = [(pos, " ") for pos in range(first, 7)]
        labels.extend((6, alphabet[k]) for k in range(n_inserted))
        labels.extend([(7, " "), (8, " "), (9, " ")])
        numbering[0] = relabel(labels, fw1)
    else:
        numbering[0] = fw1

    # --- Region index 2 (CDR1): insertions on 35, deletions backwards from 35 ---
    cdr1_len = len(cdr1)
    labels = [(pos, ' ') for pos in range(23, 36)][:cdr1_len]
    labels += [(35, alphabet[k]) for k in range(max(0, cdr1_len - 13))]
    numbering[2] = relabel(labels, cdr1)

    # --- Region index 4 (CDR2): positions 50..57, insertions on 52 ---
    # Deletion order is 52, 51, 50, then 53, 54, 55, 56, 57.
    cdr2_len = len(cdr2)
    labels = [(50, " "), (51, " "), (52, " ")][:max(0, cdr2_len - 5)]
    labels += [(52, alphabet[k]) for k in range(max(cdr2_len - 8, 0))]
    labels += [(53, " "), (54, " "), (55, " "), (56, " "), (57, " ")][max(0, 5 - cdr2_len):]
    numbering[4] = relabel(labels, cdr2)

    # --- Region index 6 (CDR3): labels supplied by get_cdr3_annotations ---
    cdr3_len = len(cdr3)
    if cdr3_len > 36:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    labels = get_cdr3_annotations(cdr3_len, scheme="kabat", chain_type="heavy")
    numbering[6] = relabel(labels, cdr3)

    # Full numbering plus the start and end indices of the numbered part of the sequence.
    return gap_missing(numbering), startindex, endindex
1099
+
1100
+ # Light chains
1101
def number_kabat_light(state_vector, sequence):
    """
    Apply the Kabat numbering scheme to a light chain.

    The IMGT-based HMM states are first translated into seven regions by
    ``_number_regions`` using a state string ('X' = position exists in the
    scheme, 'I' = insertion on the previous number) and a region string.
    The regions (0-based index) are then treated as follows:

        0 - taken from the alignment unchanged
        1 - CDRL1: positions 24..34, insertions are placed on 27
        2 - taken from the alignment unchanged
        3 - CDRL2: insertions are placed on 52; shorter loops keep the alignment
        4 - FW3: taken from the alignment unchanged (Kabat was defined from a
            sequence alignment alone, so insertions stay where the HMM put them)
        5 - CDRL3: labels come from ``get_cdr3_annotations``
        6 - taken from the alignment unchanged

    :param state_vector: HMM state vector, forwarded unchanged to ``_number_regions``
    :param sequence: the sequence being numbered, forwarded to ``_number_regions``
    :return: tuple ``(numbering, startindex, endindex)``; ``numbering`` is the
             empty list when the CDRL3 region holds more than 35 residues
    """

    def relabel(labels, residues):
        # Give the k-th residue of the region the k-th label, keeping its amino acid.
        return [(labels[k], entry[1]) for k, entry in enumerate(residues)]

    # 'X': the IMGT position exists in Kabat. 'I': insertion on the previous number.
    state_string = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIIIIXXXXXXXXXXXXXXXXXXXXXXIIIIIIIXXXXXXXXIXXXXXXXIIXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXX'

    # The digit at each state names the region that state is assigned to.
    region_string = '11111111111111111111111222222222222222223333333333333333444444444445555555555555555555555555555555555555666666666666677777777777'

    region_index_dict = {str(idx + 1): idx for idx in range(7)}

    # Offset of the scheme's numbering relative to IMGT at the start of each region.
    rels = dict(enumerate([0, 0, -6, -6, -13, -16, -20]))

    region_count = 7
    no_deletion_regions = [1, 3, 5]

    regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, region_count, no_deletion_regions)

    # Regions 0, 2, 4 and 6 keep the alignment numbering.
    numbering = [regions[0], [], regions[2], [], regions[4], [], regions[6]]
    cdr1, cdr2, cdr3 = regions[1], regions[3], regions[5]

    # --- Region index 1 (CDR1): insertions on 27, deletions go forward from 28 ---
    # Eleven positions can be accounted for; anything beyond that is an insertion.
    cdr1_len = len(cdr1)
    labels = [(pos, " ") for pos in range(24, 28)][:cdr1_len]
    labels += [(27, alphabet[k]) for k in range(max(cdr1_len - 11, 0))]
    labels += [(pos, " ") for pos in range(28, 35)][max(0, 11 - cdr1_len):]
    numbering[1] = relabel(labels, cdr1)

    # --- Region index 3 (CDR2): insertions on 52 ---
    cdr2_len = len(cdr2)
    if cdr2_len > 4:
        labels = [(51, " "), (52, " ")]
        labels += [(52, alphabet[k]) for k in range(cdr2_len - 4)]
        labels += [(53, " "), (54, " ")]
        numbering[3] = relabel(labels, cdr2)
    else:
        # How to gap L2 in Chothia/Kabat/Martin is unclear, so the alignment decides.
        numbering[3] = cdr2

    # --- Region index 5 (CDR3): labels supplied by get_cdr3_annotations ---
    cdr3_len = len(cdr3)
    if cdr3_len > 35:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    labels = get_cdr3_annotations(cdr3_len, scheme="kabat", chain_type="light")
    numbering[5] = relabel(labels, cdr3)

    return gap_missing(numbering), startindex, endindex
1196
+
1197
+
1198
+
1199
+
1200
+ #############################
1201
+ # Martin (extended Chothia) #
1202
+ #############################
1203
+
1204
+ # Heavy chains
1205
def number_martin_heavy(state_vector, sequence):
    """
    Apply the Martin (extended Chothia) numbering scheme to a heavy chain.

    The IMGT-based HMM states are first translated into eight regions by
    ``_number_regions`` using a state string ('X' = position exists in the
    scheme, 'I' = insertion on the previous number) and a region string.
    The regions (0-based index) are then treated as follows:

        0 - FW1 start: HMM-detected insertions are moved onto position 8
        1 - taken from the alignment unchanged
        2 - CDRH1: insertions are placed on 31
        3 - taken from the alignment unchanged
        4 - CDRH2: insertions are placed on 52
        5 - FW3: insertions are placed on 72; without insertions the
            alignment numbering (including any gaps) is kept
        6 - CDRH3: labels come from ``get_cdr3_annotations`` (Chothia rules)
        7 - taken from the alignment unchanged

    :param state_vector: HMM state vector, forwarded unchanged to ``_number_regions``
    :param sequence: the sequence being numbered, forwarded to ``_number_regions``
    :return: tuple ``(numbering, startindex, endindex)``; ``numbering`` is the
             empty list when the CDRH3 region holds more than 36 residues
    """

    def relabel(labels, residues):
        # Give the k-th residue of the region the k-th label, keeping its amino acid.
        return [(labels[k], entry[1]) for k, entry in enumerate(residues)]

    # 'X': the IMGT position exists in Martin. 'I': insertion on the previous number.
    state_string = 'XXXXXXXXXIXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXXXXXXXXXXIXIIXXXXXXXXXXXIXXXXXXXXIIIXXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXX'

    # The digit at each state names the region that state is assigned to.
    region_string = '11111111112222222222222333333333333333444444444444444455555555555666666666666666666666666666666666666666777777777777788888888888'

    region_index_dict = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4, "6": 5, "7": 6, "8": 7}

    # Offset of the scheme's numbering relative to IMGT at the start of each region.
    rels = {0: 0,
            1: -1,
            2: -1,
            3: -5,
            4: -5,
            5: -8,
            6: -12,
            7: -15}

    n_regions = 8

    exclude_deletions = [2, 4, 5, 6]

    _regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, n_regions, exclude_deletions)

    ###############
    # Renumbering #
    ###############

    # Odd-indexed regions keep the alignment numbering; 0, 2, 4 and 6 are always
    # rebuilt below, and 5 (FW3) is rebuilt only when it contains insertions.
    _numbering = [[], _regions[1], [], _regions[3], [], _regions[5], [], _regions[7]]

    # Region index 0
    # Every insertion recognised by the HMM in this region is placed on position 8.
    insertions = sum(1 for entry in _regions[0] if entry[0][1] != " ")
    if insertions:
        # First number found by the HMM; the region does not have to start at 1.
        start = _regions[0][0][0][0]
        annotations = [(pos, " ") for pos in range(start, 9)]
        annotations += [(8, alphabet[k]) for k in range(insertions)]
        annotations += [(9, " ")]
        _numbering[0] = relabel(annotations, _regions[0])
    else:
        _numbering[0] = _regions[0]

    # CDR1 - region index 2: insertions on 31.
    # Pulled back to the cysteine as heavily engineered CDR1s did not number well otherwise.
    length = len(_regions[2])
    insertions = max(length - 11, 0)
    if insertions:
        annotations = ([(pos, " ") for pos in range(23, 32)]
                       + [(31, alphabet[k]) for k in range(insertions)]
                       + [(32, " "), (33, " ")])
    else:
        annotations = [(pos, " ") for pos in range(23, 32)][:length - 2] + [(32, " "), (33, " ")][:length]
    _numbering[2] = relabel(annotations, _regions[2])

    # CDR2 - region index 4: positions 50..57, insertions on 52.
    # Deletion order is 52, 51, 50, then 53, 54, 55, 56, 57.
    length = len(_regions[4])
    insertions = max(length - 8, 0)
    annotations = [(50, " "), (51, " "), (52, " ")][:max(0, length - 5)]
    annotations += [(52, alphabet[k]) for k in range(insertions)]
    annotations += [(53, " "), (54, " "), (55, " "), (56, " "), (57, " ")][abs(min(0, length - 5)):]
    _numbering[4] = relabel(annotations, _regions[4])

    # FW3 - region index 5: all insertions are placed on 72 explicitly.
    # (The Chothia implementation instead leaves FW3 insertions to the alignment.)
    length = len(_regions[5])
    insertions = max(length - 35, 0)
    if insertions > 0:
        annotations = ([(pos, ' ') for pos in range(58, 73)]
                       + [(72, alphabet[k]) for k in range(insertions)]
                       + [(pos, ' ') for pos in range(73, 93)])
        _numbering[5] = relabel(annotations, _regions[5])
    else:
        # Deletions: the alignment places them, so FW3 keeps its alignment numbering.
        # This branch used to assign _regions[4] to _numbering[4], which discarded
        # the CDR2 renumbering done above whenever FW3 had no insertions.
        _numbering[5] = _regions[5]

    # CDR3 - region index 6: labels supplied by get_cdr3_annotations (same as Chothia).
    length = len(_regions[6])
    if length > 36:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    annotations = get_cdr3_annotations(length, scheme="chothia", chain_type="heavy")
    _numbering[6] = relabel(annotations, _regions[6])

    # Full numbering plus the start and end indices of the numbered part of the sequence.
    return gap_missing(_numbering), startindex, endindex
1330
+
1331
+ # Light chains
1332
def number_martin_light(state_vector, sequence):
    """
    Apply the Martin numbering scheme to a light chain.

    For light chains the Martin and Chothia specifications are very similar:
    Martin is more explicit about where indels go, but these are additions to
    the Chothia scheme rather than changes to it (unlike the heavy chain).
    The Chothia light-chain implementation already follows the Martin rules,
    so this function simply delegates to ``number_chothia_light``.

    :param state_vector: HMM state vector, passed straight through
    :param sequence: the sequence being numbered, passed straight through
    :return: whatever ``number_chothia_light`` returns for the same arguments,
             i.e. a ``(numbering, startindex, endindex)`` tuple
    """
    martin_result = number_chothia_light(state_vector, sequence)
    return martin_result
1360
+
1361
+
1362
+ ###########
1363
+ # Wolfguy #
1364
+ ###########
1365
+ # The Wolfguy numbering scheme is an in-house scheme used at Roche. It has been described publicly in the paper:
1366
+ # Prediction of VH-VL domain orientation for antibody variable domain modeling. Bujotzek A. et al. Protein 2015 83(4) 681-95
1367
+ #
1368
+ # It is similar in gapping as IMGT and is defined only for heavy and light antibody chains.
1369
+ # Unlike other schemes the numbering denotes both the chain (heavy 101-499, light 501-799) and the region (less than -50 framework
1370
+ # greater than -50 CDR). All CDRs of length less than 50 can be handled without the need for insertion codes. Numbering of the
1371
+ # framework behaves similarly to IMGT in that all positions are assumed to be accounted for. Framework insertions are placed by
1372
+ # the alignment.
1373
+ #
1374
+ # Numbering of all CDRs is performed symmetrically with the exception of CDRL1. In this case the CDR is numbered according to a
1375
+ # pattern specific to the canonical class. This is recognised by length and by sequence similarity to a consensus sequence. If a
1376
+ # length has not been observed it is numbered symmetrically.
1377
+
1378
+
1379
def number_wolfguy_heavy(state_vector, sequence):
    """
    Apply the wolfguy numbering scheme for heavy chains

    The scheme numbers the sequence using different segments so that the numbering tells you
    where in the antibody the sequence is describing.

    XXXXXXXXXIXXXXXXXXXXXXXXXX XXXXXXXXXXXXXX XXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXIX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXX
    11111111111111111111111111 22222222222222 33333333333333 44444444444444444444 555555555555555555555555555555 6666666666666 77777777777

    Regions - (N.B These do not match up with any particular definition of CDR)
     1  -  Simple mapping (treat "I" states as inserts and not own match states)
     2  -  CDRH1 - 151-199 (inc). Positions 175/176 are the last to be used.
     3  -  Simple mapping (treat "I" states as inserts and not own match states)
     4  -  CDRH2 - 251-299 (inc). 291-299 are used first; position 271 is the last to be used.
     5  -  Simple mapping (treat "I" states as inserts and not own match states)
     6  -  CDRH3 - 331, 332 and 351-399 (inc). Used in the fixed priority order given in the code.
     7  -  Simple mapping (treat "I" states as inserts and not own match states)

    state_vector and sequence are passed straight through to _number_regions.

    Returns a tuple (numbering, startindex, endindex): the full numbering vector and the start
    and end indices of the numbered region of the sequence. When any CDR is longer than the
    number of positions the scheme provides for it, the numbering is an empty list.
    """
    # Set up the numbering

    # State string - 'X' means the imgt position exists in the scheme. 'I' means that it should be treated as an insertion of the previous number
    state_string = 'XXXXXXXXXIXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXIXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

    # Region string - regions that should be treated separately in putting the numbering together
    region_string = '11111111111111111111111111222222222222223333333333333344444444444444444444555555555555555555555555555555666666666666677777777777'

    region_index_dict = {"1":0,"2":1,"3":2,"4":3,"5":4,"6":5,"7":6}

    # Define how the scheme's numbering differs from IMGT at the start of each region.
    rels = {0:100,
            1:124,
            2:160,
            3:196,
            4:226,
            5:244,
            6:283}

    n_regions = 7

    exclude_deletions = [1,3,5]

    _regions, startindex, endindex = _number_regions(sequence, state_vector, state_string , region_string, region_index_dict, rels, n_regions, exclude_deletions)

    ###############
    # Renumbering #
    ###############

    # Regions 1, 3 and 5 (the CDRs) are renumbered below; the others are taken as they are.
    _numbering = [ _regions[0], [], _regions[2], [], _regions[4], [], _regions[6] ]

    # For every CDR, `fill_order` lists the scheme positions in the order in which they are
    # used: a loop of length n takes the first n entries, which are then sorted. Positions
    # near the end of the list are therefore the first to be left out for short loops.

    # CDRH1
    # 151 first, then alternately from the left (152 upwards) and the right (199 downwards),
    # finishing on 175 and 176.
    fill_order = [151]
    for left, right in zip(range(152, 176), range(199, 175, -1)):
        fill_order += [left, right]
    length = len(_regions[1])
    if length > len(fill_order):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(fill_order[:length])
    _numbering[1] = [((number, " "), entry[1]) for number, entry in zip(annotations, _regions[1])]

    # CDRH2
    # 299 down to 291 first, then 251, then alternately from the left (252 upwards) and the
    # right (290 downwards), finishing on 271.
    fill_order = [251]
    for left, right in zip(range(252, 271), range(290, 271, -1)):
        fill_order += [left, right]
    fill_order.append(271)
    fill_order = list(range(299, 290, -1)) + fill_order
    length = len(_regions[3])
    if length > len(fill_order):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(fill_order[:length])
    _numbering[3] = [((number, " "), entry[1]) for number, entry in zip(annotations, _regions[3])]

    # CDRH3
    # The order changes twice: after the first 10 positions (331, 332 and eight positions at
    # the ends of 351-399) and again after the next five. From then on positions alternate
    # from the left (356 upwards) and the right (391 downwards), finishing on 373 and 374.
    fill_order = []
    for left, right in zip(range(356, 374), range(391, 373, -1)):
        fill_order += [left, right]
    fill_order = [354, 394, 355, 393, 392] + fill_order
    fill_order = [331, 332] + [399, 398, 351, 352, 397, 353, 396, 395] + fill_order
    length = len(_regions[5])
    if length > len(fill_order):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(fill_order[:length])
    _numbering[5] = [((number, " "), entry[1]) for number, entry in zip(annotations, _regions[5])]

    # Return the full vector and the start and end indices of the numbered region of the sequence
    return sum( _numbering, [] ), startindex, endindex
1469
+
1470
+
1471
def number_wolfguy_light(state_vector, sequence):
    """
    Apply the wolfguy numbering scheme for light chains

    The scheme numbers the sequence using different segments so that the numbering tells you
    where in the antibody the sequence is describing.

    XXXXXXX XXX XXXXXXXXXXXXX XXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXX XXXIXXXXXXX XXXX XXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXX
    1111111 AAA BBBBBBBBBBBBB 22222222222222222 333333333333333 44444444444444 55555555555 6666 77777777777777777777 8888888888888 99999999999

    Regions - (N.B These do not match up with any particular definition of CDR)
     1  -  Simple mapping (treat "I" states as inserts and not own match states)
     A  -  Move indels onto 508
     B  -  Simple mapping (treat "I" states as inserts and not own match states)
     2  -  CDRL1 - 551-599 (inc). Assign via the matching consensus sequence and length.
     3  -  Simple mapping (treat "I" states as inserts and not own match states)
     4  -  CDRL2 - 651-699 (inc). 651 and 695-699 are used first; position 673 is the last to be used.
     5  -  Simple mapping (treat "I" states as inserts and not own match states)
     6  -  Positions 711-714, with insertions placed on 714
     7  -  Simple mapping (treat "I" states as inserts and not own match states)
     8  -  CDRL3 - 751-799 (inc). Filled alternately from both ends; position 775 is the last to be used.
     9  -  Simple mapping (treat "I" states as inserts and not own match states)

    state_vector and sequence are passed straight through to _number_regions.

    Returns a tuple (numbering, startindex, endindex): the full numbering vector and the start
    and end indices of the numbered region of the sequence. When a CDR is longer than the
    number of positions the scheme provides for it, the numbering is an empty list.
    """
    # Set up the numbering

    # State string - 'X' means the imgt position exists in the scheme. 'I' means that it should be treated as an insertion of the previous number
    state_string = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXIXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

    # Region string - regions that should be treated separately in putting the numbering together
    region_string = '1111111AAABBBBBBBBBBBBB222222222222222223333333333333334444444444444455555555555666677777777777777777777888888888888899999999999'

    region_index_dict = {"1":0,"A":1,"B":2,"2":3,"3":4,"4":5,"5":6,"6":7,"7":8,"8":9,"9":10}

    # Define how the scheme's numbering differs from IMGT at the start of each region.
    rels = {0:500,
            1:500,
            2:500,
            3:527,
            4:560,
            5:595,
            6:631,
            7:630,
            8:630,
            9:646,
            10:683}

    n_regions = 11

    exclude_deletions = [1,3,5,7,9]

    _regions, startindex, endindex = _number_regions(sequence, state_vector, state_string , region_string, region_index_dict, rels, n_regions, exclude_deletions)

    ###############
    # Renumbering #
    ###############

    # Regions 1, 3, 5, 7 and 9 are renumbered below; the others are taken as they are.
    _numbering = [ _regions[0], [], _regions[2], [], _regions[4], [], _regions[6], [], _regions[8], [], _regions[10] ]

    # Region A
    # Gaps in the first section go on 508 instead of the imgt 510 equivalent; extra residues
    # become insertions on 508.
    length = len(_regions[1])
    annotations = sorted([(510, ' '), (509, ' '), (508, ' ')][:length] + [(508, a) for a in alphabet[:max(0, length - 3)]])
    _numbering[1] = [(annotations[i], _regions[1][i][1]) for i in range(length)]

    # CDRL1
    # Number by predicting the canonical
    length = len(_regions[3])
    annotations = _get_wolfguy_L1(_regions[3], length)
    if length > len(annotations):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    _numbering[3] = [((number, " "), entry[1]) for number, entry in zip(annotations, _regions[3])]

    # For CDRL2 and CDRL3, `fill_order` lists the scheme positions in the order in which they
    # are used: a loop of length n takes the first n entries, which are then sorted.

    # CDRL2
    # 651 first, then 699 down to 695, then alternately from the right (694 downwards) and
    # the left (652 upwards), finishing on 673.
    fill_order = []
    for left, right in zip(range(652, 673), range(694, 672, -1)):
        fill_order += [right, left]
    fill_order = [651] + list(range(699, 694, -1)) + fill_order + [673]
    length = len(_regions[5])
    if length > len(fill_order):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(fill_order[:length])
    _numbering[5] = [((number, " "), entry[1]) for number, entry in zip(annotations, _regions[5])]

    # Region 6
    # The placement of the indel in wolfguy is different to that in imgt: residues beyond
    # the four positions 711-714 become insertions on 714.
    length = len(_regions[7])
    insertions = max(0, length - 4)
    annotations = [(711, ' '), (712, ' '), (713, ' '), (714, ' ')][:length] + [(714, a) for a in alphabet[:insertions]]
    _numbering[7] = [(annotations[i], _regions[7][i][1]) for i in range(length)]

    # CDRL3
    # Alternately from the left (751 upwards) and the right (799 downwards), finishing on 775.
    fill_order = []
    for left, right in zip(range(751, 775), range(799, 775, -1)):
        fill_order += [left, right]
    fill_order.append(775)
    length = len(_regions[9])
    if length > len(fill_order):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(fill_order[:length])
    _numbering[9] = [((number, " "), entry[1]) for number, entry in zip(annotations, _regions[9])]

    # Return the full vector and the start and end indices of the numbered region of the sequence
    return sum( _numbering, [] ), startindex, endindex
1574
+
1575
+
1576
+ def _get_wolfguy_L1(seq, length):
1577
+ """
1578
+ Wolfguy's L1 annotation is based on recognising the length and the sequence pattern defined
1579
+ by a set of rules. If the length has not been characterised, we number symmetrically about the
1580
+ middle of the loop.
1581
+ """
1582
+
1583
+ # These are the annotations for different lengths of L1 according to the wolfguy definitions.
1584
+ L1_sequences = {
1585
+ 9: [['9', 'XXXXXXXXX', [551, 552, 554, 556, 563, 572, 597, 598, 599]]],
1586
+ 10: [['10', 'XXXXXXXXXX', [551, 552, 553, 556, 561, 562, 571, 597, 598, 599]]],
1587
+ 11: [['11a', 'RASQDISSYLA', [551, 552, 553, 556, 561, 562, 571, 596, 597, 598, 599]],
1588
+ ['11b', 'GGNNIGSKSVH', [551, 552, 554, 556, 561, 562, 571, 572, 597, 598, 599]],
1589
+ ['11b.2','SGDQLPKKYAY', [551, 552, 554, 556, 561, 562, 571, 572, 597, 598, 599]]],
1590
+ 12: [['12a', 'TLSSQHSTYTIE', [551, 552, 553, 554, 555, 556, 561, 563, 572, 597, 598, 599]],
1591
+ ['12b', 'TASSSVSSSYLH', [551, 552, 553, 556, 561, 562, 571, 595, 596, 597, 598, 599]],
1592
+ ['12c', 'RASQSVxNNYLA', [551, 552, 553, 556, 561, 562, 571, 581, 596, 597, 598, 599]],
1593
+ ['12d', 'rSShSIrSrrVh', [551, 552, 553, 556, 561, 562, 571, 581, 596, 597, 598, 599]]],
1594
+ 13: [['13a', 'SGSSSNIGNNYVS', [551, 552, 554, 555, 556, 557, 561, 562, 571, 572, 597, 598, 599]],
1595
+ ['13b', 'TRSSGSLANYYVQ', [551, 552, 553, 554, 556, 561, 562, 563, 571, 572, 597, 598, 599]]],
1596
+ 14: [['14a', 'RSSTGAVTTSNYAN', [551, 552, 553, 554, 555, 561, 562, 563, 564, 571, 572, 597, 598, 599]],
1597
+ ['14b', 'TGTSSDVGGYNYVS', [551, 552, 554, 555, 556, 557, 561, 562, 571, 572, 596, 597, 598, 599]]],
1598
+ 15: [['15', 'XXXXXXXXXXXXXXX', [551, 552, 553, 556, 561, 562, 563, 581, 582, 594, 595, 596, 597, 598, 599]]],
1599
+ 16: [['16', 'XXXXXXXXXXXXXXXX', [551, 552, 553, 556, 561, 562, 563, 581, 582, 583, 594, 595, 596, 597, 598, 599]]],
1600
+ 17: [['17', 'XXXXXXXXXXXXXXXXX', [551, 552, 553, 556, 561, 562, 563, 581, 582, 583, 584, 594, 595, 596, 597, 598, 599]]]
1601
+ }
1602
+
1603
+ if length in L1_sequences: # Use the pre-defined motif
1604
+ # Find the maximum scoring canonical form for this length.
1605
+ curr_max = None, -10000
1606
+ for canonical in L1_sequences[length]:
1607
+ sub_score = 0
1608
+ for i in range( length ):
1609
+ try:
1610
+ sub_score += blosum62[ (seq[i][1].upper(), canonical[1][i].upper() ) ]
1611
+ except KeyError:
1612
+ sub_score += blosum62[ (canonical[1][i].upper(), seq[i][1].upper() ) ]
1613
+ if sub_score > curr_max[1]:
1614
+ curr_max = canonical, sub_score
1615
+
1616
+ # return the annotations
1617
+ return curr_max[0][2]
1618
+ else: # Use a symmetric numbering about the anchors.
1619
+ ordered_deletions = []
1620
+ for p1,p2 in zip( list(range(551,575)), list(range(599, 575,-1))): ordered_deletions += [ p2,p1 ]
1621
+ ordered_deletions.append(575)
1622
+ return sorted( ordered_deletions[:length] )
1623
+
1624
def gap_missing( numbering ):
    '''
    Flatten the per-region numbering and insert a gap wherever a number is skipped.

    numbering is a list of lists of ((number, insertion code), residue) entries. The entries
    are concatenated in order, and every integer that is skipped between the previous number
    (starting from 0) and the next one is filled with ((number, ' '), '-'). All schemes except
    wolfguy are continuously numbered.
    '''
    filled = []
    previous_number = 0
    for position, residue in sum( numbering, [] ):
        # Nothing is produced here unless at least one number was jumped over.
        filled.extend( ((missing, ' '), '-') for missing in range( previous_number + 1, position[0] ) )
        filled.append( (position, residue) )
        previous_number = position[0]
    return filled
1636
+
1637
+
1638
+ ######################
1639
+ # Annotation of CDR3 #
1640
+ ######################
1641
+
1642
def get_cdr3_annotations(length, scheme="imgt", chain_type=""):
    """
    Given a length of a cdr3 give back a list of the annotations that should be applied to the sequence.

    This function should be deprecated.

    length is the number of residues in the CDR3. scheme is "imgt", "chothia" or "kabat";
    chain_type ("heavy" or "light") is only consulted for chothia and kabat.

    Returns a list of (number, insertion code) tuples. For imgt the list always has at least
    13 entries; when length is below 13 the unused middle entries are None. For chothia and
    kabat the list has exactly `length` entries in ascending order.

    Raises AssertionError when the scheme / chain type combination is not implemented or when
    there are more insertions than the scheme has letters for. The checks are raised
    explicitly rather than with `assert` statements so they are still enforced when Python
    runs with -O.
    """
    az = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    za = "ZYXWVUTSRQPONMLKJIHGFEDCBA"

    if scheme == "imgt":
        start, end = 105, 118  # start (inclusive) end (exclusive)
        if length - 13 >= 50:  # We ran out of letters.
            raise AssertionError("Too many insertions for numbering scheme to handle")
        annotations = [None for _ in range(max(length, 13))]
        front = 0
        back = -1
        # Hand out the 13 regular positions alternately from the front and from the back.
        for i in range(min(length, 13)):
            if i % 2:
                annotations[back] = (end + back, " ")
                back -= 1
            else:
                annotations[front] = (start + front, " ")
                front += 1
        # Add insertions onto 111 and 112 in turn. At this point front is 7 and back is -7,
        # so the offsets below start both letter sequences at 'A'.
        for i in range(max(0, length - 13)):
            if i % 2:
                annotations[back] = (112, za[back + 6])
                back -= 1
            else:
                annotations[front] = (111, az[front - 7])
                front += 1
        return annotations

    elif scheme in ["chothia", "kabat"] and chain_type == "heavy":
        # Number forwards from 93
        insertions = max(length - 10, 0)
        if insertions >= 27:  # We ran out of letters.
            raise AssertionError("Too many insertions for numbering scheme to handle")
        # Listed in the order in which positions are dropped for short loops (first entry first).
        ordered_deletions = [(100, ' '), (99, ' '), (98, ' '), (97, ' '), (96, ' '), (95, ' '), (101, ' '), (102, ' '), (94, ' '), (93, ' ')]
        return sorted(ordered_deletions[max(0, 10 - length):] + [(100, a) for a in az[:insertions]])

    elif scheme in ["chothia", "kabat"] and chain_type == "light":
        # Number forwards from 89
        insertions = max(length - 9, 0)
        if insertions >= 27:  # We ran out of letters.
            raise AssertionError("Too many insertions for numbering scheme to handle")
        # Listed in the order in which positions are dropped for short loops (first entry first).
        ordered_deletions = [(95, ' '), (94, ' '), (93, ' '), (92, ' '), (91, ' '), (96, ' '), (97, ' '), (90, ' '), (89, ' ')]
        return sorted(ordered_deletions[max(0, 9 - length):] + [(95, a) for a in az[:insertions]])

    else:
        raise AssertionError("Unimplemented scheme")
1691
+
app.py CHANGED
@@ -194,7 +194,7 @@ def main():
194
  )
195
 
196
  if uploaded_file is None:
197
- with st.expander('Download examples', expanded=True):
198
  with open('./data/examples/7DK2_AB_C.pdb', 'r') as f:
199
  st.download_button(
200
  'RBD + Antibody Complex',
 
194
  )
195
 
196
  if uploaded_file is None:
197
+ with st.expander("Don't know what to upload? Try these examples", expanded=True):
198
  with open('./data/examples/7DK2_AB_C.pdb', 'r') as f:
199
  st.download_button(
200
  'RBD + Antibody Complex',
requirements.txt CHANGED
@@ -2,8 +2,6 @@
2
  torch
3
  torchvision
4
  biopython==1.79
5
- git+https://github.com/oxpig/ANARCI.git
6
- git+https://github.com/prihoda/AbNumber.git
7
  joblib
8
  lmdb
9
  tqdm
 
2
  torch
3
  torchvision
4
  biopython==1.79
 
 
5
  joblib
6
  lmdb
7
  tqdm