Spaces:
Runtime error
Runtime error
Update
Browse files- abnumber/__init__.py +6 -0
- abnumber/__version__.py +1 -0
- abnumber/alignment.py +195 -0
- abnumber/chain.py +781 -0
- abnumber/common.py +133 -0
- abnumber/exceptions.py +2 -0
- abnumber/germlines.py +684 -0
- abnumber/position.py +158 -0
- anarci/.DS_Store +0 -0
- anarci/__init__.py +3 -0
- anarci/anarci.py +1013 -0
- anarci/dat/.DS_Store +0 -0
- anarci/dat/HMMs/ALL.hmm +0 -0
- anarci/dat/HMMs/ALL.hmm.h3f +0 -0
- anarci/dat/HMMs/ALL.hmm.h3i +0 -0
- anarci/dat/HMMs/ALL.hmm.h3m +0 -0
- anarci/dat/HMMs/ALL.hmm.h3p +0 -0
- anarci/germlines.py +0 -0
- anarci/schemes.py +1691 -0
- app.py +1 -1
- requirements.txt +0 -2
abnumber/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abnumber.__version__ import __version__
|
2 |
+
from abnumber.chain import Chain
|
3 |
+
from abnumber.position import Position, sort_positions
|
4 |
+
from abnumber.alignment import Alignment
|
5 |
+
from abnumber.common import SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS
|
6 |
+
from abnumber.exceptions import ChainParseError
|
abnumber/__version__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__version__ = '0.3.0'
|
abnumber/alignment.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Union
|
2 |
+
|
3 |
+
from abnumber.common import is_similar_residue, is_integer
|
4 |
+
from abnumber.position import Position
|
5 |
+
|
6 |
+
|
7 |
+
class Alignment:
    """Antibody chain alignment of two or more chains

    >>> from abnumber import Chain
    >>>
    >>> seq1 = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSAKTTAP'
    >>> chain1 = Chain(seq1, scheme='imgt')
    >>>
    >>> seq2 = 'QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYDDYLDRWGQGTTLTVSSAKTTAP'
    >>> chain2 = Chain(seq2, scheme='imgt')
    >>> alignment = chain1.align(chain2)

    Alignment can be sliced and iterated:

    >>> for pos, (aa, bb) in alignment[:'5']:
    >>>     print(pos, aa, bb)
    H1 Q Q
    H2 V V
    H3 Q Q
    H4 L L
    H5 Q V
    ...

    """

    def __init__(self, positions, residues, scheme, chain_type):
        """Store parallel lists of positions and per-position residues.

        :param positions: list of position objects, one per alignment column
        :param residues: list with one entry per position, each holding the residues of all aligned chains
        :param scheme: numbering scheme name, used when parsing string positions
        :param chain_type: chain type, used when parsing string positions
        """
        assert isinstance(positions, list), 'Expected list of positions and residues. ' \
                                            'Use chain.align(other) to create an alignment.'
        assert len(positions) == len(residues)
        unique_cdr_definitions = set(pos.cdr_definition for pos in positions)
        assert len(unique_cdr_definitions) <= 1, f'Aligned chains should use the same CDR definitions, got: {unique_cdr_definitions}'
        self.positions = positions
        self.residues = residues
        self.scheme = scheme
        self.chain_type = chain_type
        # Pre-built (position, residues) pairs served by __iter__
        self._zipped = list(zip(self.positions, self.residues))

    def __repr__(self):
        return self.format()

    def __iter__(self):
        """Iterate over ``(position, residues)`` pairs"""
        yield from self._zipped

    def __len__(self):
        """Number of aligned positions (columns)"""
        return len(self.positions)

    def __getitem__(self, item):
        """Return residues at a position, or a new sliced :class:`Alignment` when ``item`` is a slice"""
        if isinstance(item, slice):
            if item.step is not None and item.step != 1:
                raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
            return self.slice(start=item.start, stop=item.stop)
        pos = self._parse_position(item)
        raw_pos = self.positions.index(pos)
        return self.residues[raw_pos]

    def slice(self, start: Union[str, int, 'Position'] = None, stop: Union[str, int, 'Position'] = None,
              stop_inclusive: bool = True, allow_raw: bool = False):
        """Create a slice of this alignment

        You can also slice directly using ``alignment['111':'112A']`` or ``alignment.raw[10:20]``.

        :param start: Slice start position (inclusive), :class:`Position` or string (e.g. '111A')
        :param stop: Slice stop position (inclusive), :class:`Position` or string (e.g. '112A')
        :param stop_inclusive: Include stop position in slice
        :param allow_raw: Allow unaligned numeric indexing from 0 to length of sequence - 1
        :return: new sliced Alignment object
        """

        start = self._parse_position(start, allow_raw=allow_raw) if start is not None else None
        stop = self._parse_position(stop, allow_raw=allow_raw) if stop is not None else None

        new_positions = []
        new_residues = []
        for pos, residues in zip(self.positions, self.residues):
            if start is not None and pos < start:
                continue
            if stop is not None and (pos > stop or (not stop_inclusive and pos >= stop)):
                break
            new_positions.append(pos)
            new_residues.append(residues)

        return Alignment(positions=new_positions, residues=new_residues, scheme=self.scheme, chain_type=self.chain_type)

    def _parse_position(self, position: Union[int, str, 'Position'], allow_raw=False):
        """Create :class:`Position` key object from string or int.

        Note: The position should only be used for indexing, CDR definition is not preserved!

        :param position: Numeric or string position representation
        :param allow_raw: Also allow unaligned numeric (int) indexing from 0 to length of sequence - 1
        :return: new Position object, should only be used for indexing, CDR definition is not preserved!
                 ``None`` when a raw index is past the last position (treated as an open bound by :meth:`slice`)
        :raises IndexError: when the key cannot be interpreted, or raw indexing is used without ``allow_raw``
        """
        if isinstance(position, str):
            return Position.from_string(position, chain_type=self.chain_type, scheme=self.scheme)
        if isinstance(position, Position):
            return position
        try:
            position = int(position)
        except (TypeError, ValueError):
            # ValueError covers values such as float('nan') that int() rejects without a TypeError
            raise IndexError(f'Invalid position key, expected Position, string or integer, got {type(position)}: "{position}"') from None
        if not allow_raw:
            raise IndexError("Use chain.raw[i] for raw numeric indexing or pass allow_raw=True. "
                             "For named position indexing, use string (e.g. chain['111A'] or chain['H111A'])")
        if position >= len(self.positions):
            return None
        return self.positions[position]

    def format(self, mark_identity=True, mark_cdrs=True):
        """Format alignment to string

        An alignment without any positions (for example an empty slice) is formatted as an empty string.

        :param mark_identity: Add BLAST style middle line showing identity (``|``), similar residue (``+``) or different residue (``.``)
        :param mark_cdrs: Add line highlighting CDR regions using ``^``
        :return: formatted string
        """

        def _identity_symbol(a, b):
            return '|' if a == b else ('+' if is_similar_residue(a, b) else '.')

        # Nothing to render; also avoids indexing residues[0] / positions[0] below
        if not self.positions:
            return ''

        lines = []
        # The first residues entry tells how many sequences are aligned
        for i in range(len(self.residues[0])):
            if mark_identity and i != 0:
                # Identity line compares each sequence with the one printed directly above it
                lines.append(''.join(_identity_symbol(aas[i], aas[i-1]) for pos, aas in self))
            lines.append(''.join(aas[i] for pos, aas in self))
        if mark_cdrs:
            if self.positions[0].cdr_definition == 'kabat':
                lines.append(''.join('^' if pos.is_in_cdr() else ("°" if pos.is_in_vernier() else ' ') for pos in self.positions))
            else:
                lines.append(''.join('^' if pos.is_in_cdr() else ' ' for pos in self.positions))
        return '\n'.join(lines)

    def print(self, mark_identity=True, mark_cdrs=True):
        """Print string representation of alignment created using :meth:`Alignment.format`

        >>> alignment.print()
        QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPS-RGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
        ||||.||||||.||||+|||||||||||.||||||||||||||||+||||||||.|.||||||||||||||||||||||||||.+|||||||||||||||||....||.|||||||||||
        QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYD--DYLDRWGQGTTLTVSS
                                 ^^^^^^^^                 ^^^^^^^^^                                      ^^^^^^^^^^^^
        >>> alignment.print(mark_identity=False, mark_cdrs=False)
        QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPS-RGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
        QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYD--DYLDRWGQGTTLTVSS

        :param mark_identity: Add BLAST style middle line showing identity (``|``), similar residue (``+``) or different residue (``.``)
        :param mark_cdrs: Add line highlighting CDR regions using ``^``
        """
        print(self.format(mark_identity=mark_identity, mark_cdrs=mark_cdrs))

    def has_mutation(self):
        """Check if there is a mutation in the alignment or not"""
        return any(len(set(aas)) != 1 for aas in self.residues)

    def num_mutations(self):
        """Get number of mutations (positions with more than one type of residue)"""
        return sum(len(set(aas)) != 1 for aas in self.residues)

    @property
    def raw(self):
        """Access raw representation of this alignment to allow unaligned numeric indexing and slicing

        Numbering of ``alignment.raw`` starts at 0:

        - ``alignment.raw[i]`` returns the residues stored at the ``i``-th aligned position
        - ``alignment.raw[a:b]`` returns a new :class:`Alignment`; the end is exclusive (Python style)

        Slicing the alignment itself with strings (``alignment['1':'10']``) is based on scheme numbering
        and the end is inclusive.

        :return: Raw alignment accessor that can be sliced or indexed to produce a new :class:`Alignment` object
        """
        return RawAlignmentAccessor(self)
|
177 |
+
|
178 |
+
|
179 |
+
class RawAlignmentAccessor:
    """Helper returned by ``Alignment.raw`` that indexes an alignment by 0-based integer offsets"""

    def __init__(self, alignment: Alignment):
        self.alignment = alignment

    def __getitem__(self, item):
        """Return residues at integer offset ``item``, or a sliced alignment for an integer slice

        :raises IndexError: for non-integer keys or slice bounds, or a slice step other than 1
        """
        if not isinstance(item, slice):
            if not is_integer(item):
                raise IndexError(f'Expected int indexing for alignment.raw, got {type(item)}: {item}')
            # Translate the offset to its position object, then reuse the alignment's own lookup
            key = self.alignment.positions[item]
            return self.alignment[key]

        step, first, last = item.step, item.start, item.stop
        if step is not None and step != 1:
            raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
        if first is not None and not is_integer(first):
            raise IndexError(f'Expected int start index for alignment.raw, got {type(item.start)}: {item.start}')
        if last is not None and not is_integer(last):
            raise IndexError(f'Expected int end index for alignment.raw, got {type(item.stop)}: {item.stop}')
        # Raw slices follow Python semantics, so the stop offset is excluded
        return self.alignment.slice(start=first, stop=last, stop_inclusive=False, allow_raw=True)
|
abnumber/chain.py
ADDED
@@ -0,0 +1,781 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import OrderedDict
|
2 |
+
from typing import Union, List, Generator, Tuple
|
3 |
+
from Bio import SeqIO
|
4 |
+
from Bio.SeqRecord import SeqRecord
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
from abnumber.alignment import Alignment
|
8 |
+
from abnumber.common import _anarci_align, _validate_chain_type, SUPPORTED_SCHEMES, SUPPORTED_CDR_DEFINITIONS, \
|
9 |
+
is_integer, SCHEME_BORDERS, _get_unique_chains
|
10 |
+
from abnumber.exceptions import ChainParseError
|
11 |
+
import numpy as np
|
12 |
+
from Bio.Seq import Seq
|
13 |
+
|
14 |
+
from abnumber.position import Position
|
15 |
+
|
16 |
+
|
17 |
+
class Chain:
|
18 |
+
"""
|
19 |
+
Antibody chain aligned to a chosen antibody numbering scheme
|
20 |
+
|
21 |
+
:example:
|
22 |
+
|
23 |
+
>>> from abnumber import Chain
|
24 |
+
>>>
|
25 |
+
>>> seq = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSAKTTAPSVYPLA'
|
26 |
+
>>> chain = Chain(seq, scheme='imgt')
|
27 |
+
>>> chain
|
28 |
+
QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
|
29 |
+
^^^^^^^^ ^^^^^^^^ ^^^^^^^^^^^^
|
30 |
+
|
31 |
+
Chain can be iterated:
|
32 |
+
|
33 |
+
>>> for pos, aa in chain:
|
34 |
+
>>> print(pos, aa)
|
35 |
+
H1 Q
|
36 |
+
H2 V
|
37 |
+
H3 Q
|
38 |
+
H4 L
|
39 |
+
H5 Q
|
40 |
+
...
|
41 |
+
|
42 |
+
Chain can also be indexed and sliced using scheme numbering:
|
43 |
+
|
44 |
+
>>> chain['5']
|
45 |
+
'Q'
|
46 |
+
>>> for pos, aa in chain['H2':'H5']:
|
47 |
+
>>> print(pos, aa)
|
48 |
+
H2 V
|
49 |
+
H3 Q
|
50 |
+
H4 L
|
51 |
+
H5 Q
|
52 |
+
|
53 |
+
:param sequence: Unaligned string sequence
|
54 |
+
:param name: Optional sequence identifier
|
55 |
+
:param scheme: Numbering scheme: One of ``imgt``, ``chothia``, ``kabat``, ``aho``
|
56 |
+
:param cdr_definition: Numbering scheme to be used for definition of CDR regions. Same as ``scheme`` by default.
|
57 |
+
One of ``imgt``, ``chothia``, ``kabat``, ``north``. Required for ``aho``.
|
58 |
+
:param assign_germline: Assign germline name using ANARCI based on best sequence identity
|
59 |
+
:param allowed_species: Allowed species for germline assignment. Use ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
|
60 |
+
:param aa_dict: (Internal use only) Create Chain object directly from dictionary of region objects (internal use)
|
61 |
+
:param tail: (Internal use only) Constant region sequence
|
62 |
+
:param species: (Internal use only) Species as identified by ANARCI
|
63 |
+
:param germline: (Internal use only) Germline as identified by ANARCI
|
64 |
+
"""
|
65 |
+
|
66 |
+
def __init__(self, sequence, scheme, cdr_definition=None, name=None, assign_germline=False, allowed_species=None, **kwargs):
|
67 |
+
aa_dict = kwargs.pop('aa_dict', None)
|
68 |
+
chain_type = kwargs.pop('chain_type', None)
|
69 |
+
tail = kwargs.pop('tail', None)
|
70 |
+
species = kwargs.pop('species', None)
|
71 |
+
v_gene = kwargs.pop('v_gene', None)
|
72 |
+
j_gene = kwargs.pop('j_gene', None)
|
73 |
+
if isinstance(allowed_species, str):
|
74 |
+
allowed_species = [allowed_species]
|
75 |
+
if len(kwargs):
|
76 |
+
raise TypeError(f'Argument not recognized: {", ".join(kwargs)}')
|
77 |
+
if aa_dict is not None:
|
78 |
+
if sequence is not None:
|
79 |
+
raise ChainParseError('Only one of aa_dict= and sequence= can be provided')
|
80 |
+
assert isinstance(aa_dict, dict), f'Expected dict, got: {type(aa_dict)}'
|
81 |
+
assert tail is not None
|
82 |
+
assert chain_type is not None
|
83 |
+
else:
|
84 |
+
if sequence is None:
|
85 |
+
raise ChainParseError('Expected sequence, got None')
|
86 |
+
if not isinstance(sequence, str) and not isinstance(sequence, Seq):
|
87 |
+
raise ChainParseError(f'Expected string or Seq, got {type(sequence)}: {sequence}')
|
88 |
+
if '-' in sequence:
|
89 |
+
raise ChainParseError(f'Please provide an unaligned sequence, got: {sequence}')
|
90 |
+
if chain_type is not None:
|
91 |
+
raise ChainParseError('Do not use chain_type= when providing sequence=, it will be inferred automatically')
|
92 |
+
if tail is not None:
|
93 |
+
raise ChainParseError('Do not use tail= when providing sequence=, it will be inferred automatically')
|
94 |
+
if isinstance(sequence, Seq):
|
95 |
+
sequence = str(sequence)
|
96 |
+
results = _anarci_align(sequence, scheme=scheme, allowed_species=allowed_species, assign_germline=assign_germline)
|
97 |
+
if len(results) > 1:
|
98 |
+
raise ChainParseError(f'Found {len(results)} antibody domains in sequence: "{sequence}"')
|
99 |
+
aa_dict, chain_type, tail, species, v_gene, j_gene = results[0]
|
100 |
+
|
101 |
+
_validate_chain_type(chain_type)
|
102 |
+
|
103 |
+
self.name: str = name
|
104 |
+
"""User-provided sequence identifier"""
|
105 |
+
self.chain_type: str = chain_type
|
106 |
+
"""Chain type as identified by ANARCI: ``H`` (heavy), ``K`` (kappa light) or ``L`` (lambda light)
|
107 |
+
|
108 |
+
See also :meth:`Chain.is_heavy_chain` and :meth:`Chain.is_light_chain`.
|
109 |
+
"""
|
110 |
+
self.scheme: str = scheme
|
111 |
+
"""Numbering scheme used to align the sequence"""
|
112 |
+
self.cdr_definition: str = cdr_definition or scheme
|
113 |
+
"""Numbering scheme to be used for definition of CDR regions (same as ``scheme`` by default)"""
|
114 |
+
self.tail: str = tail
|
115 |
+
"""Constant region sequence"""
|
116 |
+
self.species: str = species
|
117 |
+
"""Species as identified by ANARCI"""
|
118 |
+
self.v_gene: str = v_gene
|
119 |
+
"""V gene germline as identified by ANARCI (if assign_germline is True)"""
|
120 |
+
self.j_gene: str = j_gene
|
121 |
+
"""J gene germline as identified by ANARCI (if assign_germline is True)"""
|
122 |
+
|
123 |
+
self.fr1_dict = OrderedDict()
|
124 |
+
self.cdr1_dict = OrderedDict()
|
125 |
+
self.fr2_dict = OrderedDict()
|
126 |
+
self.cdr2_dict = OrderedDict()
|
127 |
+
self.fr3_dict = OrderedDict()
|
128 |
+
self.cdr3_dict = OrderedDict()
|
129 |
+
self.fr4_dict = OrderedDict()
|
130 |
+
|
131 |
+
self._init_from_dict(aa_dict, allowed_species=allowed_species)
|
132 |
+
|
133 |
+
def _init_from_dict(self, aa_dict, allowed_species):
    """Validate scheme settings and distribute residues into the FR/CDR region dictionaries.

    :param aa_dict: dictionary of Position -> amino acid, numbered in ``self.scheme``
    :param allowed_species: passed to ANARCI when the sequence has to be renumbered for the CDR definition
    :raises NotImplementedError: when the scheme or the CDR definition is not supported
    :raises ValueError: when the CDR definition is ``aho``
    """
    if self.scheme not in SUPPORTED_SCHEMES:
        raise NotImplementedError(f'Scheme "{self.scheme}" is not supported. Available schemes: {", ".join(SUPPORTED_SCHEMES)}')
    if self.cdr_definition in ['aho']:
        raise ValueError('CDR regions are not defined for AHo, '
                         'you need to specify cdr_definition="chothia" or another scheme for CDR extraction.')
    if self.cdr_definition not in SUPPORTED_CDR_DEFINITIONS:
        # Report the offending CDR definition and the list it was checked against
        raise NotImplementedError(f'CDR definition "{self.cdr_definition}" is not supported. Available definitions: {", ".join(SUPPORTED_CDR_DEFINITIONS)}')
    # list of region start positions
    borders = SCHEME_BORDERS[self.cdr_definition] if self.cdr_definition in SCHEME_BORDERS else SCHEME_BORDERS[f'{self.cdr_definition}_{self.chain_type}']

    regions_list = [self.fr1_dict, self.cdr1_dict, self.fr2_dict, self.cdr2_dict, self.fr3_dict, self.cdr3_dict, self.fr4_dict]
    region_idx = 0

    sorted_positions = sorted(aa_dict.keys())

    # Positions can be used directly only if every one already carries this chain's CDR definition
    cdr_definition_ready = True
    for pos in sorted_positions:
        assert pos.scheme == self.scheme, f'Schemes of provided position ({pos.scheme}) does not match Chain scheme ({self.scheme})'
        if pos.cdr_definition != self.cdr_definition:
            cdr_definition_ready = False

    if cdr_definition_ready:
        combined_aa_dict = aa_dict
    else:
        # Renumber the same residues in the CDR-definition scheme ('north' is aligned as 'chothia')
        # and attach the resulting numbers to copies of the original positions.
        seq = ''.join(aa_dict[pos] for pos in sorted_positions)
        renumbered_aa_dict = _anarci_align(
            seq,
            scheme=self.cdr_definition if self.cdr_definition != 'north' else 'chothia',
            allowed_species=allowed_species
        )[0][0]
        cdr_definition_positions = [pos.number for pos in sorted(renumbered_aa_dict.keys())]
        combined_aa_dict = {}
        # NOTE(review): zip pairs positions by order and stops at the shorter list - presumably both
        # numberings cover the same residues; confirm against _anarci_align.
        for orig_pos, cdr_definition_position in zip(sorted_positions, cdr_definition_positions):
            aa = aa_dict[orig_pos]
            pos = orig_pos.copy()
            pos.set_cdr_definition(self.cdr_definition, cdr_definition_position)
            combined_aa_dict[pos] = aa

    for pos in sorted(combined_aa_dict.keys()):
        assert isinstance(pos, Position), f'Expected Position object, got {type(pos)}: {pos}'
        aa = combined_aa_dict[pos].upper().strip()
        # Skip gap / stop / empty placeholders
        if aa in ('*', '-', '', '.'):
            continue
        # Positions are visited in sorted order, so the region index only ever moves forward
        while pos.cdr_definition_position >= borders[region_idx]:
            region_idx += 1
        regions_list[region_idx][pos] = aa
|
180 |
+
|
181 |
+
def __repr__(self):
    """Return the default formatted representation produced by ``self.format()``"""
    return self.format()
|
183 |
+
|
184 |
+
def __str__(self):
    """Return the chain sequence as stored in ``self.seq``"""
    return self.seq
|
186 |
+
|
187 |
+
def __iter__(self):
|
188 |
+
yield from self.positions.items().__iter__()
|
189 |
+
|
190 |
+
def __getitem__(self, item):
|
191 |
+
if isinstance(item, slice):
|
192 |
+
if item.step is not None and item.step != 1:
|
193 |
+
raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
|
194 |
+
return self.slice(start=item.start, stop=item.stop)
|
195 |
+
pos = self._parse_position(item)
|
196 |
+
return self.positions[pos]
|
197 |
+
|
198 |
+
def __len__(self):
    """Return the number of entries in ``self.positions``"""
    return len(self.positions)
|
200 |
+
|
201 |
+
def __hash__(self):
|
202 |
+
return hash(self.positions)
|
203 |
+
|
204 |
+
def __eq__(self, other):
|
205 |
+
"""Check chain equality. Only checks scheme, aligned sequence and tail sequence, ignores name, metadata and CDR definitions."""
|
206 |
+
assert isinstance(other, Chain), f'Can only compare Chain to another Chain, got {type(other)}: {other}'
|
207 |
+
return self.positions == other.positions and self.tail == other.tail
|
208 |
+
|
209 |
+
@classmethod
def to_fasta(cls, chains, path_or_fd, keep_tail=False, description=''):
    """Save one chain or an iterable of chains to FASTA (written in the ``fasta-2line`` format)

    :param chains: a single Chain or an iterable of Chain objects
    :param path_or_fd: output path or file handle passed to ``SeqIO.write``
    :param keep_tail: append each chain's tail sequence to the written sequence
    :param description: description used for every record
    :return: value returned by ``SeqIO.write``
    """
    def _as_record(chain):
        return chain.to_seq_record(keep_tail=keep_tail, description=description)

    if isinstance(chains, Chain):
        records = _as_record(chains)
    else:
        # Records are produced lazily while SeqIO.write consumes them
        records = (_as_record(chain) for chain in chains)
    return SeqIO.write(records, path_or_fd, 'fasta-2line')
|
217 |
+
|
218 |
+
@classmethod
def from_fasta(cls, path_or_handle, scheme, cdr_definition=None, as_series=False, as_generator=False, **kwargs) -> Union[List['Chain'], pd.Series, Generator['Chain', None, None]]:
    """Read multiple chains from FASTA

    :param path_or_handle: FASTA path or handle passed to ``SeqIO.parse``
    :param scheme: numbering scheme passed to each Chain
    :param cdr_definition: CDR definition passed to each Chain
    :param as_series: return a pandas Series indexed by chain name instead of a list
    :param as_generator: return a lazy generator of chains (takes precedence over ``as_series``)
    :param kwargs: extra keyword arguments passed to the Chain constructor
    """
    records = SeqIO.parse(path_or_handle, 'fasta')
    lazy_chains = (
        cls(record.seq, name=record.name, scheme=scheme, cdr_definition=cdr_definition, **kwargs)
        for record in records
    )
    if as_generator:
        return lazy_chains
    parsed = list(lazy_chains)
    if not as_series:
        return parsed
    return pd.Series(parsed, index=[chain.name for chain in parsed])
|
229 |
+
|
230 |
+
def to_seq_record(self, keep_tail=False, description=''):
|
231 |
+
"""Create BioPython SeqRecord object from this Chain"""
|
232 |
+
if not self.name:
|
233 |
+
raise ValueError('Name needs to be present to convert to a SeqRecord')
|
234 |
+
seq = Seq(self.seq + self.tail if keep_tail else self.seq)
|
235 |
+
return SeqRecord(seq, id=self.name, description=description)
|
236 |
+
|
237 |
+
@classmethod
def to_anarci_csv(cls, chains: List['Chain'], path):
    """Save multiple chains to ANARCI-like CSV

    The table is built by :meth:`Chain.to_dataframe` and written with ``DataFrame.to_csv``.
    """
    cls.to_dataframe(chains).to_csv(path)
|
242 |
+
|
243 |
+
@classmethod
def to_dataframe(cls, chains: List['Chain']):
    """Produce a Pandas dataframe with aligned chain sequences in the columns

    Note: Contains only positions (columns) that are present in the provided chains,
    so number of columns can differ based on the input.

    Property columns (such as ``chain_type`` and ``species``) come first, in the order they are
    first seen, followed by position columns in Position sort order. Missing residues are filled with ``-``.
    """
    series_list = [chain.to_series() for chain in chains]

    # Each chain can have a different set of positions
    # so we need to sort the columns to make sure they are in the right order
    # this is using the correct Position sorting
    # dict.fromkeys removes duplicates but keeps first-seen order; a plain set would make the
    # order of the string property columns depend on hash randomization.
    columns = dict.fromkeys(c for series in series_list for c in series.index)
    prop_columns = [c for c in columns if not isinstance(c, Position)]
    position_columns = sorted(c for c in columns if isinstance(c, Position))
    # Columns can come from K and L chain, so we need to convert them to string and remove duplicates here
    position_columns_str = list(dict.fromkeys(pos.format(chain_type=False) for pos in position_columns))

    # Get full list of string columns
    columns_str = prop_columns + position_columns_str

    def _column_label(column):
        # Only Position labels are reformatted; property labels are kept as they are
        return column.format(chain_type=False) if isinstance(column, Position) else column

    # Reindex each series using ordered list of string columns
    series_list_ordered = []
    for series in series_list:
        series.index = series.index.map(_column_label)
        series_list_ordered.append(series.reindex(columns_str))

    df = pd.DataFrame(series_list_ordered)[columns_str].fillna('-')
    df.index.name = 'Id'

    return df
|
276 |
+
|
277 |
+
def to_series(self):
|
278 |
+
props = {
|
279 |
+
'chain_type': self.chain_type,
|
280 |
+
'species': self.species
|
281 |
+
}
|
282 |
+
return pd.Series({**props, **self.positions}, name=self.name)
|
283 |
+
|
284 |
+
@classmethod
def from_series(cls, series, scheme, cdr_definition=None) -> 'Chain':
    """Create a Chain from a Series with a ``chain_type`` entry and position-labelled residues

    Index labels starting with a digit are treated as positions; ``-`` and missing values are skipped.

    :param series: Series with ``chain_type``, optional ``species`` and position labels
    :param scheme: numbering scheme of the position labels
    :param cdr_definition: CDR definition passed to the new Chain
    """
    chain_type = series['chain_type']
    species = series.get('species')
    position_labels = [label for label in series.index if label[:1].isnumeric()]
    aa_dict = {}
    for label, aa in series[position_labels].items():
        if aa == '-' or pd.isna(aa):
            continue
        position = Position.from_string(label, chain_type=chain_type, scheme=scheme)
        aa_dict[position] = aa
    return cls(sequence=None, aa_dict=aa_dict, name=series.name, scheme=scheme, cdr_definition=cdr_definition,
               chain_type=chain_type, species=species, tail='')
|
293 |
+
|
294 |
+
@classmethod
def from_anarci_csv(cls, path, scheme, cdr_definition=None, as_series=False) -> Union[List['Chain'], pd.Series]:
    """Read chains from a CSV whose first column is the index, then delegate to :meth:`Chain.from_dataframe`"""
    table = pd.read_csv(path, index_col=0)
    return cls.from_dataframe(table, scheme=scheme, cdr_definition=cdr_definition, as_series=as_series)
|
298 |
+
|
299 |
+
@classmethod
|
300 |
+
def from_dataframe(cls, df, scheme, cdr_definition=None, as_series=False) -> Union[List['Chain'], pd.Series]:
|
301 |
+
chains = [cls.from_series(series, scheme=scheme, cdr_definition=cdr_definition) for i, series in df.iterrows()]
|
302 |
+
if as_series:
|
303 |
+
return pd.Series(chains, index=[c.name for c in chains])
|
304 |
+
return chains
|
305 |
+
|
306 |
+
def format(self, method='wide', **kwargs):
|
307 |
+
"""Format sequence to string
|
308 |
+
|
309 |
+
:param method: use ``"wide"`` for :meth:`Chain.format_wide` or ``"tall"`` for :meth:`Chain.format_tall()`
|
310 |
+
:return: formatted string
|
311 |
+
"""
|
312 |
+
if method == 'wide':
|
313 |
+
return self.format_wide(**kwargs)
|
314 |
+
elif method == 'tall':
|
315 |
+
return self.format_tall(**kwargs)
|
316 |
+
raise ValueError(f'Use method="wide" or method="tall", unknown method: "{method}"')
|
317 |
+
|
318 |
+
def print(self, method='wide', **kwargs):
    """Print string representation using :meth:`Chain.format`

    By default, produces "wide" format with sequence on first line and CDR regions highlighted with ``^`` on second line:

    >>> chain.print()
    QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
                             ^^^^^^^^                 ^^^^^^^^                                      ^^^^^^^^^^^^

    :param method: use ``"wide"`` for :meth:`Chain.format_wide` or ``"tall"`` for :meth:`Chain.format_tall()`
    :param kwargs: passed on to :meth:`Chain.format`
    """
    rendered = self.format(method=method, **kwargs)
    print(rendered)
|
330 |
+
|
331 |
+
def format_tall(self, columns=5):
|
332 |
+
"""Create string with one position per line, showing position numbers and amino acids
|
333 |
+
|
334 |
+
:return: formatted string
|
335 |
+
"""
|
336 |
+
height = int(np.ceil(len(self) / columns))
|
337 |
+
rows = [''] * height
|
338 |
+
for column, start in enumerate(range(0, len(self), height)):
|
339 |
+
chain_slice = self.raw[start:start+height]
|
340 |
+
for row, (pos, aa) in enumerate(chain_slice):
|
341 |
+
rows[row] = rows[row].ljust(column * 15)
|
342 |
+
pos_format = (pos.get_region() + ' ' if pos.is_in_cdr() else '') + pos.format()
|
343 |
+
rows[row] += f'{pos_format.rjust(9)} {aa}'
|
344 |
+
|
345 |
+
return '\n'.join(rows)
|
346 |
+
|
347 |
+
def print_tall(self, columns=5):
    """Print string representation using :meth:`Chain.format_tall`

    Each cell shows the position followed by the amino acid; positions inside a CDR are
    additionally prefixed with their region name.

    :param columns: number of side-by-side columns passed to :meth:`Chain.format_tall`
    """
    rendered = self.format_tall(columns=columns)
    print(rendered)
|
361 |
+
|
362 |
+
def format_wide(self, numbering=False):
    """Create string with sequence on first line and CDR regions highlighted with ``^`` on second line

    With ``numbering`` enabled, header lines are added above the sequence: the position numbers without
    their last digit, the last digit of each position number and, if any position has a letter,
    the position letters. When the CDR definition is ``kabat``, Vernier positions outside of CDRs
    are marked with ``°``.

    :param numbering: Add position numbers on top
    :return: formatted string
    """
    positions = list(self.positions)
    output = []
    if numbering:
        tens_line = ''
        previous = None
        after_double_digit = False
        for pos in positions:
            tens = str(pos.number // 10)
            if tens == previous:
                if after_double_digit:
                    # Special case: After 10, 11, etc, skip adding the space
                    after_double_digit = False
                else:
                    tens_line += ' '
            else:
                if after_double_digit:
                    # Special case: when double digits follow another double digits, do not print the first digit
                    tens = tens[1:]
                tens_line += tens
                if len(tens) > 1:
                    after_double_digit = True
            # Note: this stores the (possibly shortened) value that was just printed
            previous = tens

        output.append(tens_line)
        output.append(''.join(str(pos.number % 10) for pos in positions))
        letters = ''.join(pos.letter or ' ' for pos in positions)
        if letters.strip():
            # Only add the letter line when at least one position has a letter
            output.append(letters)

    output.append(self.seq)

    show_vernier = self.cdr_definition == 'kabat'
    markers = []
    for pos in positions:
        if pos.is_in_cdr():
            markers.append('^')
        elif show_vernier and pos.is_in_vernier():
            markers.append("°")
        else:
            markers.append(' ')
    output.append(''.join(markers))
    return '\n'.join(output)
|
402 |
+
|
403 |
+
def print_wide(self, numbering=False):
    """Print string representation using :meth:`Chain.format_wide`

    >>> chain.print_wide()
    QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
                             ^^^^^^^^                 ^^^^^^^^                                      ^^^^^^^^^^^^

    :param numbering: Add position numbers on top, passed to :meth:`Chain.format_wide`
    """
    wide = self.format_wide(numbering=numbering)
    print(wide)
|
411 |
+
|
412 |
+
def is_heavy_chain(self):
    """Check if this chain is heavy chain (``chain_type=="H"``)"""
    chain_type = self.chain_type
    return chain_type == 'H'
|
415 |
+
|
416 |
+
def is_light_chain(self):
    """Check if this chain is light chain (``chain_type=="K" or chain_type=="L"``)"""
    if self.is_lambda_light_chain():
        return True
    return self.is_kappa_light_chain()
|
419 |
+
|
420 |
+
def is_lambda_light_chain(self):
    """Check if this chain is lambda light chain (``chain_type=="L"``)"""
    chain_type = self.chain_type
    return chain_type == 'L'
|
423 |
+
|
424 |
+
def is_kappa_light_chain(self):
    """Check if this chain is kappa light chain (``chain_type=="K"``)"""
    chain_type = self.chain_type
    return chain_type == 'K'
|
427 |
+
|
428 |
+
def align(self, *other) -> 'Alignment':
    """Align this chain to other chains by using their existing numbering

    >>> from abnumber import Chain
    >>>
    >>> seq1 = 'QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPSRGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSSAKTTAP'
    >>> chain1 = Chain(seq1, scheme='imgt')
    >>>
    >>> seq2 = 'QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYDDYLDRWGQGTTLTVSSAKTTAP'
    >>> chain2 = Chain(seq2, scheme='imgt')
    >>>
    >>> alignment = chain1.align(chain2)
    >>> print(alignment.format())
    QVQLQQSGAELARPGASVKMSCKASGYTFTRYTMHWVKQRPGQGLEWIGYINPS-RGYTNYNQKFKDKATLTTDKSSSTAYMQLSSLTSEDSAVYYCARYYDDHYCLDYWGQGTTLTVSS
    ||||.||||||.||||+|||||||||||.||||||||||||||||+||||||||.|.||||||||||||||||||||||||||.+|||||||||||||||||....||.|||||||||||
    QVQLVQSGAELDRPGATVKMSCKASGYTTTRYTMHWVKQRPGQGLDWIGYINPSDRSYTNYNQKFKDKATLTTDKSSSTAYMQKTSLTSEDSAVYYCARYYD--DYLDRWGQGTTLTVSS
                             ^^^^^^^^                 ^^^^^^^^^                                      ^^^^^^^^^^^^

    :param other: The :class:`Chain` object to align, can be repeated to create a multiple sequence alignment
    :return: :class:`Alignment` object
    """
    for chain in other:
        assert isinstance(chain, Chain), f'Expected Chain object, got {type(chain)}: {chain}'
    # This chain always comes first, followed by the other chains in the given order
    pos_dicts = [self.positions] + [chain.positions for chain in other]

    unique_cdr_definitions = {pos.cdr_definition for pos_dict in pos_dicts for pos in pos_dict.keys()}
    assert len(unique_cdr_definitions) <= 1, f'Aligned chains should use the same CDR definitions, got: {unique_cdr_definitions}'

    # Union of the positions of all chains, in sorted order
    all_positions = set()
    for pos_dict in pos_dicts:
        all_positions.update(pos_dict.keys())
    shared_pos = sorted(all_positions)

    # One tuple per position, with '-' for chains that do not contain that position
    residues = []
    for pos in shared_pos:
        residues.append(tuple(pos_dict.get(pos, '-') for pos_dict in pos_dicts))
    return Alignment(shared_pos, residues, chain_type=self.chain_type, scheme=self.scheme)
|
460 |
+
|
461 |
+
def clone(self, replace_seq: str = None):
    """Create a copy of this chain, optionally with a replacement sequence

    Delegates to :meth:`Chain.slice` without a start or stop position.

    :param replace_seq: Optional replacement sequence, needs to be the same length
    :return: new Chain object
    """
    duplicate = self.slice(replace_seq=replace_seq)
    return duplicate
|
468 |
+
|
469 |
+
def slice(self, replace_seq: str = None, start: Union[str, int, 'Position'] = None,
          stop: Union[str, int, 'Position'] = None, stop_inclusive: bool = True, allow_raw: bool = False):
    """Create a slice of this chain, optionally with a replacement sequence that is placed into the same numbering

    You can also slice directly using ``chain['111':'112A']`` or ``chain.raw[10:20]``.

    :param replace_seq: Optional replacement sequence, needs to be the same length as the whole
                        (unsliced) chain; residues are taken at the index of each kept position
    :param start: Optional slice start position (inclusive), :class:`Position` or string (e.g. '111A')
    :param stop: Optional slice stop position (inclusive), :class:`Position` or string (e.g. '112A')
    :param stop_inclusive: Include stop position in slice
    :param allow_raw: Allow unaligned numeric indexing from 0 to length of sequence - 1
    :return: new Chain object
    """
    positions = self.positions
    if replace_seq is not None:
        assert len(replace_seq) == len(positions), 'Sequence needs to be the same length'

    # A parsed border can also be None (raw index past the last position), which means "unbounded" below
    if start is not None:
        start = self._parse_position(start, allow_raw=allow_raw)
    if stop is not None:
        stop = self._parse_position(stop, allow_raw=allow_raw)

    selected = {}
    for index, (pos, aa) in enumerate(positions.items()):
        if start is not None and pos < start:
            continue
        if stop is not None:
            if pos > stop:
                break
            if not stop_inclusive and pos >= stop:
                break
        selected[pos] = aa if replace_seq is None else replace_seq[index]

    return Chain(
        sequence=None,
        aa_dict=selected,
        name=self.name,
        scheme=self.scheme,
        chain_type=self.chain_type,
        cdr_definition=self.cdr_definition,
        tail=self.tail,
        species=self.species,
        v_gene=self.v_gene,
        j_gene=self.j_gene
    )
|
509 |
+
|
510 |
+
def renumber(self, scheme=None, cdr_definition=None, allowed_species=None):
    """Return copy of this chain aligned using a different numbering scheme or CDR definition

    The new chain is created from the sequence of this chain followed by its tail. When only ``scheme``
    is given, it is also used as the CDR definition. Germline assignment is requested only if this
    chain has a V gene assigned.

    :param scheme: Change numbering scheme: One of ``imgt``, ``chothia``, ``kabat``, ``aho``.
    :param cdr_definition: Change CDR definition scheme: One of ``imgt``, ``chothia``, ``kabat``, ``north``.
    :param allowed_species: ``None`` to allow all species, or one or more of: ``'human', 'mouse','rat','rabbit','rhesus','pig','alpaca'``
    :return: new Chain object
    """
    full_sequence = self.seq + self.tail
    target_scheme = scheme or self.scheme
    target_cdr_definition = cdr_definition or scheme or self.cdr_definition
    return Chain(
        full_sequence,
        name=self.name,
        allowed_species=allowed_species,
        scheme=target_scheme,
        cdr_definition=target_cdr_definition,
        assign_germline=self.v_gene is not None
    )
|
526 |
+
|
527 |
+
def graft_cdrs_onto(self, other: 'Chain', backmutate_vernier=False, backmutations: List[Union['Position',str]] = None, name: str = None) -> 'Chain':
    """Graft CDRs from this Chain onto another chain

    :param other: Chain to graft CDRs into (source of frameworks and tail sequence)
    :param backmutate_vernier: Also graft all Kabat Vernier positions from this chain (perform backmutations)
    :param backmutations: Optional list of positions that should additionally be grafted from this chain
                          (str or :class:`Position`)
    :param name: Name of new Chain. If not provided, use name of this chain.
    :return: Chain with CDRs grafted from this chain and frameworks from the given chain
    :raises AssertionError: if the chains differ in numbering scheme, CDR definition or chain type
    """
    assert self.scheme == other.scheme, \
        f'Sequences need to have the same numbering scheme, got {self.scheme} and {other.scheme}'
    assert self.cdr_definition == other.cdr_definition, \
        f'Sequences need to have the same CDR definition, got {self.cdr_definition} and {other.cdr_definition}'
    assert self.chain_type == other.chain_type, \
        f'Sequences need to have the same chain type, got {self.chain_type} and {other.chain_type}'

    # None stands for "no extra backmutations"; it replaces a mutable ``[]`` default shared across calls
    if backmutations is None:
        backmutations = []
    backmutations = [self._parse_position(pos) for pos in backmutations]

    # Start from the non-CDR positions of the other chain...
    grafted_dict = {pos: aa for pos, aa in other if not pos.is_in_cdr()}
    # ...and add (or overwrite with) CDRs and requested backmutations from this chain
    for pos, aa in self:
        if pos.is_in_cdr() or (backmutate_vernier and pos.is_in_vernier()) or pos in backmutations:
            grafted_dict[pos] = aa

    return Chain(sequence=None, aa_dict=grafted_dict, name=name or self.name, chain_type=self.chain_type,
                 scheme=self.scheme, cdr_definition=self.cdr_definition, tail=other.tail,
                 v_gene=other.v_gene, j_gene=other.j_gene)
|
553 |
+
|
554 |
+
def graft_cdrs_onto_human_germline(self, v_gene=None, j_gene=None,
                                   backmutate_vernier=False, backmutations: List[Union['Position',str]] = None):
    """Graft CDRs from this Chain onto the nearest human germline sequence

    :param v_gene: Use defined V germline allele (e.g. IGHV1-18*01), gene (e.g. IGHV1-18) or family (e.g. IGHV1)
    :param j_gene: Use defined J germline allele (e.g. IGHJ1*01) or gene (e.g. IGHJ1)
    :param backmutate_vernier: Also graft all Kabat Vernier positions from this chain (perform backmutations)
    :param backmutations: Optional list of positions that should additionally be grafted from this chain
                          (str or :class:`Position`)
    :return: Chain with CDRs grafted from this chain and frameworks from the merged human germline
             found by :meth:`Chain.find_merged_human_germline`
    """
    # None stands for "no extra backmutations"; it replaces a mutable ``[]`` default shared across calls
    if backmutations is None:
        backmutations = []

    germline_chain = self.find_merged_human_germline(v_gene=v_gene, j_gene=j_gene)

    # The merged germline is created with the IMGT scheme, bring it to the numbering of this chain
    if self.scheme != 'imgt' or self.cdr_definition != 'imgt':
        germline_chain = germline_chain.renumber(self.scheme, self.cdr_definition)

    return self.graft_cdrs_onto(germline_chain, backmutate_vernier=backmutate_vernier, backmutations=backmutations)
|
570 |
+
|
571 |
+
def _parse_position(self, position: Union[int, str, 'Position'], allow_raw=False):
    """Create :class:`Position` key object from string or int.

    Note: The position should only be used for indexing, CDR definition is not preserved!

    :param position: Numeric or string position representation
    :param allow_raw: Also allow unaligned numeric (int) indexing from 0 to length of sequence - 1
    :return: new Position object, should only be used for indexing, CDR definition is not preserved!
             ``None`` is returned for a raw index that is greater than or equal to the number of positions.
    :raises IndexError: if the key cannot be converted to int, or if it is numeric and ``allow_raw`` is not set
    """
    if isinstance(position, str):
        # Named position, e.g. '111A' or 'H111A'
        return Position.from_string(position, chain_type=self.chain_type, scheme=self.scheme)
    if isinstance(position, Position):
        return position

    # Anything else is treated as a raw numeric index
    try:
        raw_index = int(position)
    except TypeError:
        raise IndexError(f'Invalid position key, expected Position, string or integer, got {type(position)}: "{position}"')
    if not allow_raw:
        raise IndexError("Use chain.raw[i] for raw numeric indexing or pass allow_raw=True. "
                         "For named position indexing, use string (e.g. chain['111A'] or chain['H111A'])")
    if raw_index < len(self.positions):
        return self.get_position_by_raw_index(raw_index)
    return None
|
594 |
+
|
595 |
+
def get_position_by_raw_index(self, index):
    """Get Position object at corresponding raw numeric position

    :param index: index into the ordered list of positions of this chain (list indexing rules apply)
    :return: key of ``self.positions`` at the given index
    """
    ordered_positions = list(self.positions.keys())
    return ordered_positions[index]
|
598 |
+
|
599 |
+
def find_human_germlines(self, limit=10, v_gene=None, j_gene=None, unique=True) -> Tuple[List['Chain'], List['Chain']]:
    """Find most identical V and J germline sequences based on IMGT alignment

    Germlines are ranked by the number of mutations in their alignment with this chain
    (renumbered to IMGT when needed); ties keep the original germline order.

    :param limit: Number of best matching germlines to return
    :param v_gene: Filter germlines to specific V gene name
    :param j_gene: Filter germlines to specific J gene name
    :param unique: Skip germlines with duplicate amino acid sequence
    :return: list of top V chains, list of top J chains
    :raises NotImplementedError: if a kappa V gene is requested for a lambda chain or vice versa
    :raises ValueError: if no germline matches the requested V or J gene name
    """
    from abnumber.germlines import get_imgt_v_chains, get_imgt_j_chains

    if self.scheme == 'imgt' and self.cdr_definition == 'imgt':
        chain = self
    else:
        chain = self.renumber('imgt')
    v_chains = list(get_imgt_v_chains(chain.chain_type).values())
    j_chains = list(get_imgt_j_chains(chain.chain_type).values())

    if v_gene:
        if v_gene.startswith('IGKV') and self.chain_type == 'L':
            raise NotImplementedError('Cannot graft lambda chain into kappa chain')
        if v_gene.startswith('IGLV') and self.chain_type == 'K':
            raise NotImplementedError('Cannot graft kappa chain into lambda chain')
        v_chains = [germline for germline in v_chains if germline.name.startswith(v_gene)]
        if not v_chains:
            print('Available V genes:', get_imgt_v_chains(chain.chain_type).keys())
            raise ValueError(f'No V genes found for "{chain.chain_type}" chain gene name "{v_gene}"')

    if j_gene:
        j_chains = [germline for germline in j_chains if germline.name.startswith(j_gene)]
        if not j_chains:
            print('Available J genes:', get_imgt_j_chains(chain.chain_type).keys())
            raise ValueError(f'No J genes found for "{chain.chain_type}" chain gene name "{j_gene}"')

    if unique:
        v_chains = _get_unique_chains(v_chains)
        j_chains = _get_unique_chains(j_chains)

    def closest(germlines):
        # Stable sort by number of mutations, keep the first ``limit`` germlines
        alignments = [chain.align(germline) for germline in germlines]
        mutation_counts = np.array([alignment.num_mutations() for alignment in alignments])
        return [germlines[rank] for rank in mutation_counts.argsort(kind='stable')[:limit]]

    top_v_chains = closest(v_chains)
    top_j_chains = closest(j_chains)

    return top_v_chains, top_j_chains
|
643 |
+
|
644 |
+
def find_merged_human_germline(self, top=0, v_gene=None, j_gene=None) -> 'Chain':
    """Find n-th most identical V and J germline sequence based on IMGT alignment and merge them into one Chain

    :param top: Return top N most identical germline (0-indexed)
    :param v_gene: Filter germlines to specific V gene name
    :param j_gene: Filter germlines to specific J gene name
    :return: merged germline sequence Chain object
    :raises IndexError: if fewer than ``top + 1`` V or J germlines are available
    """
    v_chains, j_chains = self.find_human_germlines(limit=top+1, v_gene=v_gene, j_gene=j_gene)
    # Fail with a descriptive message instead of a bare "list index out of range"
    for gene, chains in (('V', v_chains), ('J', j_chains)):
        if top >= len(chains):
            raise IndexError(f'Requested top {top} (0-indexed) {gene} germline, '
                             f'but only {len(chains)} {gene} germline(s) were found')
    v_chain = v_chains[top]
    j_chain = j_chains[top]

    # J positions are inserted first, residues of V positions take precedence
    merged_dict = {pos: aa for pos, aa in j_chain}
    merged_dict.update({pos: aa for pos, aa in v_chain})

    return Chain(
        sequence=None,
        aa_dict=merged_dict,
        chain_type=self.chain_type,
        scheme='imgt',
        tail=''
    )
|
668 |
+
|
669 |
+
@property
def raw(self):
    """Access raw representation of this chain to allow unaligned numeric indexing and slicing

    String keys used directly on the chain (``chain['1']``, ``chain['1':'10']``) are based on schema
    numbering and the end of a slice is inclusive. Keys used on ``chain.raw`` (``chain.raw[0]``,
    ``chain.raw[0:10]``) are plain integers starting at 0 and the end of a slice is exclusive (Python style).

    :return: Raw chain accessor that can be sliced or indexed to produce a new :class:`Chain` object
    """
    accessor = RawChainAccessor(self)
    return accessor
|
689 |
+
|
690 |
+
@property
def regions(self):
    """Dictionary of region dictionaries

    Region is an uppercase string, one of: ``"FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4"``

    :return: Dictionary of Region name -> Dictionary of (:class:`Position` -> Amino acid)
    """
    # Listed in sequence order, which is also the iteration order of the result
    region_dicts = [
        ('FR1', self.fr1_dict),
        ('CDR1', self.cdr1_dict),
        ('FR2', self.fr2_dict),
        ('CDR2', self.cdr2_dict),
        ('FR3', self.fr3_dict),
        ('CDR3', self.cdr3_dict),
        ('FR4', self.fr4_dict),
    ]
    return OrderedDict(region_dicts)
|
707 |
+
|
708 |
+
@property
def positions(self):
    """Dictionary of :class:`Position` -> Amino acid

    Built by merging the region dictionaries from ``self.regions`` in region order.
    """
    merged = OrderedDict()
    for aa_dict in self.regions.values():
        merged.update(aa_dict)
    return merged
|
716 |
+
|
717 |
+
@property
def seq(self):
    """Unaligned string representation of the variable chain sequence

    :return: Unaligned string representation of the variable chain sequence
    """
    residues = self.positions.values()
    return ''.join(residues)
|
724 |
+
|
725 |
+
@property
def fr1_seq(self):
    """Unaligned string representation of the Framework 1 region sequence"""
    residues = self.fr1_dict.values()
    return ''.join(residues)
|
729 |
+
|
730 |
+
@property
def cdr1_seq(self):
    """Unaligned string representation of the CDR 1 region sequence"""
    residues = self.cdr1_dict.values()
    return ''.join(residues)
|
734 |
+
|
735 |
+
@property
def fr2_seq(self):
    """Unaligned string representation of the Framework 2 region sequence"""
    residues = self.fr2_dict.values()
    return ''.join(residues)
|
739 |
+
|
740 |
+
@property
def cdr2_seq(self):
    """Unaligned string representation of the CDR 2 region sequence"""
    residues = self.cdr2_dict.values()
    return ''.join(residues)
|
744 |
+
|
745 |
+
@property
def fr3_seq(self):
    """Unaligned string representation of the Framework 3 region sequence"""
    residues = self.fr3_dict.values()
    return ''.join(residues)
|
749 |
+
|
750 |
+
@property
def cdr3_seq(self):
    """Unaligned string representation of the CDR 3 region sequence"""
    residues = self.cdr3_dict.values()
    return ''.join(residues)
|
754 |
+
|
755 |
+
@property
def fr4_seq(self):
    """Unaligned string representation of the Framework 4 region sequence"""
    residues = self.fr4_dict.values()
    return ''.join(residues)
|
759 |
+
|
760 |
+
|
761 |
+
class RawChainAccessor:
    """Helper for unaligned, 0-based integer indexing and slicing of a chain

    An integer key returns ``chain[position]`` for the position at that raw index. A slice is
    delegated to ``chain.slice`` with raw indexing enabled and an exclusive stop.
    """

    def __init__(self, chain: Chain):
        self.chain = chain

    def __getitem__(self, item):
        if not isinstance(item, slice):
            # Single raw index
            if not is_integer(item):
                raise IndexError(f'Expected int indexing for chain.raw, got {type(item)}: {item}')
            return self.chain[self.chain.get_position_by_raw_index(item)]

        # Slice: only contiguous integer ranges are supported
        if item.step is not None and item.step != 1:
            raise IndexError(f'Slicing with step != 1 is not implemented, got: {item}')
        if item.start is not None and not is_integer(item.start):
            raise IndexError(f'Expected int start index for chain.raw, got {type(item.start)}: {item.start}')
        if item.stop is not None and not is_integer(item.stop):
            raise IndexError(f'Expected int end index for chain.raw, got {type(item.stop)}: {item.stop}')
        return self.chain.slice(start=item.start, stop=item.stop, stop_inclusive=False, allow_raw=True)
|
778 |
+
|
779 |
+
|
780 |
+
|
781 |
+
|
abnumber/common.py
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
from typing import List, Tuple
|
3 |
+
import re
|
4 |
+
import numpy as np
|
5 |
+
from abnumber.exceptions import ChainParseError
|
6 |
+
try:
|
7 |
+
from anarci.anarci import anarci
|
8 |
+
except ImportError:
|
9 |
+
# ANARCI is required: print installation instructions and exit instead of raising the ImportError
|
10 |
+
print('ANARCI module not available. Please install it separately or install AbNumber through Bioconda')
|
11 |
+
print('See: https://abnumber.readthedocs.io/')
|
12 |
+
sys.exit(1)
|
13 |
+
|
14 |
+
# Position string such as '111A' or 'H111A'
POS_REGEX = re.compile(
    r'([HL]?)'  # optional chain letter
    r'(\d+)'  # position number
    r'([A-Z]?)'  # optional uppercase letter after the number
)
# One or more whitespace characters
WHITESPACE = re.compile(r'\s+')
|
16 |
+
|
17 |
+
|
18 |
+
def _validate_chain_type(chain_type):
    """Check that ``chain_type`` is one of the supported chain type codes

    :param chain_type: ``"H"`` (heavy), ``"L"`` (lambda light chain) or ``"K"`` (kappa light chain)
    :raises AssertionError: if ``chain_type`` has any other value
    """
    if chain_type not in ('H', 'L', 'K'):
        # Raised explicitly instead of using an ``assert`` statement, so that the check is not
        # removed when Python runs with -O; the exception type stays the same for existing callers
        raise AssertionError(
            f'Invalid chain type "{chain_type}", it should be "H" (heavy), "L" (lambda light chain) or "K" (kappa light chain)'
        )
|
21 |
+
|
22 |
+
|
23 |
+
def _anarci_align(sequence, scheme, allowed_species, assign_germline=False) -> List[Tuple]:
    """Number a sequence using ANARCI and convert the result to position dictionaries

    :param sequence: amino acid sequence, whitespace is removed before alignment
    :param scheme: numbering scheme passed to ANARCI
    :param allowed_species: species filter passed to ANARCI
    :param assign_germline: also request germline assignment and return V and J gene names
    :return: list with one tuple ``(aa_dict, chain_type, tail, species, v_gene, j_gene)`` per numbered domain;
             ``v_gene`` and ``j_gene`` are ``None`` unless ``assign_germline`` is set
    :raises ChainParseError: if ANARCI returns no numbering for the sequence
    """
    from abnumber.position import Position

    sequence = WHITESPACE.sub('', sequence)
    all_numbered, all_ali, all_hits = anarci(
        [('id', sequence)],
        scheme=scheme,
        allowed_species=allowed_species,
        assign_germline=assign_germline
    )
    # Only one sequence is submitted, so only the first result is used
    seq_numbered, seq_ali = all_numbered[0], all_ali[0]
    if seq_numbered is None:
        raise ChainParseError(f'Variable chain sequence not recognized: "{sequence}"')
    assert len(seq_numbered) == len(seq_ali), 'Unexpected ANARCI output'

    results = []
    for (positions, start, end), ali in zip(seq_numbered, seq_ali):
        chain_type = ali['chain_type']
        species = ali['species']
        if assign_germline:
            v_gene = ali['germlines']['v_gene'][0][1]
            j_gene = ali['germlines']['j_gene'][0][1]
        else:
            v_gene = None
            j_gene = None
        # Gap positions ('-') are not stored
        aa_dict = {}
        for (num, letter), aa in positions:
            if aa != '-':
                aa_dict[Position(chain_type=chain_type, number=num, letter=letter, scheme=scheme)] = aa
        # Everything after the ``end`` index of the numbered domain is kept as the tail
        tail = sequence[end+1:]
        results.append((aa_dict, chain_type, tail, species, v_gene, j_gene))
    return results
|
48 |
+
|
49 |
+
|
50 |
+
def _get_unique_chains(chains):
    """Keep only the first chain for each distinct ``seq``, preserving the input order

    :param chains: iterable of objects with a ``seq`` attribute
    :return: list of chains without duplicate sequences
    """
    seen = set()
    unique = []
    for candidate in chains:
        sequence = candidate.seq
        if sequence not in seen:
            seen.add(sequence)
            unique.append(candidate)
    return unique
|
59 |
+
|
60 |
+
|
61 |
+
# Pairs of residues with a positive score in Blosum62, grouped by the first residue
SIMILAR_PAIRS = {
    'AA', 'AS',
    'CC',
    'DD', 'DE', 'DN',
    'ED', 'EE', 'EK', 'EQ',
    'FF', 'FW', 'FY',
    'GG',
    'HH', 'HN', 'HY',
    'II', 'IL', 'IM', 'IV',
    'KE', 'KK', 'KQ', 'KR',
    'LI', 'LL', 'LM', 'LV',
    'MI', 'ML', 'MM', 'MV',
    'ND', 'NH', 'NN', 'NS',
    'PP',
    'QE', 'QK', 'QQ', 'QR',
    'RK', 'RQ', 'RR',
    'SA', 'SN', 'SS', 'ST',
    'TS', 'TT',
    'VI', 'VL', 'VM', 'VV',
    'WF', 'WW', 'WY',
    'YF', 'YH', 'YW', 'YY',
}
|
66 |
+
|
67 |
+
|
68 |
+
def is_similar_residue(a, b):
    """Check whether two residues are similar according to ``SIMILAR_PAIRS``

    A gap (``'-'``) is only similar to another gap.

    :param a: first residue (single letter) or ``'-'``
    :param b: second residue (single letter) or ``'-'``
    :return: True if the residues are considered similar
    """
    if '-' in (a, b):
        return a == b
    return (a + b) in SIMILAR_PAIRS
|
72 |
+
|
73 |
+
|
74 |
+
def is_integer(object):
    """Check whether the value is a Python ``int`` or a numpy integer

    :param object: value to check
    :return: True for ``int`` and ``np.integer`` instances
    """
    return isinstance(object, (int, np.integer))
|
76 |
+
|
77 |
+
|
78 |
+
SUPPORTED_SCHEMES = ['imgt', 'aho', 'chothia', 'kabat']
SUPPORTED_CDR_DEFINITIONS = ['imgt', 'chothia', 'kabat', 'north']

# Region border coordinates for each scheme:
# start of CDR1, FR2, CDR2, FR3, CDR3 and FR4, followed by the end of FR4 (exclusive)
SCHEME_BORDERS = {
    'imgt': [27, 39, 56, 66, 105, 118, 129],
    'kabat_H': [31, 36, 50, 66, 95, 103, 114],
    'kabat_K': [24, 35, 50, 57, 89, 98, 108],
    'kabat_L': [24, 35, 50, 57, 89, 98, 108],
    'chothia_H': [26, 33, 52, 57, 95, 103, 114],
    'chothia_K': [24, 35, 50, 57, 89, 98, 108],
    'chothia_L': [24, 35, 50, 57, 89, 98, 108],
    'north_H': [23, 36, 50, 59, 93, 103, 114],
    'north_K': [24, 35, 49, 57, 89, 98, 108],
    'north_L': [24, 35, 49, 57, 89, 98, 108],
}

# Region names in sequence order; FR1 starts at position 1, every other region starts at a border
_REGION_NAMES = ('FR1', 'CDR1', 'FR2', 'CDR2', 'FR3', 'CDR3', 'FR4')

# { scheme -> { region -> list of position numbers } }
SCHEME_REGIONS = {
    scheme: {
        region: list(range(first, end))
        for region, first, end in zip(_REGION_NAMES, [1] + borders[:-1], borders)
    }
    for scheme, borders in SCHEME_BORDERS.items()
}

# { scheme -> { position number -> region } }
SCHEME_POSITION_TO_REGION = {
    scheme: {
        pos_num: region
        for region, positions in regions.items()
        for pos_num in positions
    }
    for scheme, regions in SCHEME_REGIONS.items()
}

# { scheme -> set of vernier position numbers }
# Only the Kabat sets are enabled, the sets for other schemes are kept commented out
SCHEME_VERNIER = {
    # 'imgt_H': frozenset([2, 52, 53, 54, 76, 78, 80, 82, 87, 118]),
    # 'chothia_H': frozenset([2, 47, 48, 49, 67, 69, 71, 73, 78, 93, 94, 103]),
    # 'north_H': frozenset([2, 47, 48, 49, 67, 69, 71, 73, 78, 93, 94, 103]),
    'kabat_H': frozenset([2, 27, 28, 29, 30, 47, 48, 49, 67, 69, 71, 73, 78, 93, 94, 103]),

    # 'imgt_K': frozenset([2, 4, 41, 42, 52, 53, 54, 55, 78, 80, 84, 85, 87, 118]),
    # 'imgt_L': frozenset([2, 4, 41, 42, 52, 53, 54, 55, 78, 80, 84, 85, 87, 118]),
    # 'chothia_K': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
    # 'chothia_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
    # 'north_K': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
    # 'north_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
    'kabat_K': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
    'kabat_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
}

# For reference, the Kabat CDR position ranges (inclusive):
# 'kabat_H': 31-35, 50-65, 95-102
# 'kabat_K': 24-34, 50-56, 89-97
|
abnumber/exceptions.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
class ChainParseError(Exception):
    """Raised when a sequence is not recognized as a variable chain"""
|
abnumber/germlines.py
ADDED
@@ -0,0 +1,684 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Caches of germline chains, filled on first use by get_imgt_v_chains and get_imgt_j_chains
_HUMAN_IMGT_V_CHAINS = _HUMAN_IMGT_J_CHAINS = None
|
3 |
+
|
4 |
+
|
5 |
+
def get_imgt_chain(gene_name):
    """Get the germline chain for a complete V or J gene name

    The chain type is taken from the ``IGH``/``IGK``/``IGL`` prefix and the gene type from the
    following ``V`` or ``J`` letter.

    :param gene_name: germline gene name starting with ``IG(H/K/L)(V/J)``
    :return: germline chain stored under the given name
    :raises ValueError: if the prefix is not recognized, or the name is incomplete or not found
    """
    for prefix, chain_type in (('IGH', 'H'), ('IGK', 'K'), ('IGL', 'L')):
        if gene_name.startswith(prefix):
            break
    else:
        raise ValueError(f'Gene name should start with IG(H/K/L), got: {gene_name}')

    if gene_name.startswith(f'IG{chain_type}V'):
        chains = get_imgt_v_chains(chain_type)
    elif gene_name.startswith(f'IG{chain_type}J'):
        chains = get_imgt_j_chains(chain_type)
    else:
        raise ValueError(f'Expected V or J gene name, got: {gene_name}')

    if gene_name in chains:
        return chains[gene_name]

    # Not an exact match: suggest complete names that start with the given name
    suffixes = [chain_name for chain_name in chains if chain_name.startswith(gene_name)]
    if suffixes:
        raise ValueError(f'Gene name "{gene_name}" not complete, use one of: {suffixes}')
    print('Available gene names:', chains.keys())
    raise ValueError(f'Gene name "{gene_name}" not found')
|
30 |
+
|
31 |
+
|
32 |
+
def get_imgt_v_chains(chain_type=None):
    """Get V germline chains of the given chain type from ``HUMAN_IMGT_IG_V``

    Chains for all chain types are built on first use and cached in ``_HUMAN_IMGT_V_CHAINS``;
    the cache is rebuilt whenever ``chain_type`` is not found in it.

    :param chain_type: key of ``HUMAN_IMGT_IG_V`` (e.g. ``'H'``)
    :return: dictionary of germline name -> germline chain
    """
    # NOTE(review): a chain_type that is not a key of HUMAN_IMGT_IG_V (presumably including the
    # default None) rebuilds the cache and then fails with KeyError below - confirm this is intended
    global _HUMAN_IMGT_V_CHAINS
    needs_build = _HUMAN_IMGT_V_CHAINS is None or chain_type not in _HUMAN_IMGT_V_CHAINS
    if needs_build:
        _HUMAN_IMGT_V_CHAINS = {}
        for t, germlines in HUMAN_IMGT_IG_V.items():
            positions = germlines['positions']
            chains_by_name = {}
            for name, seq in germlines['aligned_sequences'].items():
                chains_by_name[name] = germline_to_chain(positions, seq, name=name, chain_type=t)
            _HUMAN_IMGT_V_CHAINS[t] = chains_by_name
    return _HUMAN_IMGT_V_CHAINS[chain_type]
|
41 |
+
|
42 |
+
|
43 |
+
def get_imgt_j_chains(chain_type=None):
    """Return the human IMGT J germline chains of one chain type, keyed by allele name.

    Chains for every chain type in HUMAN_IMGT_IG_J are built with
    germline_to_chain() and kept in the module-level _HUMAN_IMGT_J_CHAINS cache.
    The cache is (re)built whenever it is empty or lacks ``chain_type``.

    :param chain_type: key of HUMAN_IMGT_IG_J
    :return: dict mapping allele name to its germline chain
    :raises KeyError: if ``chain_type`` is still missing after the rebuild; the
        default ``None`` takes this path unless the table has a ``None`` key
    """
    global _HUMAN_IMGT_J_CHAINS
    cache_hit = _HUMAN_IMGT_J_CHAINS is not None and chain_type in _HUMAN_IMGT_J_CHAINS
    if not cache_hit:
        # Reset first, then fill in place, one chain type at a time.
        _HUMAN_IMGT_J_CHAINS = {}
        for germline_type, germlines in HUMAN_IMGT_IG_J.items():
            positions = germlines['positions']
            aligned = germlines['aligned_sequences']
            chains_by_name = {}
            for name, seq in aligned.items():
                chains_by_name[name] = germline_to_chain(
                    positions, seq, name=name, chain_type=germline_type
                )
            _HUMAN_IMGT_J_CHAINS[germline_type] = chains_by_name
    return _HUMAN_IMGT_J_CHAINS[chain_type]
|
52 |
+
|
53 |
+
|
54 |
+
def germline_to_chain(positions, seq, chain_type, **kwargs):
    """Build an IMGT-numbered Chain from an aligned germline sequence.

    :param positions: position labels (e.g. ``"H1"``), one per alignment column
    :param seq: aligned sequence string; characters are paired with
        ``positions`` in order and stored as-is (gap characters are not filtered)
    :param chain_type: chain type passed to both Position and Chain
    :param kwargs: extra keyword arguments forwarded to Chain (e.g. ``name``)
    :return: Chain created with ``sequence=None``, ``scheme='imgt'`` and an empty tail
    """
    # NOTE(review): function-scope import, presumably to avoid a circular
    # import with abnumber.chain - confirm before moving it to module level.
    from abnumber.chain import Chain, Position

    # zip() stops at the shorter input, so surplus labels or residues are dropped.
    aa_dict = {}
    for pos, aa in zip(positions, seq):
        position = Position.from_string(pos, chain_type=chain_type, scheme='imgt')
        aa_dict[position] = aa

    return Chain(
        sequence=None,
        scheme='imgt',
        chain_type=chain_type,
        tail='',
        aa_dict=aa_dict,
        **kwargs
    )
|
59 |
+
|
60 |
+
|
61 |
+
def get_germline_v_families(chain_type):
    """List the distinct human V gene families available for a chain type.

    The family is the part of each allele name before the first ``-`` and, within
    that, before the first ``/`` (``IGHV1-18*01`` and ``IGHV1/OR15-1*01`` both
    give ``IGHV1``).

    :param chain_type: key of HUMAN_IMGT_IG_V (e.g. ``'H'``)
    :return: sorted list of unique family names
    """
    allele_names = HUMAN_IMGT_IG_V[chain_type]['aligned_sequences'].keys()
    families = {
        allele.partition('-')[0].partition('/')[0]
        for allele in allele_names
    }
    return sorted(families)
|
64 |
+
|
65 |
+
|
66 |
+
def get_germline_v_genes(chain_type):
    """List the distinct human V gene names available for a chain type.

    The gene name is the part of each allele name before the first ``*``
    (``IGHV1-18*01`` gives ``IGHV1-18``).

    :param chain_type: key of HUMAN_IMGT_IG_V (e.g. ``'H'``)
    :return: sorted list of unique gene names
    """
    allele_names = HUMAN_IMGT_IG_V[chain_type]['aligned_sequences'].keys()
    genes = {allele.partition('*')[0] for allele in allele_names}
    return sorted(genes)
|
69 |
+
|
70 |
+
|
71 |
+
HUMAN_IMGT_IG_V = {
|
72 |
+
'H': {
|
73 |
+
"positions": [
|
74 |
+
"H1", "H2", "H3", "H4", "H5", "H6", "H7", "H8", "H9", "H11", "H12", "H13", "H14", "H15", "H16", "H17", "H18", "H19", "H20", "H21", "H22", "H23", "H24", "H25", "H26", "H27", "H28", "H29", "H30", "H31", "H34", "H35", "H36", "H37", "H38", "H39", "H40", "H41", "H42", "H43", "H44", "H45", "H46", "H47", "H48", "H49", "H50", "H51", "H52", "H53", "H54", "H55", "H56", "H57", "H58", "H59", "H60", "H61", "H62", "H63", "H64", "H65", "H66", "H67", "H68", "H69", "H70", "H71", "H72", "H74", "H75", "H76", "H77", "H78", "H79", "H80", "H81", "H82", "H83", "H84", "H85", "H86", "H87", "H88", "H89", "H90", "H91", "H92", "H93", "H94", "H95", "H96", "H97", "H98", "H99", "H100", "H101", "H102", "H103", "H104", "H105", "H106", "H107"
|
75 |
+
],
|
76 |
+
"aligned_sequences": {
|
77 |
+
"IGHV1-18*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYGISWVRQAPGQGLEWMGWISAY--NGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCAR-",
|
78 |
+
"IGHV1-18*03": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYGISWVRQAPGQGLEWMGWISAY--NGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDMAVYYCAR-",
|
79 |
+
"IGHV1-18*04": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYGISWVRQAPGQGLEWMGWISAY--NGNTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCAR-",
|
80 |
+
"IGHV1-2*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGRINPN--SGGTNYAQKFQGRVTSTRDTSISTAYMELSRLRSDDTVVYYCAR-",
|
81 |
+
"IGHV1-2*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGWINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
|
82 |
+
"IGHV1-2*03": "QVQLVQSGAEVKKLGASVKVSCKASGYTF--TGYYMHWVXQAPGQGLEWMGWINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
|
83 |
+
"IGHV1-2*04": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGWINPN--SGGTNYAQKFQGWVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
|
84 |
+
"IGHV1-2*05": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTVVYYCAR-",
|
85 |
+
"IGHV1-2*06": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
|
86 |
+
"IGHV1-2*07": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TGYYMHWVRQAPGQGLEWMGWINPN--SGGTNYAHKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCAR-",
|
87 |
+
"IGHV1-24*01": "QVQLVQSGAEVKKPGASVKVSCKVSGYTL--TELSMHWVRQAPGKGLEWMGGFDPE--DGETIYAQKFQGRVTMTEDTSTDTAYMELSSLRSEDTAVYYCAT-",
|
88 |
+
"IGHV1-3*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYAMHWVRQAPGQRLEWMGWINAG--NGNTKYSQKFQGRVTITRDTSASTAYMELSSLRSEDTAVYYCAR-",
|
89 |
+
"IGHV1-3*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYAMHWVRQAPGQRLEWMGWSNAG--NGNTKYSQEFQGRVTITRDTSASTAYMELSSLRSEDMAVYYCAR-",
|
90 |
+
"IGHV1-3*03": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYAMHWVRQAPGQRLEWMGWINAG--NGNTKYSQEFQGRVTITRDTSASTAYMELSSLRSEDMAVYYCAR-",
|
91 |
+
"IGHV1-3*05": "QVQLVQSGAEEKKPGASVKVSCKASGYTF--TSYAMHWVRQAPGQRLEWMGWINAG--NGNTKYSQKFQGRVTITRDTSASTAYMELSSLRSEDTAVYYCAR-",
|
92 |
+
"IGHV1-38-4*01": "QVQLVQSWAEVRKSGASVKVSCSFSGFTI--TSYGIHWVQQSPGQGLEWMGWINPG--NGSPSYAKKFQGRFTMTRDMSTTTAYTDLSSLTSEDMAVYYYAR-",
|
93 |
+
"IGHV1-45*01": "QMQLVQSGAEVKKTGSSVKVSCKASGYTF--TYRYLHWVRQAPGQALEWMGWITPF--NGNTNYAQKFQDRVTITRDRSMSTAYMELSSLRSEDTAMYYCAR-",
|
94 |
+
"IGHV1-45*02": "QMQLVQSGAEVKKTGSSVKVSCKASGYTF--TYRYLHWVRQAPGQALEWMGWITPF--NGNTNYAQKFQDRVTITRDRSMSTAYMELSSLRSEDTAMYYCAR-",
|
95 |
+
"IGHV1-45*03": "QMQLVQSGAEVKKTGSSVKVSCKASGYTF--TYRYLHWVRQAPRQALEWMGWITPF--NGNTNYAQKFQDRVTITRDRSMSTAYMELSSLRSEDTAMYYCAR-",
|
96 |
+
"IGHV1-46*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYYMHWVRQAPGQGLEWMGIINPS--GGSTSYAQKFQGRVTMTRDTSTSTVYMELSSLRSEDTAVYYCAR-",
|
97 |
+
"IGHV1-46*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--NSYYMHWVRQAPGQGLEWMGIINPS--GGSTSYAQKFQGRVTMTRDTSTSTVYMELSSLRSEDTAVYYCAR-",
|
98 |
+
"IGHV1-46*03": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYYMHWVRQAPGQGLEWMGIINPS--GGSTSYAQKFQGRVTMTRDTSTSTVYMELSSLRSEDTAVYYCAR-",
|
99 |
+
"IGHV1-46*04": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYYMHWVRQAPGQGLEWMGIINPS--GGSTSYAQKLQGRVTMTRDTSTSTVYMELSSLRSEDTAVYYCAR-",
|
100 |
+
"IGHV1-58*01": "QMQLVQSGPEVKKPGTSVKVSCKASGFTF--TSSAVQWVRQARGQRLEWIGWIVVG--SGNTNYAQKFQERVTITRDMSTSTAYMELSSLRSEDTAVYYCAA-",
|
101 |
+
"IGHV1-58*02": "QMQLVQSGPEVKKPGTSVKVSCKASGFTF--TSSAMQWVRQARGQRLEWIGWIVVG--SGNTNYAQKFQERVTITRDMSTSTAYMELSSLRSEDTAVYYCAA-",
|
102 |
+
"IGHV1-68*01": "QVQLGQSEAEVKKPGASVKVSCKASGYTF--TCCSLHWLQQAPGQGLERMRWITLY--NGNTNYAKKFQGRVTITRDMSLRTAYIELSSLRSEDSAVYYWAR-",
|
103 |
+
"IGHV1-68*02": "QVQLGQSEAEVKKPGASVKVSCKASGYTF--TYCSLHWLQQAPGQGLERMRWITLY--NGNINYAKKFQSRVTITRDMSLRTAYIELSSLRSEDSAVYYWAR-",
|
104 |
+
"IGHV1-69*01": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
|
105 |
+
"IGHV1-69*02": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYTISWVRQAPGQGLEWMGRIIPI--LGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
|
106 |
+
"IGHV1-69*04": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGRIIPI--LGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
|
107 |
+
"IGHV1-69*05": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITTDESTSTAYMELSSLRSEDTAVYYCAR-",
|
108 |
+
"IGHV1-69*06": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
|
109 |
+
"IGHV1-69*08": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYTISWVRQAPGQGLEWMGRIIPI--LGTANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
|
110 |
+
"IGHV1-69*09": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGRIIPI--LGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
|
111 |
+
"IGHV1-69*10": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--LGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
|
112 |
+
"IGHV1-69*11": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGRIIPI--LGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
|
113 |
+
"IGHV1-69*12": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
|
114 |
+
"IGHV1-69*13": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
|
115 |
+
"IGHV1-69*14": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
|
116 |
+
"IGHV1-69*15": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGRIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
|
117 |
+
"IGHV1-69*16": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYTISWVRQAPGQGLEWMGGIIPI--LGTANYAQKFQGRVTITTDESTSTAYMELSSLRSEDTAVYYCAR-",
|
118 |
+
"IGHV1-69*17": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGIANYAQKFQGRVTITADKSTSTAYMELSSLRSEDTAVYYCAR-",
|
119 |
+
"IGHV1-69*19": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
|
120 |
+
"IGHV1-69-2*01": "EVQLVQSGAEVKKPGATVKISCKVSGYTF--TDYYMHWVQQAPGKGLEWMGLVDPE--DGETIYAEKFQGRVTITADTSTDTAYMELSSLRSEDTAVYYCAT-",
|
121 |
+
"IGHV1-69D*01": "QVQLVQSGAEVKKPGSSVKVSCKASGGTF--SSYAISWVRQAPGQGLEWMGGIIPI--FGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCAR-",
|
122 |
+
"IGHV1-8*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYDINWVRQATGQGLEWMGWMNPN--SGNTGYAQKFQGRVTMTRNTSISTAYMELSSLRSEDTAVYYCAR-",
|
123 |
+
"IGHV1-8*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYDINWVRQATGQGLEWMGWMNPN--SGNTGYAQKFQGRVTMTRNTSISTAYMELSSLRSEDTAVYYCAR-",
|
124 |
+
"IGHV1-8*03": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYDINWVRQATGQGLEWMGWMNPN--SGNTGYAQKFQGRVTITRNTSISTAYMELSSLRSEDTAVYYCAR-",
|
125 |
+
"IGHV1-NL1*01": "QVQLLQPGVQVKKPGSSVKVSC-ASRYTF--TKYFTRWV-QSPGQGHXWMG-INPY--NDNTHYAQTFWGRVTITSDRSMSTAYMELSXLRSEDMVVYYCVR-",
|
126 |
+
"IGHV1/OR15-1*01": "QVQLVQSGAEVKKPGASVKVSCKASGYIF--TDYYMHWVRQAPGQELGWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYTELSSLRSEDTATYYCAR-",
|
127 |
+
"IGHV1/OR15-1*02": "QVQLVQSGAEVKKPGASVKVSCKASGYIF--TDYYMHWVRQAPGQELGWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTACTELSSLRSEDTATYYCAR-",
|
128 |
+
"IGHV1/OR15-1*03": "QVQLVQSGAEVKKPGASVKVSCKASGYIF--TDYYMHWVRQAPGQELGWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYTELSSLRSEDTATYYCAR-",
|
129 |
+
"IGHV1/OR15-1*04": "QVQLVQSGAEVKKPGASVKVSCKASGYIF--TDYYMHWVRQAPGQELGWMGRINPN--SGGTNYAQKFQGRVTMTRDTSISTAYMELSSLRSEDTATYYCAR-",
|
130 |
+
"IGHV1/OR15-2*01": "QVQLVQSGAEVKKPRASVKVSCKASGYTF--TSYYMHWV-QAPEQGLEWMGWINTY--NGNTNYPQKLQGRVTMTRDTSTSTAYMELSRLRSDDMAVYYCAR-",
|
131 |
+
"IGHV1/OR15-2*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TSYYMHWV-QAPEQGLEWMGWINTY--NGNTNYPQKLQGRVTMTRDTSTSTAYMELSSLRSDDMAVYYCAR-",
|
132 |
+
"IGHV1/OR15-2*03": "QVQLVQSGAEVKKPRASVKVSCKASGYTF--TSYYMHWV-QAPEQGLEWMGWINTY--NGNTNYPQKLQGRVTMTRDTSTSTAYMELSSLRSDDMAVYYCAR-",
|
133 |
+
"IGHV1/OR15-3*01": "QVQLV-SGAEVKKPGASVKVSCKASGYTF--TDYFMNWMRQAPGQRLEWMGWINAG--NGNTKYSQKLQGRVTITRDTSSSTAYMQLSSLRSEDTAVYYCAR-",
|
134 |
+
"IGHV1/OR15-3*02": "QVQLV-SGAEVKKPGASVKVSCKASGYTF--TDYFMNWMRQAPGQRLEWMGWINAG--NGNTKYSQKLQGRVTITRDTSASTAYMQLSSLRSEDTAVYYCAR-",
|
135 |
+
"IGHV1/OR15-3*03": "QVQLV-SGAEVKKPGASVKVSCKASGYTF--TSYYMNWMRQAPGQGFEWMGWINAG--NGNTKYSQKLQGRVTITRDTSASTAYMQLSSLRSEDTAVYYCAR-",
|
136 |
+
"IGHV1/OR15-4*01": "QDQLVQSGAEVKKPLSSVKVSFKASGYTF--TNNFMHWV-QAPGQGLEWMGWINAG--NGNTTYAQKFQGRVTITRDTSMSTAYTELSSLRSEDTAVYYCAR-",
|
137 |
+
"IGHV1/OR15-5*02": "QVQLVQSGAEVKKPGASVKVSCKASGYTF--TNYCMHWVRQVHAQGLEWMGLVCPS--DGSTSYAQKFQARVTITRDTSMSTAYMELSSLRSEDTAMYYCVR-",
|
138 |
+
"IGHV1/OR15-9*01": "QVQLMQSGAEVKKPGASVRISCKASGYTF--TSYCMHWVCQAHAQGLEWMGLVCPS--DGSTSYAQKFQGRVTITRDTSMGTAYMELSSLRSEDTAMYYCVR-",
|
139 |
+
"IGHV1/OR21-1*01": "QVQLVQSGAEVKKPGASVKVSCKASGYTI--TSYCMHWVHQVHAQGLEWMGLVCPS--DGSTSYAQKFQARVTITRDTSMSTAYMELSSLRSEDTAMYYCVR-",
|
140 |
+
"IGHV2-10*01": "QVTLKESGPALVKPTQTLMLTCTFSGFSLSTSGMGVG-ICQPSAKALEWLAHIY-N---DNKYYSPSLKSRLIISKDTSKNEVVLTVINMDIVDTATHYCARR",
|
141 |
+
"IGHV2-26*01": "QVTLKESGPVLVKPTETLTLTCTVSGFSLSNARMGVSWIRQPPGKALEWLAHIFSN---DEKSYSTSLKSRLTISKDTSKSQVVLTMTNMDPVDTATYYCARI",
|
142 |
+
"IGHV2-26*02": "QVTLKESGPVLVKPTETLTLTCTVSGFSLSNARMGVSWIRQPPGKALEWLAHIFSN---DEKSYSTSLKSRLTISKDTSKSQVVLTMTNMDPVDTATYYCARI",
|
143 |
+
"IGHV2-26*03": "QVTLKESGPVLVKPTETLTLTCTISGFSLSNARMGVSWIRQPPGKALEWLAHIFSN---DEKSYSTSLKSRLTISKDTSKSQVVLTMTNMDPVDTATYYCARI",
|
144 |
+
"IGHV2-5*01": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWN---DDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
|
145 |
+
"IGHV2-5*02": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWD---DDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
|
146 |
+
"IGHV2-5*05": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWD---DDKRYGPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
|
147 |
+
"IGHV2-5*06": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWD---DDKRYGPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
|
148 |
+
"IGHV2-5*08": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWLALIYWD---DDKRYSPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
|
149 |
+
"IGHV2-5*09": "QVTLKESGPTLVKPTQTLTLTCTFSGFSLSTSGVGVGWIRQPPGKALEWLALIYWD---DDKRYGPSLKSRLTITKDTSKNQVVLTMTNMDPVDTATYYCAHR",
|
150 |
+
"IGHV2-70*01": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLALIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
151 |
+
"IGHV2-70*04": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
152 |
+
"IGHV2-70*10": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWIARIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
153 |
+
"IGHV2-70*11": "RVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLARIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
154 |
+
"IGHV2-70*12": "QITLKESGPTLVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLALIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCAHR",
|
155 |
+
"IGHV2-70*13": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLALIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
156 |
+
"IGHV2-70*15": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLARIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
157 |
+
"IGHV2-70*16": "QVTLKESGPVLVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
158 |
+
"IGHV2-70*17": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
159 |
+
"IGHV2-70*18": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSEMCVSWVRQPPGKALEWLALIDWD---DDKYYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
160 |
+
"IGHV2-70*19": "QVTLRESGPALVKPTQTLTLTCTFSGFSLSTSGMCVSWVRQPPGKALEWLALIDWD---DDKHYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
161 |
+
"IGHV2-70D*04": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
162 |
+
"IGHV2-70D*14": "QVTLKESGPALVKPTQTLTLTCTFSGFSLSTSGMRVSWIRQPPGKALEWLARIDWD---DDKFYSTSLKTRLTISKDTSKNQVVLTMTNMDPVDTATYYCARI",
|
163 |
+
"IGHV2/OR16-5*01": "QVTLKESGPALVKPTETLTLTCTLSGFSLSTSGMGMSWIRQPPGKALEWLAHIFLN---DKKSYSTSLKNRLIISKDTSKSQVVLTMTNMDPVDTATYYCAWR",
|
164 |
+
"IGHV3-11*01": "QVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--GSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
165 |
+
"IGHV3-11*03": "QVQLLESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--SSYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
166 |
+
"IGHV3-11*04": "QVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--GSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
167 |
+
"IGHV3-11*05": "QVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--SSYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
168 |
+
"IGHV3-11*06": "QVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMSWIRQAPGKGLEWVSYISSS--SSYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
169 |
+
"IGHV3-13*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYDMHWVRQATGKGLEWVSAIGTA---GDTYYPGSVKGRFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
|
170 |
+
"IGHV3-13*02": "EVHLVESGGGLVQPGGALRLSCAASGFTF--SNYDMHWVRQATGKGLEWVSANGTA---GDTYYPGSVKGRFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
|
171 |
+
"IGHV3-13*03": "EVQLVESGGGLVQPGGSLRLSCAACGFTF--SSYDMHWVRQATGKGLEWVSAIGTA---GDTYYPGSVKGQFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
|
172 |
+
"IGHV3-13*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYDMHWVRQATGKGLEWVSAIGTA---GDTYYPGSVKGRFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
|
173 |
+
"IGHV3-13*05": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYDMHWVRQATGKGLEWVSAIGTA---GDPYYPGSVKGRFTISRENAKNSLYLQMNSLRAGDTAVYYCAR-",
|
174 |
+
"IGHV3-15*01": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
|
175 |
+
"IGHV3-15*02": "EVQLVESGGALVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
|
176 |
+
"IGHV3-15*03": "EVQLVESAGALVQPGGSLRLSCAASGFTC--SNAWMSWVRQAPGKGLEWVGRIKSKANGGTTDYAAPVKGRFTISRVDSKNTLYLQMNSLKTEDTAVYYCTT-",
|
177 |
+
"IGHV3-15*04": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIESKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
|
178 |
+
"IGHV3-15*05": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
|
179 |
+
"IGHV3-15*06": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMSWVRQAPGKGLEWVGRIKSKTDGGTTNYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
|
180 |
+
"IGHV3-15*07": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SNAWMNWVRQAPGKGLEWVGRIKSKTDGGTTDYAAPVKGRFTISRDDSKNTLYLQMNSLKTEDTAVYYCTT-",
|
181 |
+
"IGHV3-15*08": "EVQLVESAGGLVQPGGSLRLSCAASGFTC--SNAWMSWVRQAPGKGLEWVGCIKSKANGGTTDYAAPVKGRFTISRDDSKNTLYLQMISLKTEDTAVYYCTT-",
|
182 |
+
"IGHV3-16*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNSDMNWARKAPGKGLEWVSGVSWN--GSRTHYVDSVKRRFIISRDNSRNSLYLQKNRRRAEDMAVYYCVR-",
|
183 |
+
"IGHV3-16*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNSDMNWARKAPGKGLEWVSGVSWN--GSRTHYVDSVKRRFIISRDNSRNSLYLQKNRRRAEDMAVYYCVR-",
|
184 |
+
"IGHV3-19*01": "TVQLVESGGGLVEPGGSLRLSCAASGFTF--SNSDMNWVRQAPGKGLEWVSGVSWN--GSRTHYADSVKGRFIISRDNSRNFLYQQMNSLRPEDMAVYYCVR-",
|
185 |
+
"IGHV3-20*01": "EVQLVESGGGVVRPGGSLRLSCAASGFTF--DDYGMSWVRQAPGKGLEWVSGINWN--GGSTGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYHCAR-",
|
186 |
+
"IGHV3-20*02": "EVQLVESGGGVVRPGGSLRLSFAASGFTF--DDYGMSWVRQAPGKGLEWVSGINWN--GGSTGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYHCAR-",
|
187 |
+
"IGHV3-20*03": "EVQLVESGGGVVRPGGSLRLSFAASGFTF--DDYGMSWVRQAPGKGLEWVSGINWN--GGSTGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYCAR-",
|
188 |
+
"IGHV3-20*04": "EVQLVESGGGVVRPGGSLRLSCAASGFTF--DDYGMSWVRQAPGKGLEWVSGINWN--GGSTGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYCAR-",
|
189 |
+
"IGHV3-21*01": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
190 |
+
"IGHV3-21*02": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
191 |
+
"IGHV3-21*03": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
192 |
+
"IGHV3-21*04": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
193 |
+
"IGHV3-21*05": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSYISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
194 |
+
"IGHV3-21*06": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSSISSS--SSYIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
195 |
+
"IGHV3-22*01": "EVHLVESGGALVQPGGSLRLSCAASGFTF--SYYYMSGVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMKSLKTEDTAVYYCSR-",
|
196 |
+
"IGHV3-22*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SYYYMSGVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMKSLKTEDTAVYYCSR-",
|
197 |
+
"IGHV3-23*01": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSAISGS--GGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
198 |
+
"IGHV3-23*02": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSAISGS--GGSTYYGDSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
199 |
+
"IGHV3-23*03": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSVIYSG--GSSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
200 |
+
"IGHV3-23*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSAISGS--GGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
201 |
+
"IGHV3-23D*01": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SSYAMSWVRQAPGKGLEWVSAISGS--GGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
202 |
+
"IGHV3-25*01": "EMQLVESGGGLQKPAWSPRLSCAASQFTF--SSYYMNCVRQAPGNGLELV-QVNPN--GGSTYLIDSGKDRFNTSRDNAKNTLHLQMNSLKTEDTALY-CTR-",
|
203 |
+
"IGHV3-25*02": "EMQLVESGGGLAKPAWSPRLSCAASQFTF--SSYYMNCVRQAPGNGLELV-QVNPN--GGSTYLIDSGKDRFNTSRDNAKNTLHLQMNSLKTEDTALY-CTR-",
|
204 |
+
"IGHV3-25*04": "ETQLVESGGGLAKPGRSPRLSCAASQFTF--SSYYMNCVRQAPGNGLELVGQVNPN--GGSTYLIDSGKDRFNTSRDNAKNTLHLQMNSLKTEDTALYYCTR-",
|
205 |
+
"IGHV3-25*05": "EMQLVESGGGLAKPAWSPRLSCAASQFTF--SSYYMNCVRQAPGNGLELVGQVNPN--GGSTYLIDSGKDRFNTSRDNAKNTLHLQMNSLKTEDTALY-CTR-",
|
206 |
+
"IGHV3-29*01": "EVELIEPTEDLRQPGKFLRLSCVASRFAF--SSF-MSPVHQSAGKGLE-VIDIKDD--GSQIHHADSVKGRFSISKDNAKNSLYLQMNSQRTEDMAVYGCT-G",
|
207 |
+
"IGHV3-30*01": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
208 |
+
"IGHV3-30*02": "QVQLVESGGGVVQPGGSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAFIRYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
209 |
+
"IGHV3-30*03": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
210 |
+
"IGHV3-30*04": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
211 |
+
"IGHV3-30*05": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEGTAVYYCAR-",
|
212 |
+
"IGHV3-30*06": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
213 |
+
"IGHV3-30*07": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
214 |
+
"IGHV3-30*08": "QVQLVDSGGGVVQPGRSLRLSCAASAFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
215 |
+
"IGHV3-30*09": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFAISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
216 |
+
"IGHV3-30*10": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYTDSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
217 |
+
"IGHV3-30*11": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
218 |
+
"IGHV3-30*12": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
219 |
+
"IGHV3-30*13": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNRLYLQMNSLRAEDTAVYYCAR-",
|
220 |
+
"IGHV3-30*14": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
221 |
+
"IGHV3-30*15": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYCAR-",
|
222 |
+
"IGHV3-30*16": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
223 |
+
"IGHV3-30*17": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
224 |
+
"IGHV3-30*18": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
225 |
+
"IGHV3-30*19": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
226 |
+
"IGHV3-30-2*01": "EVQLVESGEDPRQPGGSLRLSCADSGLTF--SSY-RNSVSQAPGKGLE-VVDIQCD--GSQICYA-SLKSKFTISKENAKNSLYLLMNSLRAAGTAVCYCM-G",
|
227 |
+
"IGHV3-30-22*01": "EVELIESIEDLRQPGKFLRLSCVASRFAF--SSF-MSRVHQSPGKGLE-VIDIKDD--GSQIHHADSVKGRFSISKDNAKNSLYLQMNSQRAEDMDVYGCT-G",
|
228 |
+
"IGHV3-30-3*01": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
229 |
+
"IGHV3-30-3*02": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
230 |
+
"IGHV3-30-3*03": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
231 |
+
"IGHV3-30-33*01": "EVQLVESGEDPRQPGGSLRLSCADSGLTF--SSY-RSSVSQAPGKGLE-VVDIQCD--GSQICYA-SLKSKFTISKENAKNSLYLLMNSLRAEGTAVCYCM--",
|
232 |
+
"IGHV3-30-42*01": "EVELIEPTEDLRQPGKFLRLSCVASRFAF--SSF-MSPVHQSAGKGLE-VIDIKDD--GSQIHHADSVKGRFSISKDNAKNSLYLQMNSQRTEDMAVYGCT-G",
|
233 |
+
"IGHV3-30-5*01": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
234 |
+
"IGHV3-30-5*02": "QVQLVESGGGVVQPGGSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAFIRYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
235 |
+
"IGHV3-30-52*01": "EVQLVESGEDPRQPGGSLRLSCADSGLTF--SSY-RNSVSQAPGKGLE-VVDIQCD--GSQICYA-SLKSKFTISKENAKNSLYLLMNSLRAAGTAVCYCM--",
|
236 |
+
"IGHV3-32*01": "EVELIESIEDLRQPGKFLRLSCVASRFAF--SSF-MSRVHQSPGKGLE-VIDIKDD--GSQIHHADSVKGRFSISKDNAKNSLYLQMNTQRAEDVAVYGYT-G",
|
237 |
+
"IGHV3-33*01": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
238 |
+
"IGHV3-33*02": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSAKGRFTISRDNSTNTLFLQMNSLRAEDTAVYYCAR-",
|
239 |
+
"IGHV3-33*03": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
240 |
+
"IGHV3-33*04": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
241 |
+
"IGHV3-33*05": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVISYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
242 |
+
"IGHV3-33*06": "QVQLVESGGGVVQPGRSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
243 |
+
"IGHV3-33*07": "QVQLVESGGRVVQPGRSLRLSCAASGFTF--SRYGMYWVRQAPGKGLEWVAVIWYD--GSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
244 |
+
"IGHV3-33-2*01": "EVQLVESGEDPRQPGGSLRLSCADSGLTF--SSY-MSSVSQAPGKGLE-VVDIQCD--GSQICYAQSVKSKFTISKENAKNSLYLQMNSLRAEGTAVCYCM-G",
|
245 |
+
"IGHV3-35*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNSDMNWVHQAPGKGLEWVSGVSWN--GSRTHYADSVKGRFIISRDNSRNTLYLQTNSLRAEDTAVYYCVR-",
|
246 |
+
"IGHV3-38*01": "EVQLVESGGGLVQPRGSLRLSCAASGFTV--SSNEMSWIRQAPGKGLEWVSSISG----GSTYYADSRKGRFTISRDNSKNTLYLQMNNLRAEGTAAYYCARY",
|
247 |
+
"IGHV3-38*02": "EVQLVESGGGLVQPRGSLRLSCAASGFTV--SSNEMSWIRQAPGKGLEWVSSISG----GSTYYADSRKGRFTISRDNSKNTLYLQMNNLRAEGTAVYYCARY",
|
248 |
+
"IGHV3-38*03": "EVQLVESGGGLVQPRGSLRLSCAASGFTV--SSNEMSWIRQAPGKGLEWVSSISG----GSTYYADSRKGRFTISRDNSKNTLYLQMNNLRAEGTAVYYCARY",
|
249 |
+
"IGHV3-38-3*01": "EVQLVESRGVLVQPGGSLRLSCAASGFTV--SSNEMSWVRQAPGKGLEWVSSISG----GSTYYADSRKGRFTISRDNSKNTLHLQMNSLRAEDTAVYYCKK-",
|
250 |
+
"IGHV3-41*02": "EVQLVESGGGLVQPGGSLRLSCAASGFSF--SSYGMSWVRQAPGKGLD-VAHIWND--GSQKYYADSVKGRFTISRDNSKSMLYLQMDSLKAKDTAMYYCTR-",
|
251 |
+
"IGHV3-43*01": "EVQLVESGGVVVQPGGSLRLSCAASGFTF--DDYTMHWVRQAPGKGLEWVSLISWD--GGSTYYADSVKGRFTISRDNSKNSLYLQMNSLRTEDTALYYCAKD",
|
252 |
+
"IGHV3-43*02": "EVQLVESGGGVVQPGGSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSLISGD--GGSTYYADSVKGRFTISRDNSKNSLYLQMNSLRTEDTALYYCAKD",
|
253 |
+
"IGHV3-43D*03": "EVQLVESGGVVVQPGGSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSLISWD--GGSTYYADSVKGRFTISRDNSKNSLYLQMNSLRAEDTALYYCAKD",
|
254 |
+
"IGHV3-43D*04": "EVQLVESGGVVVQPGGSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSLISWD--GGSTYYADSVKGRFTISRDNSKNSLYLQMNSLRAEDTALYYCAKD",
|
255 |
+
"IGHV3-47*01": "EDQLVESGGGLVQPGGSLRPSCAASGFAF--SSYALHWVRRAPGKGLEWVSAIGTG---GDTYYADSVMGRFTISRDNAKKSLYLHMNSLIAEDMAVYYCAR-",
|
256 |
+
"IGHV3-47*02": "EDQLVESGGGLVQPGGSLRPSCAASGFAF--SSYVLHWVRRAPGKGPEWVSAIGTG---GDTYYADSVMGRFTISRDNAKKSLYLQMNSLIAEDMAVYYCAR-",
|
257 |
+
"IGHV3-48*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSYISSS--SSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
258 |
+
"IGHV3-48*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSYISSS--SSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRDEDTAVYYCAR-",
|
259 |
+
"IGHV3-48*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYEMNWVRQAPGKGLEWVSYISSS--GSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
260 |
+
"IGHV3-48*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYSMNWVRQAPGKGLEWVSYISSS--SSTIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
261 |
+
"IGHV3-49*01": "EVQLVESGGGLVQPGRSLRLSCTASGFTF--GDYAMSWFRQAPGKGLEWVGFIRSKAYGGTTEYTASVKGRFTISRDGSKSIAYLQMNSLKTEDTAVYYCTR-",
|
262 |
+
"IGHV3-49*02": "EVQLVESGGGLVQPGPSLRLSCTASGFTF--GYYPMSWVRQAPGKGLEWVGFIRSKAYGGTTEYAASVKGRFTISRDDSKSIAYLQMNSLKTEDTAVYYCTR-",
|
263 |
+
"IGHV3-49*03": "EVQLVESGGGLVQPGRSLRLSCTASGFTF--GDYAMSWFRQAPGKGLEWVGFIRSKAYGGTTEYAASVKGRFTISRDDSKSIAYLQMNSLKTEDTAVYYCTR-",
|
264 |
+
"IGHV3-49*04": "EVQLVESGGGLVQPGRSLRLSCTASGFTF--GDYAMSWVRQAPGKGLEWVGFIRSKAYGGTTEYAASVKGRFTISRDDSKSIAYLQMNSLKTEDTAVYYCTR-",
|
265 |
+
"IGHV3-49*05": "EVQLVESGGGLVKPGRSLRLSCTASGFTF--GDYAMSWFRQAPGKGLEWVGFIRSKAYGGTTEYAASVKGRFTISRDDSKSIAYLQMNSLKTEDTAVYYCTR-",
|
266 |
+
"IGHV3-52*01": "EVQLVESG-GLVQPGGSLRLSCAASGFTF--SSSWMHWVCQAPEKGLEWVADIKCD--GSEKYYVDSVKGRLTISRDNAKNSLYLQVNSLRAEDMTVYYCVR-",
|
267 |
+
"IGHV3-52*03": "EVQLVESG-GLVQPGGSLRLSCAASGFTF--SSSWMHWVCQAPEKGLEWVADIKCD--GSEKYYVDSVKGRLTISRDNAKNSLYLQVNSLRAEDMTVYYCVR-",
|
268 |
+
"IGHV3-53*01": "EVQLVESGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
269 |
+
"IGHV3-53*02": "EVQLVETGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
270 |
+
"IGHV3-53*03": "EVQLVESGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQPPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
271 |
+
"IGHV3-53*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRHNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
272 |
+
"IGHV3-53*05": "EVQLVETGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
273 |
+
"IGHV3-54*01": "EVQLVESEENQRQLGGSLRLSCADSGLTF--SSY-MSSDSQAPGKGLE-VVDI--D--RSQLCYAQSVKSRFTISKENAKNSLCLQMNSLRAEGTAVYYCM--",
|
274 |
+
"IGHV3-54*02": "EVQLVESEENQRQLGGSLRLSCADSGLTF--SSY-MSSDSQAPGKGLE-VVDI-YD--RSQICYAQSVKSRFTISKENAKNSLRLQMNSLRAEGTAVYYCM--",
|
275 |
+
"IGHV3-54*04": "EVQLVESEENQRQLGGSLRLSCADSGLTF--SSY-MSSDSQAPGKGLE-VVDI--D--RSQLCYAQSVKSRFTISKENAKNSLCLQMNSLRAEGTAVYYCM--",
|
276 |
+
"IGHV3-62*01": "EVQLVESGEGLVQPGGSLRLSCAASGFTF--SSSAMHWVRQAPRKGL-WVSVISTS--GDTVLYTDSVKGRFTISRDNAQNSLSLQMNSLRAEGTVVYYCVK-",
|
277 |
+
"IGHV3-62*03": "EVQLVESGEGLVQPGGSLRLSCAASGFTF--SSSAMHWVRQAPRKGL-WVSVISTS--GDTVLYTDSVKGRFTISRDNAQNSLYLQMNSLRADDMAVYYCVK-",
|
278 |
+
"IGHV3-62*04": "EVQLVKSGGGLVQPGGSLRLSCAASGFTF--SSSAMHWVRQAPRKGLEWVSVISTS--GDTVLYTDSVKGRFTISRDNAQNSLSLQMNSLRAEDMAVYYCVK-",
|
279 |
+
"IGHV3-63*01": "EVELIESIEGLRQLGKFLRLSCVASGFTF--SSY-MSWVNETLGKGLEGVIDVKYD--GSQIYHADSVKGRFTISKDNAKNSPYLQTNSLRAEDMTMHGCT-G",
|
280 |
+
"IGHV3-64*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYANSVKGRFTISRDNSKNTLYLQMGSLRAEDMAVYYCAR-",
|
281 |
+
"IGHV3-64*02": "EVQLVESGEGLVQPGGSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMGSLRAEDMAVYYCAR-",
|
282 |
+
"IGHV3-64*03": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYVQMSSLRAEDTAVYYCVK-",
|
283 |
+
"IGHV3-64*04": "QVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
284 |
+
"IGHV3-64*05": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYVQMSSLRAEDTAVYYCVK-",
|
285 |
+
"IGHV3-64*07": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMGSLRAEDMAVYYCAR-",
|
286 |
+
"IGHV3-64D*06": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYCVK-",
|
287 |
+
"IGHV3-64D*08": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYCVK-",
|
288 |
+
"IGHV3-64D*09": "EVQLVESGGGLVQPGGSLRLSCSASGFTF--SSYAMHWVRQAPGKGLEYVSAISSN--GGSTYYADSVKGRFTISRDNSKNTLYLQMSSLRAEDTAVYYCVK-",
|
289 |
+
"IGHV3-66*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
290 |
+
"IGHV3-66*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
291 |
+
"IGHV3-66*03": "EVQLVESGGGLIQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSC---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
292 |
+
"IGHV3-66*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTV--SSNYMSWVRQAPGKGLEWVSVIYSG---GSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAR-",
|
293 |
+
"IGHV3-69-1*01": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMNWVRQAPGKGLEWVSSISSS---STIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
294 |
+
"IGHV3-69-1*02": "EVQLVESGGGLVKPGGSLRLSCAASGFTF--SDYYMNWVRQAPGKGLEWVSSISSS---STIYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
295 |
+
"IGHV3-7*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
296 |
+
"IGHV3-7*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
297 |
+
"IGHV3-7*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
298 |
+
"IGHV3-7*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
299 |
+
"IGHV3-7*05": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMSWVRQAPGKGLEWVANIKQD--GSEKYYVDSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAR-",
|
300 |
+
"IGHV3-71*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDYYMSWVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMNSLRAEDTAVYYCAR-",
|
301 |
+
"IGHV3-71*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDYYMSWVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMNSLRAEDMAVYYCAR-",
|
302 |
+
"IGHV3-71*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDYYMSWVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMNSLRAEDTAVYYCAR-",
|
303 |
+
"IGHV3-71*04": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDYYMSWVRQAPGKGLEWVGFIRNKANGGTTE-TTSVKGRFTISRDDSKSITYLQMNSLRAEDTAVYYCAR-",
|
304 |
+
"IGHV3-72*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDHYMDWVRQAPGKGLEWVGRTRNKANSYTTEYAASVKGRFTISRDDSKNSLYLQMNSLKTEDTAVYYCAR-",
|
305 |
+
"IGHV3-73*01": "EVQLVESGGGLVQPGGSLKLSCAASGFTF--SGSAMHWVRQASGKGLEWVGRIRSKANSYATAYAASVKGRFTISRDDSKNTAYLQMNSLKTEDTAVYYCTR-",
|
306 |
+
"IGHV3-73*02": "EVQLVESGGGLVQPGGSLKLSCAASGFTF--SGSAMHWVRQASGKGLEWVGRIRSKANSYATAYAASVKGRFTISRDDSKNTAYLQMNSLKTEDTAVYYCTR-",
|
307 |
+
"IGHV3-74*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQAPGKGLVWVSRINSD--GSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCAR-",
|
308 |
+
"IGHV3-74*02": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQAPGKGLVWVSRINSD--GSSTSYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCAR-",
|
309 |
+
"IGHV3-74*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQAPGKGLVWVSRINSD--GSSTTYADSVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCAR-",
|
310 |
+
"IGHV3-9*01": "EVQLVESGGGLVQPGRSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSGISWN--SGSIGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYCAKD",
|
311 |
+
"IGHV3-9*02": "EVQLVESGGGLVQPGRSLRLSCAASGFTS--DDYAMHWVRQAPGKGLEWVSGISWN--SGSIGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTALYYCAKD",
|
312 |
+
"IGHV3-9*03": "EVQLVESGGGLVQPGRSLRLSCAASGFTF--DDYAMHWVRQAPGKGLEWVSGISWN--SGSIGYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMALYYCAKD",
|
313 |
+
"IGHV3-NL1*01": "QVQLVESGGGVVQPGGSLRLSCAASGFTF--SSYGMHWVRQAPGKGLEWVSVIYSG--GSSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAK-",
|
314 |
+
"IGHV3/OR15-7*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDHYMSWVRQAQGKGLELVGLIRNKANSYTTEYAASVKGRLTISREDSKNTMYLQMSNLKTEDLAVYYCAR-",
|
315 |
+
"IGHV3/OR15-7*02": "EVQLLESGGGLVQPGGSLRLSCAASGFTF--SDHYMSWVRQAQGKGLELVGLIRNKANSYTTEYAASVKGRLTISREDSKNTLYLQMSSLKTEDLAVYYCAR-",
|
316 |
+
"IGHV3/OR15-7*03": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDHYMSWVRQAQGKGLELVGLIRNKANSYTTEYAASVKGRLTISREDSKNTLYLQMSSLKTEDLAVYYCAR-",
|
317 |
+
"IGHV3/OR15-7*05": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SDHYMSWVRQAQGKGLELVGLIRNKANSYTTEYAASVKGRLTISREDSKNTLYLQMSNLKTEDLAVYYCAR-",
|
318 |
+
"IGHV3/OR16-10*01": "EVQLVQSGGGLVHPGGSLRLSCAGSGFTF--SSYAMHWVRQAPGKGLEWVSAIGTG---GGTYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMAVYYCAR-",
|
319 |
+
"IGHV3/OR16-10*02": "EVQLVQSGGGLVQPGGSLRLSCAGSGFTF--SSYAMHWVRQAPGKGLEWVSAIGTG---GGTYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMAVYYCAR-",
|
320 |
+
"IGHV3/OR16-10*03": "EVQLVESGGGLVQPGGSLRLSCAGSGFTF--SSYAMHWVRQAPGKGLEWVSAIGTG---GGTYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMAVYYCAR-",
|
321 |
+
"IGHV3/OR16-12*01": "EVQLVESGRGLAQPGGYLKLSGAASGFTV--GSWYMSWIHQAPGKGLEWVSYISSS--GCSTNYADSVKGRFTISTDNSKNTLYLQMNSLRVEDTAVYYCAR-",
|
322 |
+
"IGHV3/OR16-13*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQAPGKGLVWVSRINSD--GSSTSYADSMKGQFTISRDNAKNTLYLQMNSLRAEDMAVYYCTR-",
|
323 |
+
"IGHV3/OR16-14*01": "EVQLEESGGGLVQPGGSLRLSCAASGFTF--SSYWMHWVRQSPGKGLV-VSRINSD--GSSTSYADSLKGQFTISRDNAKNTLYLQMNSLRAEDMAVYYCTR-",
|
324 |
+
"IGHV3/OR16-15*01": "EVQLVESGGGLVQPGGSLRLSCAASVFTF--SNSDINWVL-APGKGLEWVSGISWN--GGKTHYVDSVKGQFSISRDNSSKSLYLQKNRQRAKDMAVYYCVR-",
|
325 |
+
"IGHV3/OR16-15*02": "EVQLVESGGGLVQPGGSLRHSCAASGFTF--SNSDMNWVL-APGKGLEWVSGISWN--GGKTHYVDSVKGQFTISRDNSSKSLYLQKNRQRAKDMAVYYCVR-",
|
326 |
+
"IGHV3/OR16-16*01": "EVQLVESGGGLVQPGGSLRHSCAASGFTF--SNSDMNWVL-APGKGLEWVSDISWN--GGKTHYVDSVKGQFTISRDNSSKSLYLQKNRQRAKDMAVYYCVR-",
|
327 |
+
"IGHV3/OR16-17*01": "EVQLVESGGGLVQPGGSLRLSCPDSGFTF--SNHYMSWVRQAPGKGLEWISYISGD--SGYTNYADSVKGRFTISRDNANNSPYLQMNSLRAEDTAVYYCVK-",
|
328 |
+
"IGHV3/OR16-18*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNSDMNWVL-APGKGLEWVSGISWN--GGKTHYVDSVKGQFTISRDNSSKSLYLQKNRQRAKDMAVYYCVR-",
|
329 |
+
"IGHV3/OR16-20*01": "EVQLVQSGGGLVQPGGSLRLSCAGSGFTF--SSYAMHWVRQAPGKGLEWVSAIGTG---GGTYYADSVKGRFTISRDNAKNSLYLQMNSLRAEDMAVYYCAR-",
|
330 |
+
"IGHV3/OR16-6*02": "EVQLVESAGGLGTAWGSLRLSCAASGFTC--SNAWMSWVRQAPGKGLEWVGCIKSKANGGTTDYAAPVKGRFTISRDDSKNTLYLQMISLKTEDTAVYYCTT-",
|
331 |
+
"IGHV3/OR16-8*01": "EVQLVESGGGLVQPGGSLRLSCPASGFTF--SNHYMSWVRQAPGKGLEWVSYISGD--SGYTNYADSVKGRFTISRDNANNSPYLQMNSLRAEDTAVYYCVK-",
|
332 |
+
"IGHV3/OR16-8*02": "EVQLVESGGGLVQPGGSLRLSCPDSGFTF--SNHYMSWVRQAPGKGLEWISYISGD--SGYTNYADSVKGRFTISRDNANNSPYLQMNSLRAEDTAVYYCVK-",
|
333 |
+
"IGHV3/OR16-9*01": "EVQLVESGGGLVQPGGSLRLSCAASGFTF--SNHYTSWVRQAPGKGLEWVSYSSGN--SGYTNYADSVKGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCVK-",
|
334 |
+
"IGHV4-28*01": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
|
335 |
+
"IGHV4-28*02": "QVQLQESGPGLVKPSQTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSIYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
|
336 |
+
"IGHV4-28*03": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
|
337 |
+
"IGHV4-28*04": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTGVYYCAR-",
|
338 |
+
"IGHV4-28*05": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSIYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
|
339 |
+
"IGHV4-28*06": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTMSVDTSKNQFSLKLSSVTALDTAVYYCAR-",
|
340 |
+
"IGHV4-28*07": "QVQLQESGPGLVKPSDTLSLTCAVSGYSIS-SSNWWGWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
|
341 |
+
"IGHV4-30-2*01": "QLQLQESGSGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHS---GSTYYNPSLKSRVTISVDRSKNQFSLKLSSVTAADTAVYYCAR-",
|
342 |
+
"IGHV4-30-2*03": "QLQLQESGSGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
343 |
+
"IGHV4-30-2*05": "QLQLQESGSGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYHS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
344 |
+
"IGHV4-30-2*06": "QLQLQESGSGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQSPGKGLEWIGYIYHS---GSTYYNPSLKSRVTISVDRSKNQFSLKLSSVTAADTAVYYCAR-",
|
345 |
+
"IGHV4-30-4*01": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGDYYWSWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
346 |
+
"IGHV4-30-4*02": "QVQLQESGPGLVKPSDTLSLTCTVSGGSISSGDYYWSWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
347 |
+
"IGHV4-30-4*07": "QVQLQESGPGLVKPSQTLSLTCAVSGGSISSGGYSWSWIRQPPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
348 |
+
"IGHV4-31*01": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYS---GSTYYNPSLKSLVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
349 |
+
"IGHV4-31*02": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
350 |
+
"IGHV4-31*03": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGGYYWSWIRQHPGKGLEWIGYIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
351 |
+
"IGHV4-31*10": "QVQLQESGPGLLKPSQTLSLTCTVSGGSISSGGYYWSWIRQHPGKGLEWIGCIYYS---GSTYYNPSLKSRVTISVDPSKNQFSLKPSSVTAADTAVDYCAR-",
|
352 |
+
"IGHV4-34*01": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
353 |
+
"IGHV4-34*02": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
354 |
+
"IGHV4-34*04": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNNNPSLKSRATISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
355 |
+
"IGHV4-34*05": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWCWIRQPLGKGLEWIGEINHS---GSTNNNPSLKSRATISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
356 |
+
"IGHV4-34*09": "QVQLQESGPGLVKPSQTLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
357 |
+
"IGHV4-34*10": "QVQLQESGPGLVKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEINHS---GSTNYNPSLKSRITMSVDTSKNQFYLKLSSVTAADTAVYYCAR-",
|
358 |
+
"IGHV4-34*11": "QVQLQQWGAGLLKPSETLSLTCAVYGGSV--SGYYWSWIRQPPGKGLEWIGYIYYS---GSTNNNPSLKSRATISVDTSKNQFSLNLSSVTAADTAVYCCAR-",
|
359 |
+
"IGHV4-34*12": "QVQLQQWGAGLLKPSETLSLTCAVYGGSF--SGYYWSWIRQPPGKGLEWIGEIIHS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
360 |
+
"IGHV4-38-2*01": "QVQLQESGPGLVKPSETLSLTCAVSGYSIS-SGYYWGWIRQPPGKGLEWIGSIYHS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
361 |
+
"IGHV4-38-2*02": "QVQLQESGPGLVKPSETLSLTCTVSGYSIS-SGYYWGWIRQPPGKGLEWIGSIYHS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
362 |
+
"IGHV4-39*01": "QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
363 |
+
"IGHV4-39*02": "QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNHFSLKLSSVTAADTAVYYCAR-",
|
364 |
+
"IGHV4-39*06": "RLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNQFPLKLSSVTAADTAVYYCAR-",
|
365 |
+
"IGHV4-39*07": "QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGSIYYS---GSTYYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
366 |
+
"IGHV4-4*01": "QVQLQESGPGLVKPPGTLSLTCAVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GSTNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYCCAR-",
|
367 |
+
"IGHV4-4*02": "QVQLQESGPGLVKPSGTLSLTCAVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GSTNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYYCAR-",
|
368 |
+
"IGHV4-4*07": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPAGKGLEWIGRIYTS---GSTNYNPSLKSRVTMSVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
369 |
+
"IGHV4-4*08": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYTS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
370 |
+
"IGHV4-55*01": "QVQLQESGPGLVKPSETLSLICAVSGDSIS-SGNW-IWVRQPPGKGLEWIGEIHHS---GSTYYNPSLKSRITMSVDTSKNQFYLKLSSVTAADTAVYYCAR-",
|
371 |
+
"IGHV4-55*02": "QVQLQESGPGLVKPSETLSLICAVSGDSIS-SGNW-IWVRQPPGKGLEWIGEIHHS---GSTYYNPSLKSRITMSVDTSKNQFYLKLSSVTAADTAVYYCAR-",
|
372 |
+
"IGHV4-55*08": "QVQLQESGPGLVKPSETLSLICAVSGDSIS-SGNW-IWVRQPPGKGLEWIGEIHHS---GSTYYNPSLKSRITMSVDTSKNQFYLKLSSVTAADTAVYYCAR-",
|
373 |
+
"IGHV4-55*09": "QVQLQESGPGLVKPSETLSLICAVSGDSIS-SGNW-IWVRQPPGKGLEWIGEIHHS---GSTYYNPSLKSRITMSVDTSKNQFSLKLSSVTAVDTAVYYCAR-",
|
374 |
+
"IGHV4-59*01": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
375 |
+
"IGHV4-59*02": "QVQLQESGPGLVKPSETLSLTCTVSGGSV--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
376 |
+
"IGHV4-59*07": "QVQLQESGPGLVKPSDTLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
377 |
+
"IGHV4-59*08": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
378 |
+
"IGHV4-59*10": "QVQLQQWGAGLLKPSETLSLTCAVYGGSI--SSYYWSWIRQPAGKGLEWIGRIYTS---GSTNYNPSLKSRVTMSVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
379 |
+
"IGHV4-59*11": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSHYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
380 |
+
"IGHV4-59*13": "QVQLQESGPGLVKPSETLSLTCTVSGGSI--SSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
381 |
+
"IGHV4-61*01": "QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
382 |
+
"IGHV4-61*02": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGRIYTS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
383 |
+
"IGHV4-61*03": "QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGSYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNHFSLKLSSVTAADTAVYYCAR-",
|
384 |
+
"IGHV4-61*05": "QLQLQESGPGLVKPSETLSLTCTVSGGSISSSSYYWGWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYYCAR-",
|
385 |
+
"IGHV4-61*08": "QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGGYYWSWIRQPPGKGLEWIGYIYYS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
386 |
+
"IGHV4-61*09": "QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKGLEWIGHIYTS---GSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR-",
|
387 |
+
"IGHV4/OR15-8*01": "QVQLQESGPGLVKPSETLSLTCVVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GSPNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYYCAR-",
|
388 |
+
"IGHV4/OR15-8*02": "QVQLQESGPGLVKPSETLSLTCVVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GNPNYNPSLKSRVTISIDKSKNQFSLKLSSVTAADTAVYYCAR-",
|
389 |
+
"IGHV4/OR15-8*03": "QVQLQESGPGLVKPSETLSLTCVVSGGSIS-SSNWWSWVRQPPGKGLEWIGEIYHS---GSPNYNPSLKSRVTISVDKSKNQFSLKLSSVTAADTAVYYCAR-",
|
390 |
+
"IGHV5-10-1*01": "EVQLVQSGAEVKKPGESLRISCKGSGYSF--TSYWISWVRQMPGKGLEWMGRIDPS--DSYTNYSPSFQGHVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
|
391 |
+
"IGHV5-10-1*02": "EVQLVQSGAEVKKPGESLRISCKGSGYSF--TSYWISWVRQMPGKGLEWMGRIDPS--DSYTNYSPSFQGHVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
|
392 |
+
"IGHV5-10-1*03": "EVQLVQSGAEVKKPGESLRISCKGSGYSF--TSYWISWVRQMPGKGLEWMGRIDPS--DSYTNYSPSFQGHVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
|
393 |
+
"IGHV5-10-1*04": "EVQLVQSGAEVKKPGESLRISCKGSGYSF--TSYWISWVRQMPGKGLEWMGRIDPS--DSYTNYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
|
394 |
+
"IGHV5-51*01": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWIGWVRQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
|
395 |
+
"IGHV5-51*02": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWTGWVRQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
|
396 |
+
"IGHV5-51*03": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWIGWVRQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
|
397 |
+
"IGHV5-51*04": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWIGWVRQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKPISTAYLQWSSLKASDTAMYYCAR-",
|
398 |
+
"IGHV5-51*07": "EVQLVQSGAEVKKPGESLKISCKGSGYSF--TSYWIGWVHQMPGKGLEWMGIIYPG--DSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCAR-",
|
399 |
+
"IGHV5-78*01": "EVQLLQSAAEVKRPGESLRISCKTSGYSF--TSYWIHWVRQMPGKELEWMGSIYPG--NSDTRYSPSFQGHVTISADSSSSTAYLQWSSLKASDAAMYYCVR-",
|
400 |
+
"IGHV6-1*01": "QVQLQQSGPGLVKPSQTLSLTCAISGDSVSSNSAAWNWIRQSPSRGLEWLGRTYYRS-KWYNDYAVSVKSRITINPDTSKNQFSLQLNSVTPEDTAVYYCAR-",
|
401 |
+
"IGHV6-1*02": "QVQLQQSGPGLVKPSQTLSLTCAISGDSVSSNSAAWNWIRQSPSRGLEWLGRTYYRS-KWYNDYAVSVKSRITINPDTSKNQFSLQLNSVTPEDTAVYYCAR-",
|
402 |
+
"IGHV6-1*03": "QVQLQQSGPGLVKPSQTLSLTCAISGDSVSSNSAAWNWIRQSPSRGLEWLGRTYYRS-KWYNDYAVSVKS-ITINPDTSKNQFSLQLNSVTPEDTAVYYCAR-",
|
403 |
+
"IGHV7-34-1*01": "-LQLVQSGPEVKKPGASVKVSYKSSGYTF--TIYGMNWV--TPGQGFEWM-WIITY--TGNPTYTHGFTGWFVFSMDTSVSTACLQISSLKAEDTAEYYCAK-",
|
404 |
+
"IGHV7-34-1*02": "-LQLVQSGPEVKKPGASVKVSYKSSGYTF--TIYGMNWV--TPGQGFEWM-WIITY--NGNPTYTHGFTGWFVFSMDTSVSTACLQISSLKAEDTAEYYCAK-",
|
405 |
+
"IGHV7-34-1*03": "-LQLVQSGPEVKKRGASVKVSYKSSGYTF--TIYGMNWV--TPGQGFEWM-WIITY--TGNPTYTHGFTGWFVFSMDTSVSTACLQISSLKAEDTAEYYCAK-",
|
406 |
+
"IGHV7-4-1*01": "QVQLVQSGSELKKPGASVKVSCKASGYTF--TSYAMNWVRQAPGQGLEWMGWINTN--TGNPTYAQGFTGRFVFSLDTSVSTAYLQICSLKAEDTAVYYCAR-",
|
407 |
+
"IGHV7-4-1*02": "QVQLVQSGSELKKPGASVKVSCKASGYTF--TSYAMNWVRQAPGQGLEWMGWINTN--TGNPTYAQGFTGRFVFSLDTSVSTAYLQISSLKAEDTAVYYCAR-",
|
408 |
+
"IGHV7-4-1*04": "QVQLVQSGSELKKPGASVKVSCKASGYTF--TSYAMNWVRQAPGQGLEWMGWINTN--TGNPTYAQGFTGRFVFSLDTSVSMAYLQISSLKAEDTAVYYCAR-",
|
409 |
+
"IGHV7-4-1*05": "QVQLVQSGSELKKPGASVKVSCKASGYTF--TSYAMNWVRQAPGQGLEWMGWINTN--TGNPTYAQGFTGRFVFSLDTSVSMAYLQISSLKAEDTAVCYCAR-",
|
410 |
+
"IGHV7-40*03": "FSIEKSNNLSVNQWMIR-NMIYVNHGILC--SQYGMNSV-PAPGQGLEWMGWIITY--TGNPTYTNGFTGRFLFSMDTSVSMAYLQISSLKAEDTAVYDCMR-",
|
411 |
+
"IGHV7-81*01": "QVQLVQSGHEVKQPGASVKVSCKASGYSF--TTYGMNWVPQAPGQGLEWMGWFNTY--TGNPTYAQGFTGRFVFSMDTSASTAYLQISSLKAEDMAMYYCAR-",
|
412 |
+
"IGHV8-51-1*01": "EAQLTESGGDLVH-EGPLRLSCAASWFTF--SIYEIHWVCQASGKGLEWVAVIWRS--ESHQYNADYVRGRLTTSRDNTKYMLYMQMNSLRTQNMAAFNCAG-",
|
413 |
+
"IGHV8-51-1*02": "EAQLTESGGDLVHLEGPLRLSCAASWFTF--SIYEIHWVCQASGKGLEWVAVIWRG--ESHQYNADYVRGRLTTSRDNTKYMLYMQMISLRTQNMAAFNCAG-",
|
414 |
+
"IGHV8-51-1*03": "EAQLTESGGDLVH-EGPLRLSCAASWFTF--SIYEIHWVCQASGKGLEWVAVIWRG--ESHQYNADYVRGRLTTSRDNTKYMLYMQMNSLRTQNMAAFNCAG-"
|
415 |
+
}
|
416 |
+
},
|
417 |
+
'L': {
|
418 |
+
"positions": [
|
419 |
+
"L1", "L2", "L3", "L4", "L5", "L6", "L7", "L8", "L9", "L10", "L11", "L12", "L13", "L14", "L15", "L16", "L17", "L18", "L19", "L20", "L21", "L22", "L23", "L24", "L25", "L26", "L27", "L28", "L29", "L30", "L31", "L32", "L33", "L34", "L35", "L36", "L37", "L38", "L39", "L40", "L41", "L42", "L43", "L44", "L45", "L46", "L47", "L48", "L49", "L50", "L51", "L52", "L53", "L54", "L55", "L56", "L57", "L58", "L59", "L62", "L63", "L64", "L65", "L66", "L67", "L68", "L69", "L70", "L71", "L72", "L74", "L75", "L76", "L77", "L78", "L79", "L80", "L81", "L82", "L83", "L84", "L85", "L86", "L87", "L88", "L89", "L90", "L91", "L92", "L93", "L94", "L95", "L96", "L97", "L98", "L99", "L100", "L101", "L102", "L103", "L104", "L105", "L106", "L107", "L108", "L109", "L110", "L111", "L111A", "L111B", "L111C", "L111D"
|
420 |
+
],
|
421 |
+
"aligned_sequences": {
|
422 |
+
"IGLV1-36*01": "QSVLTQPPS-VSEAPRQRVTISCSGSSSNI----GNNAVNWYQQLPGKAPKLLIYYD-----DLLPSGVSDRFSGSK--SGTSASLAISGLQSEDEADYYCAAWDDSLNG--",
|
423 |
+
"IGLV1-40*01": "QSVLTQPPS-VSGAPGQRVTISCTGSSSNIG---AGYDVHWYQQLPGTAPKLLIYGN-----SNRPSGVPDRFSGSK--SGTSASLAITGLQAEDEADYYCQSYDSSLSG--",
|
424 |
+
"IGLV1-40*02": "QSVVTQPPS-VSGAPGQRVTISCTGSSSNIG---AGYDVHWYQQLPGTAPKLLIYGN-----SNRPSGVPDRFSGSK--SGTSASLAITGLQAEDEADYYCQSYDSSLSG--",
|
425 |
+
"IGLV1-40*03": "QSVVTQPPS-VSGAPGQRVTISCTGSSSNIG---AGYDVHWYQQLPGTAPKLLIYGN-----SNRPSGVPDRFSGSK--SGASASLAITGLQAEDEADYYCQSYDSSLSG--",
|
426 |
+
"IGLV1-41*01": "QSVLTQPPS-VSAAPGQKVTISCSGSSSDM----GNYAVSWYQQLPGTAPKLLIYEN-----NKRPSGIPDRFSGSK--SGTSATLGITGLWPEDEADYYCLAWDTSPRA--",
|
427 |
+
"IGLV1-41*02": "QSVLTQPPS-VSAAPGQKVTISCSGSSSDM----GNYAVSWYQQLPGTAPKLLIYEN-----NKRPSGIPDRFSGSK--SGTSATLGITGLWPED-ADYYCLAWDTSLRA--",
|
428 |
+
"IGLV1-44*01": "QSVLTQPPS-ASGTPGQRVTISCSGSSSNI----GSNTVNWYQQLPGTAPKLLIYSN-----NQRPSGVPDRFSGSK--SGTSASLAISGLQSEDEADYYCAAWDDSLNG--",
|
429 |
+
"IGLV1-47*01": "QSVLTQPPS-ASGTPGQRVTISCSGSSSNI----GSNYVYWYQQLPGTAPKLLIYRN-----NQRPSGVPDRFSGSK--SGTSASLAISGLRSEDEADYYCAAWDDSLSG--",
|
430 |
+
"IGLV1-47*02": "QSVLTQPPS-ASGTPGQRVTISCSGSSSNI----GSNYVYWYQQLPGTAPKLLIYSN-----NQRPSGVPDRFSGSK--SGTSASLAISGLRSEDEADYYCAAWDDSLSG--",
|
431 |
+
"IGLV1-47*03": "QSVLTQPPS-ASGTPGQRVTISCSGSSSNI----GSNYVYWYQQLPGTAPKLLIYRN-----NQRPSGVPDRFSGSK--SGTSASLAISGLWSEDEADYYCAAWDDSLSG--",
|
432 |
+
"IGLV1-50*01": "QSVLTQPPS-VSGAPGQRVTISCTGSSSNIG---AGYVVHWYQQLPGTAPKLLIYGN-----SNRPSGVPDQFSGSK--SGTSASLAITGLQSEDEADYYCKAWDNSLNA--",
|
433 |
+
"IGLV1-51*01": "QSVLTQPPS-VSAAPGQKVTISCSGSSSNI----GNNYVSWYQQLPGTAPKLLIYDN-----NKRPSGIPDRFSGSK--SGTSATLGITGLQTGDEADYYCGTWDSSLSA--",
|
434 |
+
"IGLV1-51*02": "QSVLTQPPS-VSAAPGQKVTISCSGSSSNI----GNNYVSWYQQLPGTAPKLLIYEN-----NKRPSGIPDRFSGSK--SGTSATLGITGLQTGDEADYYCGTWDSSLSA--",
|
435 |
+
"IGLV1-62*01": "QSVLTQPPS-VSWATRQRLTVSCTGSSSNTG---TGYNVNCWQ-LPRTDPKLLRHGD-----KNWASWVSDQFSGSK--SGSLASLGTTGLWAEDKTDYHCQSRDIC-VL--",
|
436 |
+
"IGLV10-54*01": "QAGLTQPPS-VSKGLRQTATLTCTGNSNNV----GNQGAAWLQQHQGHPPKLLSYRN-----NNRPSGISERLSASR--SGNTASLTITGLQPEDEADYYCSAWDSSLSA--",
|
437 |
+
"IGLV10-54*02": "QAGLTQPPS-VSKGLRQTATLTCTGNSNIV----GNQGAAWLQQHQGHPPKLLSYRN-----NNRPSGISERFSASR--SGNTASLTITGLQPEDEADYYCSALDSSLSA--",
|
438 |
+
"IGLV10-54*03": "QAGLTQPPS-VSKGLRQTATLTCTGNSNNV----GNQGAAWPEQHQGHPPKLLSYRN-----NNRPSGISERLSASR--SGNTASLTITGLQPEDEADYYCSAWDSSLSA--",
|
439 |
+
"IGLV11-55*01": "RPVLTQPPS-LSASPGATARLPCTLSSDLSV---GGKNMFWYQQKPGSSPRLFLYHYSD-SDKQLGPGVPSRVSGSKETSSNTAFLLISGLQPEDEADYYCQVYESSAN---",
|
440 |
+
"IGLV11-55*02": "RPVLTQPPS-LSASPGATARLPCTLSSDLSV---GGKNMFWYQQKLGSSPRLFLYHYSD-SDKQLGPGVPSRVSGSKETSSNTAFLLISGLQPEDEADYYCQVYESSAN---",
|
441 |
+
"IGLV2-11*01": "QSALTQPRS-VSGSPGQSVTISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYDV-----SKRPSGVPDRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSYTF--",
|
442 |
+
"IGLV2-11*02": "QSALTQPRS-VSGSPGQSVTISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYDV-----SKRPSGVPDRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSYTF--",
|
443 |
+
"IGLV2-14*01": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYEV-----SNRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTL--",
|
444 |
+
"IGLV2-14*02": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---SYNLVSWYQQHPGKAPKLMIYEG-----SKRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTL--",
|
445 |
+
"IGLV2-14*03": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYDV-----SNRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTL--",
|
446 |
+
"IGLV2-18*01": "QSALTQPPS-VSGSPGQSVTISCTGTSSDVG---SYNRVSWYQQPPGTAPKLMIYEV-----SNRPSGVPDRFSGSK--SGNTASLTISGLQAEDEADYYCSLYTSSSTF--",
|
447 |
+
"IGLV2-18*02": "QSALTQPPS-VSGSPGQSVTISCTGTSSDVG---SYNRVSWYQQPPGTAPKLMIYEV-----SNRPSGVPDRFSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTF--",
|
448 |
+
"IGLV2-18*03": "QSALTQPPS-VSGSPGQSVTISCTGTSSDVG---SYNRVSWYQQPPGTAPKLMIYEV-----SNRPSGVPDRFSGSK--SGNTASLTTSGLQAEDEADYYCSSYTSSSTF--",
|
449 |
+
"IGLV2-18*04": "QSALTQPPS-VSGSPGQSVTISCTGTSSDVG---SYNRVSWYQQPPGTAPKLMIYEV-----SNRPSGVPDRSSGSK--SGNTASLTISGLQAEDEADYYCSSYTSSSTF--",
|
450 |
+
"IGLV2-23*01": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---SYNLVSWYQQHPGKAPKLMIYEG-----SKRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSSTL--",
|
451 |
+
"IGLV2-23*02": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---SYNLVSWYQQHPGKAPKLMIYEV-----SKRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSSTF--",
|
452 |
+
"IGLV2-23*03": "QSALTQPAS-VSGSPGQSITISCTGTSSDVG---SYNLVSWYQQHPGKAPKLMIYEG-----SKRPSGVSNRFSGSK--SGNTASLTISGLQAEDEADYYCCSYAGSSTF--",
|
453 |
+
"IGLV2-33*01": "QSALTQPPF-VSGAPGQSVTISCTGTSSDVG---DYDHVFWYQKRLSTTSRLLIYNV-----NTRPSGISDLFSGSK--SGNMASLTISGLKSEVEANYHCSLYSSSYTF--",
|
454 |
+
"IGLV2-33*02": "QSALTQPPF-VSGAPGQSVTISCTGTSSDVG---DYDHVFWYQKRLSTTSRLLIYNV-----NTRPSGISDLFSGSK--SGNMASLTISGLKSEVEANYHCSLYSSSYTF--",
|
455 |
+
"IGLV2-33*03": "QSALTQPPF-VSGAPGQSVTISCTGTSSDVG---DYDHVFWYQKRLSTTSRLLIYNV-----NTRPSGISDLFSGSK--SGNVASLTISGLKSEVEANYHCSLYSSSYTF--",
|
456 |
+
"IGLV2-34*01": "QSVLTQPRS-VSRSPGQ-VTIFCTGTSSDIG---GYDLVSWCQ-HPGKAPKLMIYDV-----ANWPSGAPGCFSGSK--SGNTASLTISGLQAEDEADYYCSSYAGSYNF--",
|
457 |
+
"IGLV2-34*02": "QSVLTQPRS-VSRSPGQ-VTIFCTGTSSDIG---GYDLVSWCQ-HPGKAPKLMIYDV-----GNWPSGAPGCFSGSK--SGNTASLTISGLQAEDEADYYCSSYAGSYNF--",
|
458 |
+
"IGLV2-5*01": "QSALIQPPS-VSGSPGQSVTISCTGTSSDVG---SYDYVSWYQQHPGTVPKPMIYNV-----NTQPSGVPDRFSGSK--SGNTASMTISGLQAEDEADY-CCSYTSSAT---",
|
459 |
+
"IGLV2-5*02": "QSALIQPPS-VSGSPGQSVTISCTGTSSDVG---SYDYVSWYQQHPGTVPKPMIYNV-----NTRPSGVPDRFSGSK--SGNTASMTISGLQAEDEADY-CCSYTSSAT---",
|
460 |
+
"IGLV2-8*01": "QSALTQPPS-ASGSPGQSVTISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYEV-----SKRPSGVPDRFSGSK--SGNTASLTVSGLQAEDEADYYCSSYAGSNNF--",
|
461 |
+
"IGLV2-8*02": "QSALTQPPS-ASRSPGQSVTISCTGTSSDVG---GYNYVSWYQQHPGKAPKLMIYEV-----SKRPSGVPDRFSGSK--SGNTASLTVSGLQAEDEADYYCSSYAGSNNF--",
|
462 |
+
"IGLV3-1*01": "SYELTQPPS-VSVSPGQTASITCSGDKLG------DKYACWYQQKPGQSPVLVIYQD-----SKRPSGIPERFSGSN--SGNTATLTISGTQAMDEADYYCQAWDSSTA---",
|
463 |
+
"IGLV3-10*01": "SYELTQPPS-VSVSPGQTARITCSGDALP------KKYAYWYQQKSGQAPVLVIYED-----SKRPSGIPERFSGSS--SGTMATLTISGAQVEDEADYYCYSTDSSGNH--",
|
464 |
+
"IGLV3-10*03": "SYELTQPPS-VSVSPGQTARITCSGDALP------KKYAYWYQQKSGQAPVLVIYED-----SKRPSGIPERFSGSS--SGTMATLTISGAQVEDEDDYYCYSTDSSGNH--",
|
465 |
+
"IGLV3-12*01": "SYELTQPHS-VSVATAQMARITCGGNNIG------SKAVHWYQQKPGQDPVLVIYSD-----SNRPSGIPERFSGSN--PGNTTTLTISRIEAGDEADYYCQVWDSSSDH--",
|
466 |
+
"IGLV3-12*02": "SYELTQPHS-VSVATAQMARITCGGNNIG------SKAVHWYQQKPGQDPVLVIYSD-----SNRPSGIPERFSGSN--PGNTATLTISRIEAGDEADYYCQVWDSSSDH--",
|
467 |
+
"IGLV3-13*01": "SYELTQPPA-VSVSPGQTARISCSGDVLR------DNYADWYPQKPGQAPVLVIYKD-----GERPSGIPERFSGST--SGNTTALTISRVLTKGGADYYCFSGD-NN----",
|
468 |
+
"IGLV3-13*02": "SYELTQPPA-VSVSPGQTARISCSGDVLR------DNYADWYPQKPGQTPVLVIYKD-----GERPSGIPERFSGST--SGNTTALTISRVLTKGGADYYCFSGD-NN----",
|
469 |
+
"IGLV3-16*01": "SYELTQPPS-VSVSLGQMARITCSGEALP------KKYAYWYQQKPGQFPVLVIYKD-----SERPSGIPERFSGSS--SGTIVTLTISGVQAEDEADYYCLSADSSGTY--",
|
470 |
+
"IGLV3-19*01": "SSELTQDPA-VSVALGQTVRITCQGDSLR------SYYASWYQQKPGQAPVLVIYGK-----NNRPSGIPDRFSGSS--SGNTASLTITGAQAEDEADYYCNSRDSSGNH--",
|
471 |
+
"IGLV3-19*02": "SSELTQDPA-VSVALGQTVRITCQGDSLR------SYYASWYQQKPGQAPVRVIYGK-----NNRPSGIPDRFSGSS--SGNTASLTITGAQAEDEADYYCNSWDSSGNH--",
|
472 |
+
"IGLV3-21*01": "SYVLTQPPS-VSVAPGKTARITCGGNNIG------SKSVHWYQQKPGQAPVLVIYYD-----SDRPSGIPERFSGSN--SGNTATLTISRVEAGDEADYYCQVWDSSSDH--",
|
473 |
+
"IGLV3-21*02": "SYVLTQPPS-VSVAPGQTARITCGGNNIG------SKSVHWYQQKPGQAPVLVVYDD-----SDRPSGIPERFSGSN--SGNTATLTISRVEAGDEADYYCQVWDSSSDH--",
|
474 |
+
"IGLV3-21*03": "SYVLTQPPS-VSVAPGKTARITCGGNNIG------SKSVHWYQQKPGQAPVLVVYDD-----SDRPSGIPERFSGSN--SGNTATLTISRVEAGDEADYYCQVWDSSSDH--",
|
475 |
+
"IGLV3-21*04": "SYVLTQPPS-VSVAPGKTARITCGGNNIG------SKSVHWYQQKPGQAPVLVIYYD-----SDRPSGIPERFSGSN--SGNTATLTISRVEAGDEADYYCQVWDSSSDH--",
|
476 |
+
"IGLV3-22*01": "SYELTQLPS-VSVSPGQTARITCSGDVLG------ENYADWYQQKPGQAPELVIYED-----SERYPGIPERFSGST--SGNTTTLTISRVLTEDEADYYCLSGDEDN----",
|
477 |
+
"IGLV3-22*03": "SYELTQLPS-VSLSPGQKARITCSGDVLG------KNYADWYQQKPGQAPELVIYED-----SERYPGIPERFSGST--SGNTTTLTISRVLTEDEADYYCLSGNEDN----",
|
478 |
+
"IGLV3-25*01": "SYELMQPPS-VSVSPGQTARITCSGDALP------KQYAYWYQQKPGQAPVLVIYKD-----SERPSGIPERFSGSS--SGTTVTLTISGVQAEDEADYYCQSADSSGTY--",
|
479 |
+
"IGLV3-25*02": "SYELTQPPS-VSVSPGQTARITCSGDALP------KQYAYWYQQKPGQAPVLVIYKD-----SERPSGIPERFSGSS--SGTTVTLTISGVQAEDEADYYCQSADSSGTY--",
|
480 |
+
"IGLV3-25*03": "SYELTQPPS-VSVSPGQTARITCSGDALP------KQYAYWYQQKPGQAPVLVIYKD-----SERPSGIPERFSGSS--SGTTVTLTISGVQAEDEADYYCQSADSSGTY--",
|
481 |
+
"IGLV3-27*01": "SYELTQPSS-VSVSPGQTARITCSGDVLA------KKYARWFQQKPGQAPVLVIYKD-----SERPSGIPERFSGSS--SGTTVTLTISGAQVEDEADYYCYSAADNN----",
|
482 |
+
"IGLV3-31*01": "SSELSQEPA-VSVALG-TARITCQGDSIE------DSVVNWYKQKPSQAPGLVI-LN-----SVQSSGIPKKFSGSS--SGNMATLTITGIQVEDKADYYCQSWDSSRTH--",
|
483 |
+
"IGLV3-31*02": "SSELSQEPA-VSVSLG-TARITCQGDSIE------DSVVNWYKQKPSQAPGLVI-LN-----SVQSSGIPKKFSGSS--SGNMATLTITGIQVEDKADYYCQSWDSSRTH--",
|
484 |
+
"IGLV3-32*01": "SSGPTQVPA-VSVALGQMARITCQGDSME------GSYEHWYQQKPGQAPVLVIYDS-----SDRPSRIPERFSGSK--SGNTTTLTITGAQAEDEADYYYQLIDNHATQ--",
|
485 |
+
"IGLV3-9*01": "SYELTQPLS-VSVALGQTARITCGGNNIG------SKNVHWYQQKPGQAPVLVIYRD-----SNRPSGIPERFSGSN--SGNTATLTISRAQAGDEADYYCQVWDSSTA---",
|
486 |
+
"IGLV3-9*02": "SYELTQPLS-VSVALGQAARITCGGNNLG------YKSVHWYQQKPGQAPVLVIYRD-----NNRPSGIPERFSGSN--SGNTATLTISRAQAGDEADYYCQVWDSSTAH--",
|
487 |
+
"IGLV4-3*01": "LPVLTQPPS-ASALLGASIKLTCTLSSEHS-----TYTIEWYQQRPGRSPQYIMKVKSD-GSHSKGDGIPDRFMGSS--SGADRYLTFSNLQSDDEAEYHCGESHTIDGQVG",
|
488 |
+
"IGLV4-60*01": "QPVLTQSSS-ASASLGSSVKLTCTLSSGHS-----SYIIAWHQQQPGKAPRYLMKLEGS-GSYNKGSGVPDRFSGSS--SGADRYLTISNLQLEDEADYYCETWDSNT----",
|
489 |
+
"IGLV4-60*02": "QPVLTQSSS-ASASLGSSVKLTCTLSSGHS-----SYIIAWHQQQPGKAPRYLMKLEGS-GSYNKGSGVPDRFSGSS--SGADRYLTISNLQFEDEADYYCETWDSNT----",
|
490 |
+
"IGLV4-60*03": "QPVLTQSSS-ASASLGSSVKLTCTLSSGHS-----SYIIAWHQQQPGKAPRYLMKLEGS-GSYNKGSGVPDRFSGSS--SGADRYLTISNLQSEDEADYYCETWDSNT----",
|
491 |
+
"IGLV4-69*01": "QLVLTQSPS-ASASLGASVKLTCTLSSGHS-----SYAIAWHQQQPEKGPRYLMKLNSD-GSHSKGDGIPDRFSGSS--SGAERYLTISSLQSEDEADYYCQTWGTGI----",
|
492 |
+
"IGLV4-69*02": "QLVLTQSPS-ASASLGASVKLTCTLSSGHS-----SYAIAWHQQQPEKGPRYLMKLNSD-GSHSKGDGIPDRFSGSS--SGAERYLTISSLQSEDEADYYCQTWGTGI----",
|
493 |
+
"IGLV5-37*01": "QPVLTQPPS-SSASPGESARLTCTLPSDINV---GSYNIYWYQQKPGSPPRYLLYYYSD-SDKGQGSGVPSRFSGSKDASANTGILLISGLQSEDEADYYCMIWPSNAS---",
|
494 |
+
"IGLV5-39*01": "QPVLTQPTS-LSASPGASARFTCTLRSGINV---GTYRIYWYQQKPGSLPRYLLRYKSD-SDKQQGSGVPSRFSGSKDASTNAGLLLISGLQSEDEADYYCAIWYSSTS---",
|
495 |
+
"IGLV5-39*02": "QPVLTQPTS-LSASPGASARFTCTLRSGINV---GTYRIYWYQQNPGSLPRYLLRYKSD-SDKQQGSGVPSRFSGSKDASTNAGLLLISGLQSEDEADYYCAIWYSSTS---",
|
496 |
+
"IGLV5-45*01": "QAVLTQPAS-LSASPGASASLTCTLRSGINV---GTYRIYWYQQKPGSPPQYLLRYKSD-SDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAS---",
|
497 |
+
"IGLV5-45*02": "QAVLTQPSS-LSASPGASASLTCTLRSGINV---GTYRIYWYQQKPGSPPQYLLRYKSD-SDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAS---",
|
498 |
+
"IGLV5-45*03": "QAVLTQPSS-LSASPGASASLTCTLRSGINV---GTYRIYWYQQKPGSPPQYLLRYKSD-SDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAS---",
|
499 |
+
"IGLV5-45*04": "QAVLTQPSS-LSASPGASASLTCTLCSGINV---GTYRIYWYQQKPGSPPQYLLRYKSD-SDKQQGSGVPSRFSGSKDASANAGILLISGLQSEDEADYYCMIWHSSAS---",
|
500 |
+
"IGLV5-48*01": "QPVLTQPTS-LSASPGASARLTCTLRSGINL---GSYRIFWYQQKPESPPRYLLSYYSD-SSKHQGSGVPSRFSGSKDASSNAGILVISGLQSEDEADYYCMIWHSSAS---",
|
501 |
+
"IGLV5-48*02": "QAVLTQPTS-LSASPGASARLTCTLRSGISV---GSYRIYWYQQKPGSPPRYLLNYYSD-SDKHQGSGVPSRFSGSKDASTNAGILFISGL-SEDEADYYCMIWHSSAS---",
|
502 |
+
"IGLV5-52*01": "QPVLTQPSS-HSASSGASVRLTCMLSSGFSV---GDFWIRWYQQKPGNPPRYLLYYHSD-SNKGQGSGVPSRFSGSNDASANAGILRISGLQPEDEADYYCGTWHSNSKT--",
|
503 |
+
"IGLV6-57*01": "NFMLTQPHS-VSESPGKTVTISCTRSSGSI----ASNYVQWYQQRPGSSPTTVIYED-----NQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSN----",
|
504 |
+
"IGLV6-57*02": "NFMLTQPHS-VSESPGKTVTISCTGSSGSI----ASNYVQWYQQRPGSAPTTVIYED-----NQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSN----",
|
505 |
+
"IGLV6-57*03": "NFMLTQPHS-VSESPGKTVTISCTRSSGSI----ASNYVQWYQQRPGSAPTTVIYED-----NQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSN----",
|
506 |
+
"IGLV7-43*01": "QTVVTQEPS-LTVSPGGTVTLTCASSTGAVT---SGYYPNWFQQKPGQAPRALIYST-----SNKHSWTPARFSGSL--LGGKAALTLSGVQPEDEAEYYCLLYYGGAQ---",
|
507 |
+
"IGLV7-46*01": "QAVVTQEPS-LTVSPGGTVTLTCGSSTGAVT---SGHYPYWFQQKPGQAPRTLIYDT-----SNKHSWTPARFSGSL--LGGKAALTLSGAQPEDEAEYYCLLSYSGAR---",
|
508 |
+
"IGLV7-46*02": "QAVVTQEPS-LTVSPGGTVTLTCGSSTGAVT---SGHYPYWFQQKPGQAPRTLIYDT-----SNKHSWTPARFSGSL--LGGKAALTLLGAQPEDEAEYYCLLSYSGAR---",
|
509 |
+
"IGLV8-61*01": "QTVVTQEPS-FSVSPGGTVTLTCGLSSGSVS---TSYYPSWYQQTPGQAPRTLIYST-----NTRSSGVPDRFSGSI--LGNKAALTITGAQADDESDYYCVLYMGSGI---",
|
510 |
+
"IGLV8-61*02": "QTVVTQEPS-FSVSPGGTVTLTCGLSSGSVS---TSYYPSWYQQTPGQAPRTLIYST-----NTRSSGVPDCFSGSI--LGNKAALTITGAQADDESDYYCVLYMGSGI---",
|
511 |
+
"IGLV8/OR8-1*02": "QSVVTQEPS-LSGSPGGTVTLTCALSSGSVS---TSHYPRWYQQTPGQAPHMLICSP-----NTCPSGVPGRFSGSI--LGNKAALTITGTQVDDDSDHYCVLYMGSGN---",
|
512 |
+
"IGLV9-49*01": "QPVLTQPPS-ASASLGASVTLTCTLSSGYS-----NYKVDWYQQRPGKGPRFVMRVGTGGIVGSKGDGIPDRFSVLG--SGLNRYLTIKNIQEEDESDYHCGADHGSGSNFV",
|
513 |
+
"IGLV9-49*02": "QPVLTQPPS-ASASLGASVTLTCTLSSGYS-----NYKVDWYQQRPGKGPRFVMRVGTGGIVGSKGDGIPDRFSVLG--SGLNRYLTIKNIQEEDESDYHCGADHGSGSNFV",
|
514 |
+
"IGLV9-49*03": "QPVLTQPPS-ASASLGASVTLTCTLSSGYS-----NYKVDWYQQRPGKGPRFVMRVGTGGIVGSKGDGIPDRFSVLG--SGLNRYLTIKNIQEEDESDYHCGADHGSGSNFV"
|
515 |
+
}
|
516 |
+
},
|
517 |
+
'K': {
|
518 |
+
"positions": [
|
519 |
+
"L1", "L2", "L3", "L4", "L5", "L6", "L7", "L8", "L9", "L10", "L11", "L12", "L13", "L14", "L15", "L16", "L17", "L18", "L19", "L20", "L21", "L22", "L23", "L24", "L25", "L26", "L27", "L28", "L29", "L30", "L31", "L32", "L33", "L34", "L35", "L36", "L37", "L38", "L39", "L40", "L41", "L42", "L43", "L44", "L45", "L46", "L47", "L48", "L49", "L50", "L51", "L52", "L53", "L54", "L55", "L56", "L57", "L58", "L59", "L62", "L63", "L64", "L65", "L66", "L67", "L68", "L69", "L70", "L71", "L72", "L74", "L75", "L76", "L77", "L78", "L79", "L80", "L81", "L82", "L83", "L84", "L85", "L86", "L87", "L88", "L89", "L90", "L91", "L92", "L93", "L94", "L95", "L96", "L97", "L98", "L99", "L100", "L101", "L102", "L103", "L104", "L105", "L106", "L107", "L108", "L109", "L110", "L111", "L111A", "L111B", "L111C", "L111D"
|
520 |
+
],
|
521 |
+
"aligned_sequences": {
|
522 |
+
"IGKV1-12*01": "DIQMTQSPSSVSASVGDRVTITCRASQGI------SSWLAWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQANSFP----",
|
523 |
+
"IGKV1-12*02": "DIQMTQSPSSVSASVGDRVTITCRASQGI------SSWLAWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQANSFP----",
|
524 |
+
"IGKV1-13*01": "AIQLTQSPSSLSASVGDRVTITCRASQGI------SSALA-YQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQFNNYP----",
|
525 |
+
"IGKV1-13*02": "AIQLTQSPSSLSASVGDRVTITCRASQGI------SSALAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQFNSYP----",
|
526 |
+
"IGKV1-16*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNYLAWFQQKPGKAPKSLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQYNSYP----",
|
527 |
+
"IGKV1-16*02": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNYLAWFQQKPGKAPKSLIYAA-----SSLQSGVPSKFSGSG--SGTDFTLTISSLQPEDFATYYCQQYNSYP----",
|
528 |
+
"IGKV1-17*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------RNDLGWYQQKPGKAPKRLIYAA-----SSLQSGVPSRFSGSG--SGTEFTLTISSLQPEDFATYYCLQHNSYP----",
|
529 |
+
"IGKV1-17*02": "DIQMTQSPSSLSASVGDRVTITCRASQGI------RNDLGWYQQKPGKAPKRLIYAA-----SSLQSGVPSRFSGSG--SGTEFTLTISNLQPEDFATYYCLQHNSYP----",
|
530 |
+
"IGKV1-17*03": "DIQMTQSPSAMSASVGDRVTITCRASQGI------SNYLAWFQQKPGKVPKRLIYAA-----SSLQSGVPSRFSGSG--SGTEFTLTISSLQPEDFATYYCLQHNSYP----",
|
531 |
+
"IGKV1-27*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNYLAWYQQKPGKVPKLLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISSLQPEDVATYYCQKYNSAP----",
|
532 |
+
"IGKV1-33*01": "DIQMTQSPSSLSASVGDRVTITCQASQDI------SNYLNWYQQKPGKAPKLLIYDA-----SNLETGVPSRFSGSG--SGTDFTFTISSLQPEDIATYYCQQYDNLP----",
|
533 |
+
"IGKV1-37*01": "DIQLTQSPSSLSASVGDRVTITCRVSQGI------SSYLNWYRQKPGKVPKLLIYSA-----SNLQSGVPSRFSGSG--SGTDFTLTISSLQPEDVATYYGQRTYNAP----",
|
534 |
+
"IGKV1-39*01": "DIQMTQSPSSLSASVGDRVTITCRASQSI------SSYLNWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQSYSTP----",
|
535 |
+
"IGKV1-39*02": "DIQMTQSPSFLSASVGDRVTITCRASQSI------SSYLNWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQCGYSTP----",
|
536 |
+
"IGKV1-5*01": "DIQMTQSPSTLSASVGDRVTITCRASQSI------SSWLAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTEFTLTISSLQPDDFATYYCQQYNSYS----",
|
537 |
+
"IGKV1-5*02": "DIQMTQSPSTLSASVGDRVTIICRASQSI------SSWLAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTEFTLTISSLQPDDFATYYCQQYNSYS----",
|
538 |
+
"IGKV1-5*03": "DIQMTQSPSTLSASVGDRVTITCRASQSI------SSWLAWYQQKPGKAPKLLIYKA-----SSLESGVPSRFSGSG--SGTEFTLTISSLQPDDFATYYCQQYNSYS----",
|
539 |
+
"IGKV1-6*01": "AIQMTQSPSSLSASVGDRVTITCRASQGI------RNDLGWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCLQDYNYP----",
|
540 |
+
"IGKV1-6*02": "AIQMTQSPSSLSASVGDRVTITCRASQGI------RNDLGWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCLQDYNYP----",
|
541 |
+
"IGKV1-8*01": "AIRMTQSPSSFSASTGDRVTITCRASQGI------SSYLAWYQQKPGKAPKLLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISCLQSEDFATYYCQQYYSYP----",
|
542 |
+
"IGKV1-9*01": "DIQLTQSPSFLSASVGDRVTITCRASQGI------SSYLAWYQQKPGKAPKLLIYAA-----STLQSGVPSRFSGSG--SGTEFTLTISSLQPEDFATYYCQQLNSYP----",
|
543 |
+
"IGKV1-NL1*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNSLAWYQQKPGKAPKLLLYAA-----SRLESGVPSRFSGSG--SGTDYTLTISSLQPEDFATYYCQQYYSTP----",
|
544 |
+
"IGKV1/OR-2*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPRKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSNP----",
|
545 |
+
"IGKV1/OR-3*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFAAYYCQQSDSTP----",
|
546 |
+
"IGKV1/OR-4*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKFLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSTP----",
|
547 |
+
"IGKV1/OR1-1*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGADYTLTIRSLQPEDFATYYCQQSDSTP----",
|
548 |
+
"IGKV1/OR10-1*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSTS----",
|
549 |
+
"IGKV1/OR15-118*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----PSLQSGIPSRFSDSG--SGADYTLTIRSLQPEDFATY-CQQSDSTP----",
|
550 |
+
"IGKV1/OR2-0*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----PSLQSGIPSRFSDSG--SGADYTLTIRSLQPEDFATYYCQQSDSTP----",
|
551 |
+
"IGKV1/OR2-1*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGADYTLTISSLQPEDFAAYYCQQSDSTP----",
|
552 |
+
"IGKV1/OR2-108*01": "DIQVTQSPSSLSASVGDRVTITCRASQGI------SNGLSWYQQKPGQAPTLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDVATYYCLQDYTTP----",
|
553 |
+
"IGKV1/OR2-11*01": "DIQMTQPPSSLSASVGDRATVSCQASQSI------YNYLNWYQQKPGKAPKFLTYRA-----SSLQRAMPSQFSGSG--YGRDFTLTVSSLQPEDFATY-CQQESIFP----",
|
554 |
+
"IGKV1/OR2-118*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGADYTLTIRSLQPEDFANYYCQQSDSTP----",
|
555 |
+
"IGKV1/OR2-2*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGADYTLTISSLQPEDFAAYYCQQSDSTP----",
|
556 |
+
"IGKV1/OR2-3*01": "DIQMTQPPSSLSASVGDRVTVSCQASQSI------YNYLNWYQQKPGKAPKFLTYRA-----SSLQRGMPSQFSGSG--YGRDFTLTVSSLQPEDFATY-CQQESIFP----",
|
557 |
+
"IGKV1/OR2-9*01": "DIQMTQPPSSLSASVGDRATVSCQASQSI------YNYLNWYQQKPGKAPKFLTYRA-----SSLQRAMPSQFSGSG--YGRDFTLTVSSLQPEDFATY-CQQESIFP----",
|
558 |
+
"IGKV1/OR22-5*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPGKTPKPLIYAA-----SSLQSGIPSQFSDSG--SGTD-TLTISSLQPEDFATYYCQQSYSTP----",
|
559 |
+
"IGKV1/OR22-5*02": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SNNLNWYQQKPGKTPKLLIYAA-----SSLQSGIPSQFSDSG--SGTD-TLTISSLQPEDFTTYYCQQSYSTP----",
|
560 |
+
"IGKV1/OR9-1*01": "DIQMTQSPSSLSASVGGRVTITCRVSQGI------SNNLNWYQQKPRKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSNP----",
|
561 |
+
"IGKV1/OR9-2*01": "DIQMTQSPSSLSASVGGRVTITCRASQGI------SNNLNWYQQKPRKTPKLLIYAA-----SSLQSGIPSRFSDSG--SGTDYTLTISSLQPEDFATYYCQQSDSNP----",
|
562 |
+
"IGKV1/ORY-1*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------INNLNWYQKKPGKTPKLLIYAA-----SSLQSGIPTRFSDSG--SGTDYTPTISSLQPEDFATYYCQQSDSTP----",
|
563 |
+
"IGKV1D-12*01": "DIQMTQSPSSVSASVGDRVTITCRASQGI------SSWLAWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQANSFP----",
|
564 |
+
"IGKV1D-12*02": "DIQMTQSPSSVSASVGDRVTITCRASQGI------SSWLAWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQANSFP----",
|
565 |
+
"IGKV1D-13*01": "AIQLTQSPSSLSASVGDRVTITCRASQGI------SSALAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQFNNYP----",
|
566 |
+
"IGKV1D-13*02": "AIQLTQSPSSLSASVGDRVTITCRASQGI------SSALAWYQQKPGKAPKLLIYDA-----SSLESGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQFNSYP----",
|
567 |
+
"IGKV1D-16*01": "DIQMTQSPSSLSASVGDRVTITCRASQGI------SSWLAWYQQKPEKAPKSLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQYNSYP----",
|
568 |
+
"IGKV1D-16*02": "DIQMTQSPSSLSASVGDRVTITCRARQGI------SSWLAWYQQKPEKAPKSLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQYNSYP----",
|
569 |
+
"IGKV1D-17*01": "NIQMTQSPSAMSASVGDRVTITCRARQGI------SNYLAWFQQKPGKVPKHLIYAA-----SSLQSGVPSRFSGSG--SGTEFTLTISSLQPEDFATYYCLQHNSYP----",
|
570 |
+
"IGKV1D-33*01": "DIQMTQSPSSLSASVGDRVTITCQASQDI------SNYLNWYQQKPGKAPKLLIYDA-----SNLETGVPSRFSGSG--SGTDFTFTISSLQPEDIATYYCQQYDNLP----",
|
571 |
+
"IGKV1D-37*01": "DIQLTQSPSSLSASVGDRVTITCRVSQGI------SSYLNWYRQKPGKVPKLLIYSA-----SNLQSGVPSRFSGSG--SGTDFTLTISSLQPEDVATYYGQRTYNAP----",
|
572 |
+
"IGKV1D-39*01": "DIQMTQSPSSLSASVGDRVTITCRASQSI------SSYLNWYQQKPGKAPKLLIYAA-----SSLQSGVPSRFSGSG--SGTDFTLTISSLQPEDFATYYCQQSYSTP----",
|
573 |
+
"IGKV1D-42*01": "DIQMIQSPSFLSASVGDRVSIICWASEGI------SSNLAWYLQKPGKSPKLFLYDA-----KDLHPGVSSRFSGRG--SGTDFTLTIISLKPEDFAAYYCKQDFSYP----",
|
574 |
+
"IGKV1D-42*02": "DIQMTQSPSFLSASVGDRVSIICWASEGI------SSNLAWYLQKPGKSPKLFLYDA-----KDLHPGVSSRFSGRG--SGTDFTLTIISLKPEDFAAYYCKQDFSYP----",
|
575 |
+
"IGKV1D-43*01": "AIRMTQSPFSLSASVGDRVTITCWASQGI------SSYLAWYQQKPAKAPKLFIYYA-----SSLQSGVPSRFSGSG--SGTDYTLTISSLQPEDFATYYCQQYYSTP----",
|
576 |
+
"IGKV1D-8*01": "VIWMTQSPSLLSASTGDRVTISCRMSQGI------SSYLAWYQQKPGKAPELLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISCLQSEDFATYYCQQYYSFP----",
|
577 |
+
"IGKV1D-8*02": "AIWMTQSPSLLSASTGDRVTISCRMSQGI------SSYLAWYQQKPGKAPELLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISCLQSEDFATYYCQQYYSFP----",
|
578 |
+
"IGKV1D-8*03": "VIWMTQSPSLLSASTGDRVTISCRMSQGI------SSYLAWYQQKPGKAPELLIYAA-----STLQSGVPSRFSGSG--SGTDFTLTISCLQSEDFATYYCQQYYSFP----",
|
579 |
+
"IGKV2-18*01": "DIVMTQTPPSLPVNPGEPASISCRSSQSLLHS-NGYTYLHWYLQKPGQSPQLLIYRV-----SNHLSGVPDRFSGSG--SGSDFTLKISWVEAEDVGVYYCMQATQFP----",
|
580 |
+
"IGKV2-24*01": "DIVMTQTPLSSPVTLGQPASISCRSSQSLVHS-DGNTYLSWLQQRPGQPPRLLIYKI-----SNRFSGVPDRFSGSG--AGTDFTLKISRVEAEDVGVYYCMQATQFP----",
|
581 |
+
"IGKV2-28*01": "DIVMTQSPLSLPVTPGEPASISCRSSQSLLHS-NGYNYLDWYLQKPGQSPQLLIYLG-----SNRASGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQALQTP----",
|
582 |
+
"IGKV2-29*01": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQSPQLLIYEV-----SSRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYY-MQGIHLP----",
|
583 |
+
"IGKV2-29*02": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQSPQLLIYEV-----SSRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGIHLP----",
|
584 |
+
"IGKV2-29*03": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQSPQLLIYEV-----SSRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGIHLP----",
|
585 |
+
"IGKV2-30*01": "DVVMTQSPLSLPVTLGQPASISCRSSQSLVYS-DGNTYLNWFQQRPGQSPRRLIYKV-----SNRDSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGTHWP----",
|
586 |
+
"IGKV2-30*02": "DVVMTQSPLSLPVTLGQPASISCRSSQSLVHS-DGNTYLNWFQQRPGQSPRRLIYKV-----SNRDSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGTHWP----",
|
587 |
+
"IGKV2-4*01": "DIVMTQHLLSLPIPLGEPASISCRSSQSLLHS-DGNTYLDWYLQKPGQSPQLLIYTI-----SNKFYGVPNKFSGSR--SGTGFTLKFSKVEAEDVGVYCCEQGLQGP----",
|
588 |
+
"IGKV2-40*01": "DIVMTQTPLSLPVTPGEPASISCRSSQSLLDSDDGNTYLDWYLQKPGQSPQLLIYTL-----SYRASGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQRIEFP----",
|
589 |
+
"IGKV2/OR2-7D*01": "DILLTQTPLSLSITPGEPASISCRSSRSLLHS-NGNTYLHW-LQKPGQPPQCLICKV-----SNRFSGVPDRFSGSG--SGIDFTLKISPVEAADVGVYITACKLHTGP---",
|
590 |
+
"IGKV2/OR22-4*01": "DIVMTQTPLSLPVTPGEPASISCRSSESLLDTDDEYTYLNWYLQKPGQSPQLLIYEV-----SNRASGVPDRFSGSG--SGTDFTLKISRVEA-DVGVYYCMQALQTP----",
|
591 |
+
"IGKV2D-18*01": "DIVMTQTPPSLPVNPGEPASISCRSSQSLLHS-NGYTYLHWYPQKPGQSPQLLIYRV-----SSRFSGVPDRFSGSG--SGSDFTLKISWVEAEDVGVYYCMQATQFP----",
|
592 |
+
"IGKV2D-24*01": "DIVMTQTPLSSPVTLGQPASISFRSSQSLVHS-DGNTYLSWLQQRPGQPPRLLIYKV-----SNRFSGVPDRFSGSG--AGTDFTLKISRVEAEDVGVYYCTQATQFP----",
|
593 |
+
"IGKV2D-26*01": "EIVMTQTPLSLSITPGEQASISCRSSQSLLHS-DGYTYLYWFLQKARPVSTLLIYEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDFGVYYCMQDAQDP----",
|
594 |
+
"IGKV2D-26*02": "EIVMTQTPLSLSITPGEQASMSCRSSQSLLHS-DGYTYLYWFLQKARPVSTLLICEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDFGVYYCMQDAQDP----",
|
595 |
+
"IGKV2D-26*03": "EIVMTQTPLSLSITPGEQASMSCRSSQSLLHS-DGYTYLYWFLQKARPVSTLLIYEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDFGVYYCMQDAQDP----",
|
596 |
+
"IGKV2D-28*01": "DIVMTQSPLSLPVTPGEPASISCRSSQSLLHS-NGYNYLDWYLQKPGQSPQLLIYLG-----SNRASGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQALQTP----",
|
597 |
+
"IGKV2D-29*01": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQPPQLLIYEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQSIQLP----",
|
598 |
+
"IGKV2D-29*02": "DIVMTQTPLSLSVTPGQPASISCKSSQSLLHS-DGKTYLYWYLQKPGQSPQLLIYEV-----SNRFSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQSIQLP----",
|
599 |
+
"IGKV2D-30*01": "DVVMTQSPLSLPVTLGQPASISCRSSQSLVYS-DGNTYLNWFQQRPGQSPRRLIYKV-----SNWDSGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQGTHWP----",
|
600 |
+
"IGKV2D-40*01": "DIVMTQTPLSLPVTPGEPASISCRSSQSLLDSDDGNTYLDWYLQKPGQSPQLLIYTL-----SYRASGVPDRFSGSG--SGTDFTLKISRVEAEDVGVYYCMQRIEFP----",
|
601 |
+
"IGKV3-11*01": "EIVLTQSPATLSLSPGERATLSCRASQSV------SSYLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--SGTDFTLTISSLEPEDFAVYYCQQRSNWP----",
|
602 |
+
"IGKV3-11*02": "EIVLTQSPATLSLSPGERATLSCRASQSV------SSYLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--SGRDFTLTISSLEPEDFAVYYCQQRSNWP----",
|
603 |
+
"IGKV3-15*01": "EIVMTQSPATLSVSPGERATLSCRASQSV------SSNLAWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTEFTLTISSLQSEDFAVYYCQQYNNWP----",
|
604 |
+
"IGKV3-20*01": "EIVLTQSPGTLSLSPGERATLSCRASQSVS-----SSYLAWYQQKPGQAPRLLIYGA-----SSRATGIPDRFSGSG--SGTDFTLTISRLEPEDFAVYYCQQYGSSP----",
|
605 |
+
"IGKV3-7*01": "EIVMTQSPPTLSLSPGERVTLSCRASQSVS-----SSYLTWYQQKPGQAPRLLIYGA-----STRATSIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDHNLP----",
|
606 |
+
"IGKV3-7*02": "EIVMTQSPPTLSLSPGERVTLSCRASQSVS-----SSYLSWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
|
607 |
+
"IGKV3-7*03": "EIVMTQSPPTLSLSPGERVTLSCRASQSVS-----SSYLTWYQQKPGQAPRLLIYGA-----STRATSIPARFSGSG--SGRDFTLTISSLQPEDFAVYYCQQDHNLP----",
|
608 |
+
"IGKV3-7*04": "EIVMTQSPPTLSLSPGERVTLSCRASQSVS-----SSYLTWYQQKPGQAPRLLIYGA-----STRATSIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
|
609 |
+
"IGKV3/OR2-268*01": "EIVMTQSPATLSLSPGERATLSCRASQSVS-----SSYLSWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
|
610 |
+
"IGKV3/OR2-268*02": "EIVMTQSPATLSLSPGERATLSCRASQSVS-----SSYLSWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
|
611 |
+
"IGKV3D-11*01": "EIVLTQSPATLSLSPGERATLSCRASQGV------SSYLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--PGTDFTLTISSLEPEDFAVYYCQQRSNWH----",
|
612 |
+
"IGKV3D-11*02": "EIVLTQSPATLSLSPGERATLSCRASQSV------SSYLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--PGTDFTLTISSLEPEDFAVYYCQQRSNWH----",
|
613 |
+
"IGKV3D-11*03": "EIVLTQSPATLSLSPGERATLSCRASQGV------SSNLAWYQQKPGQAPRLLIYDA-----SNRATGIPARFSGSG--PGTDFTLTISSLEPEDFAVYYCQQRSNWH----",
|
614 |
+
"IGKV3D-15*01": "EIVMTQSPATLSVSPGERATLSCRASQSV------SSNLAWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTEFTLTISSLQSEDFAVYYCQQYNNWP----",
|
615 |
+
"IGKV3D-15*02": "EIVMMQSPATLSVSPGERATLSCRASQSV------SSNLAWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTEFTLTISSLQSEDFAVYYCQQYNN-P----",
|
616 |
+
"IGKV3D-15*03": "EIVMTQSPATLSVSPGERATLSCRASQSV------SSNLAWYQQKPGQAPRLLIYGA-----SIRATGIPARFSGSG--SGTEFTLTISILQSEDFAVYYCQQYNNWP----",
|
617 |
+
"IGKV3D-20*01": "EIVLTQSPATLSLSPGERATLSCGASQSVS-----SSYLAWYQQKPGLAPRLLIYDA-----SSRATGIPDRFSGSG--SGTDFTLTISRLEPEDFAVYYCQQYGSSP----",
|
618 |
+
"IGKV3D-20*02": "EIVLTQSPATLSLSPGERATLSCRASQSVS-----SSYLAWYQQKPGQAPRLLIYDA-----SSRATGIPDRFSGSG--SGTDFTLTISRLEPEDFAVYYCQQRSNWH----",
|
619 |
+
"IGKV3D-7*01": "EIVMTQSPATLSLSPGERATLSCRASQSVS-----SSYLSWYQQKPGQAPRLLIYGA-----STRATGIPARFSGSG--SGTDFTLTISSLQPEDFAVYYCQQDYNLP----",
|
620 |
+
"IGKV4-1*01": "DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLAWYQQKPGQPPKLLIYWA-----STRESGVPDRFSGSG--SGTDFTLTISSLQAEDVAVYYCQQYYSTP----",
|
621 |
+
"IGKV5-2*01": "ETTLTQSPAFMSATPGDKVNISCKASQDI------DDDMNWYQQKPGEAAIFIIQEA-----TTLVPGIPPRFSGSG--YGTDFTLTINNIESEDAAYYFCLQHDNFP----",
|
622 |
+
"IGKV6-21*01": "EIVLTQSPDFQSVTPKEKVTITCRASQSI------GSSLHWYQQKPDQSPKLLIKYA-----SQSFSGVPSRFSGSG--SGTDFTLTINSLEAEDAATYYCHQSSSLP----",
|
623 |
+
"IGKV6-21*02": "EIVLTQSPDFQSVTPKEKVTITCRASQSI------GSSLHWYQQKPDQSPKLLIKYA-----SQSISGVPSRFSGSG--SGTDFTLTINSLEAEDAATYYCHQSSSLP----",
|
624 |
+
"IGKV6D-21*01": "EIVLTQSPDFQSVTPKEKVTITCRASQSI------GSSLHWYQQKPDQSPKLLIKYA-----SQSFSGVPSRFSGSG--SGTDFTLTINSLEAEDAATYYCHQSSSLP----",
|
625 |
+
"IGKV6D-21*02": "EIVLTQSPDFQSVTPKEKVTITCRASQSI------GSSLHWYQQKPDQSPKLLIKYA-----SQSISGVPSRFSGSG--SGTDFTLTINSLEAEDAAAYYCHQSSSLP----",
|
626 |
+
"IGKV6D-41*01": "DVVMTQSPAFLSVTPGEKVTITCQASEGI------GNYLYWYQQKPDQAPKLLIKYA-----SQSISGVPSRFSGSG--SGTDFTFTISSLEAEDAATYYCQQGNKHP----",
|
627 |
+
"IGKV7-3*01": "DIVLTQSPASLAVSPGQRATITCRASESVSF--LGINLIHWYQQKPGQPPKLLIYQA-----SNKDTGVPARFSGSG--SGTDFTLTINPVEANDTANYYCLQSKNFP----"
|
628 |
+
}
|
629 |
+
}
|
630 |
+
}
|
631 |
+
|
632 |
+
# Lookup table keyed by chain type ('H', 'L', 'K'). Each entry holds:
#   "positions":         the ordered position labels covered by the sequences
#   "aligned_sequences": allele name -> aligned sequence string, one character
#                        per entry of "positions" ('-' presumably marks a gap)
# Note that the kappa ('K') entry uses the same "L"-prefixed position labels
# as the lambda ('L') entry.
HUMAN_IMGT_IG_J = {
    "H": {
        "positions": (
            ["H112C", "H112B", "H112A"]
            + [f"H{number}" for number in range(112, 129)]
        ),
        "aligned_sequences": {
            "IGHJ1*01": "---AEYFQHWGQGTLVTVSS",
            "IGHJ2*01": "---YWYFDLWGRGTLVTVSS",
            "IGHJ3*01": "----DAFDVWGQGTMVTVSS",
            "IGHJ3*02": "----DAFDIWGQGTMVTVSS",
            "IGHJ4*01": "-----YFDYWGQGTLVTVSS",
            "IGHJ4*02": "-----YFDYWGQGTLVTVSS",
            "IGHJ4*03": "-----YFDYWGQGTLVTVSS",
            "IGHJ5*01": "----NWFDSWGQGTLVTVSS",
            "IGHJ5*02": "----NWFDPWGQGTLVTVSS",
            "IGHJ6*01": "YYYYYGMDVWGQGTTVTVSS",
            "IGHJ6*04": "YYYYYGMDVWGKGTTVTVSS",
        },
    },
    "L": {
        "positions": [f"L{number}" for number in range(116, 128)],
        "aligned_sequences": {
            "IGLJ1*01": "YVFGTGTKVTVL",
            "IGLJ2*01": "VVFGGGTKLTVL",
            "IGLJ3*01": "VVFGGGTKLTVL",
            "IGLJ3*02": "WVFGGGTKLTVL",
            "IGLJ4*01": "FVFGGGTQLIIL",
            "IGLJ5*01": "WVFGEGTELTVL",
            "IGLJ5*02": "WVFGEGTELTVL",
            "IGLJ6*01": "NVFGSGTKVTVL",
            "IGLJ7*01": "AVFGGGTQLTVL",
            "IGLJ7*02": "AVFGGGTQLTAL",
        },
    },
    "K": {
        "positions": [f"L{number}" for number in range(116, 128)],
        "aligned_sequences": {
            "IGKJ1*01": "WTFGQGTKVEIK",
            "IGKJ2*01": "YTFGQGTKLEIK",
            "IGKJ2*02": "CTFGQGTKLEIK",
            "IGKJ2*03": "YSFGQGTKLEIK",
            "IGKJ2*04": "CSFGQGTKLEIK",
            "IGKJ3*01": "FTFGPGTKVDIK",
            "IGKJ4*01": "LTFGGGTKVEIK",
            "IGKJ4*02": "LTFGGGTKVEIK",
            "IGKJ5*01": "ITFGQGTRLEIK",
        },
    },
}
|
abnumber/position.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
from typing import List, Union
|
3 |
+
|
4 |
+
from abnumber.common import _validate_chain_type, SCHEME_POSITION_TO_REGION, SCHEME_VERNIER, POS_REGEX
|
5 |
+
|
6 |
+
|
7 |
+
class Position:
    """Numbered position using a given numbering scheme

    Used as a key to store Position -> Amino acid information.

    Position objects are sortable according to the schema simply using ``sorted()``.
    Equality and hashing use the Heavy/Light prefix rather than the exact chain type,
    so Kappa and Lambda positions with the same number, letter and scheme are equal.
    """
    def __init__(self, chain_type: str, number: int, letter: str, scheme: str):
        _validate_chain_type(chain_type)
        self.chain_type: str = chain_type
        self.number: int = int(number)
        # Surrounding whitespace is dropped, so a blank insertion code becomes ''
        self.letter: str = letter.strip()
        self.scheme: str = scheme
        # Until set_cdr_definition() is called, the CDR definition mirrors the numbering scheme
        self.cdr_definition: str = self.scheme
        self.cdr_definition_position: int = self.number

    def copy(self):
        """Return a shallow copy of this position"""
        return copy.copy(self)

    def _key(self):
        # Note: We are not including chain_type, but just Heavy/Light flag, to keep Kappa and Lambda chain positions equal
        return self.chain_type_prefix(), self.number, self.letter, self.scheme

    def __repr__(self):
        return f'{self.chain_type_prefix()}{self.number}{self.letter} ({self.scheme})'

    def __str__(self):
        return self.format()

    def set_cdr_definition(self, cdr_definition: str, cdr_definition_position: int):
        """Set the CDR definition name and this position's number within that definition

        :param cdr_definition: CDR definition name, must not be None
        :param cdr_definition_position: position number in the CDR definition, must not be None
        """
        assert cdr_definition is not None, 'cdr_definition is required'
        assert cdr_definition_position is not None, 'cdr_definition_position is required'
        self.cdr_definition = cdr_definition
        self.cdr_definition_position = cdr_definition_position

    def format(self, chain_type=True, region=False, rjust=False, ljust=False, fillchar=' '):
        """Format Position to string

        :param chain_type: Add chain type prefix (H/L)
        :param region: Add region prefix (FR1, CDR1, ...)
        :param rjust: Align text to the right
        :param ljust: Align text to the left
        :param fillchar: Character to use for alignment padding
        :return: formatted string
        """
        formatted = f'{self.number}{self.letter}'
        if chain_type:
            formatted = f'{self.chain_type_prefix()}{formatted}'
        if region:
            formatted = f'{self.get_region()} {formatted}'
        # Padding width: 4 characters for number + letter, 1 for the chain prefix, 5 for the region prefix
        just = 4 + 1 * int(chain_type) + 5 * int(region)
        if rjust:
            formatted = formatted.rjust(just, fillchar)
        if ljust:
            formatted = formatted.ljust(just, fillchar)
        return formatted

    def __hash__(self):
        return self._key().__hash__()

    def __eq__(self, other):
        return isinstance(other, Position) and self._key() == other._key()

    def __ge__(self, other):
        return self == other or self > other

    def __le__(self, other):
        return self == other or self < other

    def __lt__(self, other):
        if not isinstance(other, Position):
            raise TypeError(f'Cannot compare Position object with {type(other)}: {other}')
        # NOTE: these checks are assert statements and are therefore skipped under ``python -O``
        assert self.is_heavy_chain() == other.is_heavy_chain(), f'Positions do not come from the same chain: {self}, {other}'
        assert self.scheme == other.scheme, 'Comparing positions in different schemes is not implemented'
        return self._sort_key() < other._sort_key()

    def chain_type_prefix(self):
        """Get the position prefix: ``"H"`` for heavy chain, ``"L"`` for both kappa and lambda chains

        :raises NotImplementedError: if the chain type is not one of H, K, L
        """
        if self.chain_type == 'H':
            return 'H'
        if self.chain_type in ['K', 'L']:
            return 'L'
        raise NotImplementedError(f'Unknown chain type "{self.chain_type}"')

    def _sort_key(self):
        # Positions without an insertion letter sort as 0
        letter_ord = ord(self.letter) if self.letter else 0
        if self.scheme == 'imgt':
            if self.number in [33, 61, 112]:
                # insertion letters at positions 33, 61 and 112 are sorted in reverse
                letter_ord = -letter_ord
        elif self.scheme in ['chothia', 'kabat', 'aho']:
            # all letters are sorted alphabetically for these schemes
            pass
        else:
            raise NotImplementedError(f'Cannot compare positions of scheme: {self.scheme}')
        return self.is_heavy_chain(), self.number, letter_ord

    def get_region(self):
        """Get string name of this position's region

        :return: uppercase string, one of: ``"FR1", "CDR1", "FR2", "CDR2", "FR3", "CDR3", "FR4"``
        """
        # Region tables are keyed either by the CDR definition alone or by "<definition>_<chain_type>"
        if self.cdr_definition in SCHEME_POSITION_TO_REGION:
            regions = SCHEME_POSITION_TO_REGION[self.cdr_definition]
        else:
            regions = SCHEME_POSITION_TO_REGION[f'{self.cdr_definition}_{self.chain_type}']
        return regions[self.cdr_definition_position]

    def is_in_cdr(self):
        """Check if given position is found in the CDR regions"""
        return self.get_region().lower().startswith('cdr')

    def is_in_vernier(self):
        """Check if given position is listed in the Vernier zone table

        :raises NotImplementedError: if the CDR definition is not ``"kabat"``
        """
        if self.cdr_definition != 'kabat':
            raise NotImplementedError('Vernier zone identification is currently supported '
                                      f'only with Kabat CDR definitions, got: {self.cdr_definition}')
        return self.cdr_definition_position in SCHEME_VERNIER.get(f'{self.cdr_definition}_{self.chain_type}', [])

    @classmethod
    def from_string(cls, position, chain_type, scheme):
        """Create Position object from string, e.g. "H5"

        Note that Positions parsed from string do not support separate CDR definitions.

        :raises IndexError: if the string does not match the expected format
            or carries a prefix that does not agree with ``chain_type``
        """
        match = POS_REGEX.match(position.upper())
        _validate_chain_type(chain_type)
        expected_chain_prefix = 'H' if chain_type == 'H' else 'L'
        if match is None:
            raise IndexError(f'Expected position format chainNumberLetter '
                             f'(e.g. "{expected_chain_prefix}112A" or "112A"), got: "{position}"')
        chain_prefix, number, letter = match.groups()
        number = int(number)
        if chain_prefix and expected_chain_prefix != chain_prefix:
            raise IndexError(f'Use no prefix or "{expected_chain_prefix}" prefix for "{chain_type}" chain. '
                             f'Got: "{chain_prefix}".')
        return cls(chain_type=chain_type, number=number, letter=letter, scheme=scheme)

    def is_heavy_chain(self):
        """Check if this position belongs to a heavy chain (chain type ``"H"``)"""
        return self.chain_type == 'H'

    def is_light_chain(self):
        """Check if this position belongs to a light chain (chain type ``"K"`` or ``"L"``)"""
        # Tuple membership, not ``in 'KL'``: the substring test also matched '' and 'KL'
        return self.chain_type in ('K', 'L')
|
148 |
+
|
149 |
+
|
150 |
+
def sort_positions(positions: List[str], chain_type: str, scheme: str) -> List:
    """Return the given position strings ordered according to the numbering scheme

    Either every position carries an ``H``/``L`` prefix or none does; the
    returned strings keep the prefix only when all inputs had one.
    """
    prefix_flags = [name.startswith(('H', 'L')) for name in positions]
    every_prefixed = all(prefix_flags)
    assert every_prefixed or not any(prefix_flags), 'Inconsistent position prefix'

    parsed = []
    for name in positions:
        parsed.append(Position.from_string(name, chain_type=chain_type, scheme=scheme))
    parsed.sort()

    return [item.format(chain_type=every_prefixed) for item in parsed]
|
anarci/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
anarci/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
__version__ = "1.b"
|
2 |
+
__all__ = ["anarci", "schemes"]
|
3 |
+
from .anarci import *
|
anarci/anarci.py
ADDED
@@ -0,0 +1,1013 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ANARCI - Antibody Numbering and Antigen Receptor ClassIfication
|
2 |
+
# Copyright (C) 2016 Oxford Protein Informatics Group (OPIG)
|
3 |
+
#
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.
|
13 |
+
#
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
'''
|
18 |
+
ANARCI - Antigen Receptor Numbering And ClassIfication
|
19 |
+
|
20 |
+
Oxford Protein Informatics Group (OPIG). 2015-17
|
21 |
+
|
22 |
+
ANARCI performs alignments of sequences to databases of Hidden Markov Models (HMMs).
|
23 |
+
Those that align with a significant score are classified by species and chain type.
|
24 |
+
They are then numbered with a scheme of the user's choosing.
|
25 |
+
|
26 |
+
Currently implemented schemes:
|
27 |
+
IMGT
|
28 |
+
Chothia (IGs only)
|
29 |
+
Kabat (IGs only)
|
30 |
+
Martin / Enhanced Chothia (IGs only)
|
31 |
+
AHo
|
32 |
+
Wolfguy (IGs only)
|
33 |
+
|
34 |
+
Currently recognisable species (chains):
|
35 |
+
Human (heavy, kappa, lambda, alpha, beta)
|
36 |
+
Mouse (heavy, kappa, lambda, alpha, beta)
|
37 |
+
Rat (heavy, kappa, lambda)
|
38 |
+
Rabbit (heavy, kappa, lambda)
|
39 |
+
Pig (heavy, kappa, lambda)
|
40 |
+
Rhesus Monkey (heavy, kappa)
|
41 |
+
|
42 |
+
Notes:
|
43 |
+
o Use assign_germline to get a better species assignment
|
44 |
+
o Each scheme has been implemented to follow the published specification as closely as possible. However, in places some schemes
|
45 |
+
do not specify where insertions should be placed (e.g. imgt FW3). In these cases the HMM alignment is used. This can give rise
|
46 |
+
to inserted positions that were not described by the respective paper.
|
47 |
+
o AHo is implemented heuristically based on chain type. If one grafted a foreign CDR1 loop onto, say, a VH domain, it will be
|
48 |
+
numbered as if it is a CDRH1 loop.
|
49 |
+
|
50 |
+
|
51 |
+
'''
|
52 |
+
|
53 |
+
import os
|
54 |
+
import sys
|
55 |
+
import tempfile
|
56 |
+
import gzip
|
57 |
+
import math
|
58 |
+
from functools import partial
|
59 |
+
from textwrap import wrap
|
60 |
+
from subprocess import Popen, PIPE
|
61 |
+
from itertools import groupby, islice
|
62 |
+
from multiprocessing import Pool
|
63 |
+
|
64 |
+
from Bio.SearchIO.HmmerIO import Hmmer3TextParser as HMMERParser
|
65 |
+
|
66 |
+
# Import from the schemes submodule
|
67 |
+
from .schemes import *
|
68 |
+
from .germlines import all_germlines
|
69 |
+
|
70 |
+
# Species names are the keys of the heavy-chain V germline table.
all_species = list(all_germlines['V']['H'].keys())

# Sorted one-letter amino acid codes, plus a set of the same letters for membership tests.
amino_acids = sorted("QWERTYIPASDFGHKLCVNM")
set_amino_acids = set(amino_acids)

# Directory containing this module.
anarci_path = os.path.dirname(__file__)

# Accepted scheme names (short and long forms) mapped to the long scheme name.
scheme_short_to_long = {
    "m": "martin",
    "c": "chothia",
    "k": "kabat",
    "imgt": "imgt",
    "kabat": "kabat",
    "chothia": "chothia",
    "martin": "martin",
    "i": "imgt",
    "a": "aho",
    "aho": "aho",
    "wolfguy": "wolfguy",
    "w": "wolfguy",
}

scheme_names = list(scheme_short_to_long)

# Chain type -> chain class; kappa ("K") and lambda ("L") both map to "L".
chain_type_to_class = {
    "H": "H",
    "K": "L",
    "L": "L",
    "A": "A",
    "B": "B",
    "G": "G",
    "D": "D",
}

HMM_path = os.path.join(anarci_path, "dat", "HMMs")

all_reference_states = list(range(1, 129))  # These are the IMGT reference states (matches)
|
84 |
+
|
85 |
+
class HMMscanError(Exception):
    """Exception carrying a single message (presumably raised when hmmscan fails)."""

    def __init__(self, message):
        # Forward the message to the base Exception constructor
        super().__init__(message)
|
89 |
+
|
90 |
+
## Utility functions ##
|
91 |
+
def read_fasta(filename):
    """
    Read a sequence file and return a list of (description, sequence) tuples
    """
    return list(fasta_iter(filename))
|
96 |
+
|
97 |
+
def fasta_iter(fasta_name):
    """
    Given a fasta file, yield tuples of header, sequence
    https://www.biostars.org/p/710/

    Files whose name ends in '.gz' are opened with gzip in text mode; an error
    is raised upon iteration if the file is not a real gzip file.
    The header is returned without the leading ">" and surrounding whitespace.
    Sequence lines are stripped and concatenated. A header that is not followed
    by any sequence line yields an empty sequence.
    The file is closed once iteration finishes or the generator is closed.
    """
    if fasta_name.endswith('.gz'):
        # Text mode is required: in binary mode line[0] is an int and never equals ">"
        fh = gzip.open(fasta_name, "rt")
    else:
        fh = open(fasta_name)
    with fh:
        # Alternating groups of header lines and sequence lines
        faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
        for header in faiter:
            header = next(header)[1:].strip()
            # Default to no lines so a trailing header does not raise StopIteration
            # inside the generator (which would surface as RuntimeError)
            seq = "".join(s.strip() for s in next(faiter, ()))
            yield header, seq
|
112 |
+
|
113 |
+
|
114 |
+
def write_fasta(sequences, f):
    """
    Write a list of sequences to file in fasta format.

    @param sequences: A list of (name, sequence) tuples.
    @param f: An open, writable file object.

    Each line of a sequence is wrapped to a width of 80 characters.
    """
    for name, sequence in sequences:
        print(">%s" % name, file=f)
        wrapped_blocks = []
        for block in sequence.splitlines():
            wrapped_blocks.append('\n'.join(wrap(block, width=80)))
        print('\n'.join(wrapped_blocks), file=f)
|
125 |
+
|
126 |
+
|
127 |
+
def validate_sequence(sequence):
    """
    Check whether a sequence is a protein sequence or if someone has submitted something nasty.

    @param sequence: The sequence string to check. It is compared case-insensitively.

    @return: True when the sequence passes. An AssertionError is raised when the sequence has
             10000 or more characters or contains a letter outside the known amino acids.
    """
    assert len(sequence) < 10000, "Sequence too long."
    unknown_letters = set(sequence.upper()) - set_amino_acids
    assert not unknown_letters, "Unknown amino acid letter found in sequence: %s" % ", ".join(list(unknown_letters))
    return True
|
134 |
+
|
135 |
+
def validate_numbering(xxx_todo_changeme, name_seq=[]):
    """
    Wrapper to do some basic validation of the numbering.

    Further validation could be done but at the moment we just check that the numbering indices
    are incremental (they should be) and that the numbered residues form a contiguous segment
    of the input sequence.

    @param xxx_todo_changeme: A (numbering, start, end) tuple.
    @param name_seq: The (name, sequence) pair that was numbered.

    @return: The numbering, start and end, unchanged. An AssertionError is raised if a check fails.
    """
    numbering, start, end = xxx_todo_changeme
    name, seq = name_seq

    previous_index = -1
    residues = []
    for (index, _), aa in numbering:
        assert index >= previous_index, "Numbering was found to decrease along the sequence %s. Please report." % name
        previous_index = index
        # Gap characters are not part of the sequence itself.
        residues.append(aa.replace("-", ""))
    numbered_sequence = "".join(residues)

    assert numbered_sequence in seq.replace("-", ""), "The algorithm did not number a contiguous segment for sequence %s. Please report" % name

    return numbering, start, end
|
154 |
+
|
155 |
+
def grouper(n, iterable):
    '''
    Group entries of an iterable by n

    @return: An iterator of lists holding up to n consecutive entries each. Iteration stops
             at the first empty list.
    '''
    source = iter(iterable)
    # The two-argument form of iter keeps calling the function until it returns the sentinel [].
    return iter(lambda: list(islice(source, n)), [])
|
164 |
+
|
165 |
+
def anarci_output(numbered, sequences, alignment_details, outfile, sequence_id=None, domain_id=None):
    """
    Outputs to open file

    If sequence_id is specified as an integer then only this sequence will be printed.
    Otherwise all sequences will be printed.

    If domain_id is specified as an integer then only this domain will be printed.
    Otherwise all domains will be printed.

    If domain_id is specified then sequence_id must also be specified.

    @param numbered: Per sequence, either None or a list of (numbering, start, end) tuples, one per domain.
    @param sequences: List of (name, sequence) tuples in the same order as numbered.
    @param alignment_details: Per sequence, a list of detail dictionaries, one per domain.
    @param outfile: An open, writable file object.

    Note that the "evalue" entry of each printed domain's details is replaced by its string form.
    """
    assert (sequence_id is not None) or (sequence_id is None and domain_id is None), "If domain_id is specified, sequence_id must also be specified."

    def _germline_call(germlines, key):
        """Return (species, gene, identity) for one germline entry, tolerating unassigned genes."""
        assignment, identity = germlines.get(key, [['', 'unknown'], 0])
        if assignment is None:
            # An unassigned gene is stored as [None, None]. It cannot be unpacked as a pair.
            return '', 'unknown', 0
        species, gene = assignment
        if gene is None:
            return species, 'unknown', 0
        return species, gene, identity

    for i in range(len(numbered)):
        if sequence_id is None:
            print("# %s" % sequences[i][0], file=outfile)  # print the name
        if numbered[i] is not None:
            if sequence_id is not None and i != sequence_id:
                continue
            print("# ANARCI numbered", file=outfile)
            for j in range(len(numbered[i])):  # Iterate over domains
                if domain_id is not None and j != domain_id:
                    continue
                details = alignment_details[i][j]
                domain = numbered[i][j]

                print("# Domain %d of %d" % (j + 1, len(numbered[i])), file=outfile)
                print("# Most significant HMM hit", file=outfile)
                print("#|species|chain_type|e-value|score|seqstart_index|seqend_index|", file=outfile)
                # The e-value is stored back as a string (existing behaviour) and printed with %s.
                details["evalue"] = str(details["evalue"])
                print("#|%s|%s|%s|%.1f|%d|%d|" % (details["species"], details["chain_type"], details["evalue"],
                                                  details["bitscore"], domain[1], domain[2]), file=outfile)

                if 'germlines' in details:
                    print('# Most sequence-identical germlines', file=outfile)
                    print('#|species|v_gene|v_identity|j_gene|j_identity|', file=outfile)
                    species, vgene, vid = _germline_call(details['germlines'], 'v_gene')
                    _, jgene, jid = _germline_call(details['germlines'], 'j_gene')
                    print('#|%s|%s|%.2f|%s|%.2f|' % (species, vgene, vid, jgene, jid), file=outfile)

                chain_type = chain_type_to_class[details["chain_type"]]
                print("# Scheme = %s" % details["scheme"], file=outfile)
                if len(domain[0]) == 0:
                    print("# Warning: %s scheme could not be applied to this sequence." % details["scheme"], file=outfile)
                for (index, insertion), aa in domain[0]:
                    print(chain_type, ("%d" % index).ljust(5), insertion, aa, file=outfile)
        print("//", file=outfile)
|
213 |
+
|
214 |
+
def csv_output(sequences, numbered, details, outfileroot):
    '''
    Write numbered sequences to csv files. A csv file is written for each chain type.

    Kappa and Lambda chains are written to the same file

    The sequences will be written aligned to the numbering scheme. Gaps in the sequences with
    respect to the alignment are written as a '-'

    Germline genes that could not be assigned (stored as None) are written as empty fields with
    an identity of 0.00.

    @param sequences: List of name, sequence tuples
    @param numbered: Numbered sequences in the same order as the sequences list.
    @param details: List of alignment details in the same order as the sequences list.
    @param outfileroot: The file path for csv files to write. _<chain_type>.csv will be appended to this.
    '''

    def _germline_columns(detail):
        """Return the identity_species, v_gene, v_identity, j_gene and j_identity columns."""
        germlines = detail.get('germlines', {})
        columns = []
        for key, include_species in (('v_gene', True), ('j_gene', False)):
            assignment, identity = germlines.get(key, [['', ''], 0])
            if assignment is None:
                # An unassigned gene is stored as [None, None]. Report it as blank.
                assignment, identity = ['', ''], 0
            species, gene = assignment[0], assignment[1]
            if include_species:
                columns.append('' if species is None else species)
            columns.append('' if gene is None else gene)
            columns.append('%.2f' % (0 if identity is None else identity))
        return columns

    chain_types = {}
    pos_ranks = {}
    all_pos = {}
    _lc = {'K': 'KL', 'L': 'KL'}

    # Divide the set into chain types and find how to order the numbering for each type.
    for i in range(len(sequences)):  # Iterate over entries
        if numbered[i] is None:
            continue

        for j in range(len(numbered[i])):  # Iterate over domains.
            # Record the chain type index
            c = details[i][j]['chain_type']
            c = _lc.get(c, c)  # Consider lambda and kappa together.
            chain_types.setdefault(c, []).append((i, j))
            if c not in pos_ranks:
                pos_ranks[c] = {}
                all_pos[c] = set()

            # Update the insertion order for the scheme. i.e. is it A B C or C B A (e.g. imgt 111 and 112 respectively)
            last_index = -1
            rank = 0
            for p, _ in numbered[i][j][0]:
                if p[0] != last_index:
                    last_index = p[0]
                    rank = 0
                else:
                    rank += 1
                pos_ranks[c][p] = max(rank, pos_ranks[c].get(p, rank))
                all_pos[c].add(p)

    # Write a new file for each chain type. Kappa and lambda are written together as light chains.
    for cts in ['H', 'KL', 'A', 'B', 'G', 'D']:
        if cts not in chain_types:
            continue
        with open(outfileroot + '_%s.csv' % cts, 'w') as out:

            # Sort the positions by index and insertion order
            positions = sorted(all_pos[cts], key=lambda p: (p[0], pos_ranks[cts][p]))

            # Header line
            fields = ['Id', 'domain_no', 'hmm_species', 'chain_type', 'e-value', 'score', 'seqstart_index', 'seqend_index',
                      'identity_species', 'v_gene', 'v_identity', 'j_gene', 'j_identity']
            fields += [('%d%s' % (p)).strip() for p in positions]
            print(','.join(fields), file=out)

            # Iterate over the domains identified
            for i, j in chain_types[cts]:
                detail = details[i][j]
                line = [sequences[i][0].replace(',', ' '),  # Commas in the name would break the csv columns.
                        str(j),
                        detail.get('species', ''),
                        detail.get('chain_type', ''),
                        str(detail.get('evalue', '')),
                        str(detail.get('bitscore', '')),
                        str(numbered[i][j][1]),
                        str(numbered[i][j][2])]
                line += _germline_columns(detail)

                # Hash the numbering. Insertion order has been preserved in the positions sort.
                d = dict(numbered[i][j][0])
                line += [d.get(p, '-') for p in positions]

                assert len(line) == len(fields)
                print(','.join(line), file=out)
|
296 |
+
|
297 |
+
|
298 |
+
|
299 |
+
## Parsing and recognising domain hits from hmmscan ##
|
300 |
+
def _domains_are_same(dom1, dom2):
|
301 |
+
"""
|
302 |
+
Check to see if the domains are overlapping.
|
303 |
+
@param dom1:
|
304 |
+
@param dom2:
|
305 |
+
|
306 |
+
@return: True or False
|
307 |
+
"""
|
308 |
+
dom1, dom2 = sorted( [dom1, dom2], key=lambda x: x.query_start )
|
309 |
+
if dom2.query_start >= dom1.query_end:
|
310 |
+
return False
|
311 |
+
return True
|
312 |
+
|
313 |
+
|
314 |
+
def _parse_hmmer_query(query, bit_score_threshold=80, hmmer_species=None):
|
315 |
+
"""
|
316 |
+
|
317 |
+
@param query: hmmer query object from Biopython
|
318 |
+
@param bit_score_threshold: the threshold for which to consider a hit a hit.
|
319 |
+
|
320 |
+
The function will identify multiple domains if they have been found and provide the details for the best alignment for each domain.
|
321 |
+
This allows the ability to identify single chain fvs and engineered antibody sequences as well as the capability in the future for identifying constant domains.
|
322 |
+
|
323 |
+
"""
|
324 |
+
hit_table = [ ['id', 'description', 'evalue', 'bitscore', 'bias',
|
325 |
+
'query_start', 'query_end' ] ]
|
326 |
+
|
327 |
+
# Find the best hit for each domain in the sequence.
|
328 |
+
|
329 |
+
top_descriptions, domains,state_vectors = [], [], []
|
330 |
+
|
331 |
+
if query.hsps: # We have some hits
|
332 |
+
# If we have specified a species, check to see we have hits for that species
|
333 |
+
# Otherwise revert back to using any species
|
334 |
+
if hmmer_species:
|
335 |
+
#hit_correct_species = [hsp for hsp in query.hsps if hsp.hit_id.startswith(hmmer_species) and hsp.bitscore >= bit_score_threshold]
|
336 |
+
hit_correct_species = []
|
337 |
+
for hsp in query.hsps:
|
338 |
+
if hsp.bitscore >= bit_score_threshold:
|
339 |
+
for species in hmmer_species:
|
340 |
+
if hsp.hit_id.startswith(species):
|
341 |
+
hit_correct_species.append(hsp)
|
342 |
+
|
343 |
+
if hit_correct_species:
|
344 |
+
hsp_list = hit_correct_species
|
345 |
+
else:
|
346 |
+
print("Limiting hmmer search to species %s was requested but hits did not achieve a high enough bitscore. Reverting to using any species" %(hmmer_species))
|
347 |
+
hsp_list = query.hsps
|
348 |
+
else:
|
349 |
+
hsp_list = query.hsps
|
350 |
+
|
351 |
+
for hsp in sorted(hsp_list, key=lambda x: x.evalue): # Iterate over the matches of the domains in order of their e-value (most significant first)
|
352 |
+
new=True
|
353 |
+
if hsp.bitscore >= bit_score_threshold: # Only look at those with hits that are over the threshold bit-score.
|
354 |
+
for i in range( len(domains) ): # Check to see if we already have seen the domain
|
355 |
+
if _domains_are_same( domains[i], hsp ):
|
356 |
+
new = False
|
357 |
+
break
|
358 |
+
hit_table.append( [ hsp.hit_id, hsp.hit_description, hsp.evalue, hsp.bitscore, hsp.bias, hsp.query_start, hsp.query_end] )
|
359 |
+
if new: # It is a new domain and this is the best hit. Add it for further processing.
|
360 |
+
domains.append( hsp )
|
361 |
+
top_descriptions.append( dict( list(zip(hit_table[0], hit_table[-1])) ) ) # Add the last added to the descriptions list.
|
362 |
+
|
363 |
+
# Reorder the domains according to the order they appear in the sequence.
|
364 |
+
ordering = sorted( list(range(len(domains))), key=lambda x: domains[x].query_start)
|
365 |
+
domains = [ domains[_] for _ in ordering ]
|
366 |
+
top_descriptions = [ top_descriptions[_] for _ in ordering ]
|
367 |
+
|
368 |
+
ndomains = len( domains )
|
369 |
+
for i in range(ndomains): # If any significant hits were identified parse and align them to the reference state.
|
370 |
+
domains[i].order = i
|
371 |
+
species, chain = top_descriptions[i]["id"].split("_")
|
372 |
+
state_vectors.append( _hmm_alignment_to_states(domains[i], ndomains, query.seq_len) ) # Alignment to the reference states.
|
373 |
+
top_descriptions[i][ "species"] = species # Reparse
|
374 |
+
top_descriptions[i][ "chain_type"] = chain
|
375 |
+
top_descriptions[i][ "query_start"] = state_vectors[-1][0][-1] # Make sure the query_start agree if it was changed
|
376 |
+
|
377 |
+
return hit_table, state_vectors, top_descriptions
|
378 |
+
|
379 |
+
|
380 |
+
def _hmm_alignment_to_states(hsp, n, seq_length):
    """
    Take a hit hsp and turn the alignment into a state vector with sequence indices

    @param hsp: The hit. It must carry "RF" and "PP" alignment annotations and an order attribute.
    @param n: The number of domains identified in the query.
    @param seq_length: The length of the query sequence.

    @return: A list of ((hmm_state, state_type), sequence_index) entries. state_type is "m" (match),
             "i" (insert) or "d" (delete). Delete states have a sequence_index of None.
    """
    # Extract the strings for the reference states and the posterior probability strings
    reference_string = hsp.aln_annotation["RF"]
    state_string = hsp.aln_annotation["PP"]

    assert len(reference_string) == len(state_string), "Aligned reference and state strings had different lengths. Don't know how to handle"

    # Extract the start and end points of the hmm states and the sequence
    # These are python indices i.e list[ start:end ] and therefore start will be one less than in the text file
    hmm_start, hmm_end = hsp.hit_start, hsp.hit_end
    seq_start, seq_end = hsp.query_start, hsp.query_end

    # Extract the full length of the HMM hit
    species, ctype = hsp.hit_id.split('_')
    hmm_length = get_hmm_length(species, ctype)

    # Handle cases where there are n terminal modifications.
    # In most cases the user is going to want these included in the numbered domain even though they are not 'antibody like' and
    # not matched to the germline. Only allow up to a maximum of 5 unmatched states at the start of the domain
    # Adds a bug here if there is a very short linker between a scfv domains with a modified n-term second domain
    # Thus this is only done for the first identified domain ( hence order attribute on hsp )
    if hsp.order == 0 and hmm_start and hmm_start < 5:
        padding = hmm_start
        if hmm_start > seq_start:
            padding = min(seq_start, hmm_start - seq_start)
        state_string = '8' * padding + state_string
        reference_string = 'x' * padding + reference_string
        seq_start -= padding
        hmm_start -= padding

    # Handle cases where the alignment should be extended to the end of the j-element
    # This occurs when there a c-terminal modifications of the variable domain that are significantly different to germline
    # Extension is only made when half of framework 4 has been recognised and there is only one domain recognised.
    if n == 1 and seq_end < seq_length and (123 < hmm_end < hmm_length):  # Extend forwards
        padding = min(hmm_length - hmm_end, seq_length - seq_end)
        state_string = state_string + '8' * padding
        reference_string = reference_string + 'x' * padding
        seq_end += padding
        hmm_end += padding

    # Generate lists for the states and the sequence indices that are included in this alignment
    hmm_states = all_reference_states[hmm_start:hmm_end]
    sequence_indices = list(range(seq_start, seq_end))
    hmm_pos = seq_pos = 0  # initialise the current index in the hmm and the sequence

    state_vector = []
    # The strings have equal length (asserted above), so they can be walked in step.
    for reference_char, state_char in zip(reference_string, state_string):
        state_type = "m" if reference_char == "x" else "i"  # match state or insert state

        if state_char == ".":  # overloading if deleted relative to reference. delete_state
            state_type = "d"
            sequence_index = None
        else:
            sequence_index = sequence_indices[seq_pos]

        # Store the alignment as the state identifier (uncorrected IMGT annotation) and the index of the sequence
        state_vector.append(((hmm_states[hmm_pos], state_type), sequence_index))

        # Updates to the indices
        if state_type == "m":
            hmm_pos += 1
            seq_pos += 1
        elif state_type == "i":
            seq_pos += 1
        else:  # delete state
            hmm_pos += 1

    return state_vector
|
461 |
+
|
462 |
+
|
463 |
+
def parse_hmmer_output(filedescriptor="", bit_score_threshold=80, hmmer_species=None):
    """
    Parse the output of HMMscan and return top alignment and the score table for each input sequence.

    @param filedescriptor: Either the path of the hmmscan output file (str, bytes or path-like) or an
                           open integer file descriptor for it. The descriptor is closed after parsing.
    @param bit_score_threshold: Passed on to _parse_hmmer_query.
    @param hmmer_species: Passed on to _parse_hmmer_query.

    @return: A list with the result of _parse_hmmer_query for each query in the file.

    A TypeError is raised when filedescriptor is neither a path nor an integer.
    """
    if isinstance(filedescriptor, (str, bytes, os.PathLike)):
        openfile = open
    elif isinstance(filedescriptor, int) and not isinstance(filedescriptor, bool):
        openfile = os.fdopen
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise TypeError("filedescriptor must be a file path or an integer file descriptor, not %s"
                        % type(filedescriptor).__name__)

    results = []
    with openfile(filedescriptor) as inputfile:
        for query in HMMERParser(inputfile):
            results.append(_parse_hmmer_query(query, bit_score_threshold=bit_score_threshold, hmmer_species=hmmer_species))

    return results
|
479 |
+
|
480 |
+
|
481 |
+
def run_hmmer(sequence_list, hmm_database="ALL", hmmerpath="", ncpu=None, bit_score_threshold=80, hmmer_species=None):
    """
    Run the sequences in sequence list against a precompiled hmm_database.

    Those sequence that have a significant hit with a bit score over a threshold will
    be recognised and an alignment given. The alignment will be used to number the
    sequence.

    @param sequence_list: a list of (name, sequence) tuples. Both are strings
    @param hmm_database: The hmm database to use. Currently, all hmms are in the ALL database.
                         The code to develop new models is in build_pipeline in the git repo.
    @param hmmerpath: The path to hmmer binaries if not in the path
    @param ncpu: The number of cpu's to allow hmmer to use.
    @param bit_score_threshold: Passed on to parse_hmmer_output.
    @param hmmer_species: Passed on to parse_hmmer_output.

    @return: The list returned by parse_hmmer_output for the hmmscan output.

    An HMMscanError carrying the decoded stderr text is raised if hmmscan writes to stderr.
    """

    # Check that hmm_database is available
    assert hmm_database in ["ALL"], "Unknown HMM database %s" % hmm_database
    HMM = os.path.join(HMM_path, "%s.hmm" % hmm_database)

    hmmscan = os.path.join(hmmerpath, "hmmscan") if hmmerpath else "hmmscan"

    fasta_filename = output_filename = None
    try:
        # Create a fasta file for all the sequences. Label them with their sequence index
        # This will go to a temp file
        fasta_filehandle, fasta_filename = tempfile.mkstemp(".fasta", text=True)
        with os.fdopen(fasta_filehandle, 'w') as outfile:
            write_fasta(sequence_list, outfile)

        # hmmscan writes to the output file by name, so the descriptor returned by mkstemp is
        # released straight away. This keeps it from leaking if starting hmmscan fails.
        output_filehandle, output_filename = tempfile.mkstemp(".txt", text=True)
        os.close(output_filehandle)

        # Run hmmer as a subprocess
        command = [hmmscan, "-o", output_filename]
        if ncpu is not None:
            command += ["--cpu", str(ncpu)]
        command += [HMM, fasta_filename]
        process = Popen(command, stdout=PIPE, stderr=PIPE)
        _, pr_stderr = process.communicate()

        if pr_stderr:
            # stderr arrives as bytes. Decode it so the exception message is readable.
            if isinstance(pr_stderr, bytes):
                pr_stderr = pr_stderr.decode(errors="replace")
            raise HMMscanError(pr_stderr)

        results = parse_hmmer_output(output_filename, bit_score_threshold=bit_score_threshold, hmmer_species=hmmer_species)

    finally:
        # clear up whichever temp files were created
        for temp_filename in (fasta_filename, output_filename):
            if temp_filename is not None:
                os.remove(temp_filename)

    return results
|
536 |
+
|
537 |
+
def get_hmm_length(species, ctype):
    '''
    Get the length of an hmm given a species and chain type.
    This tells us how many non-insertion positions there could possibly be in a domain (127 or 128 positions under imgt)

    The length is read from the first J germline sequence recorded for the chain type and species,
    with trailing '-' characters removed. 128 is returned when no such J germlines are recorded.
    '''
    try:
        j_germlines = all_germlines['J'][ctype][species]
    except KeyError:
        return 128
    first_sequence = list(j_germlines.values())[0]
    return len(first_sequence.rstrip('-'))
|
546 |
+
|
547 |
+
|
548 |
+
def number_sequence_from_alignment(state_vector, sequence, scheme="imgt", chain_type=None):
    """
    Given you have an alignment. Give back the numbering

    @param state_vector: List of states from the hmm. Effectively these are imgt columns but CDR3 has not been redone.
    @param sequence: The original sequence string or list.
    @param scheme: The numbering scheme to apply
    @param chain_type: The type of chain to apply numbering for. Some schemes do not require this (IMGT). Others (e.g. Chothia/Wolfguy) do.

    @return: A list of numbering identifier / amino acids tuples over the domain that has been numbered. The indices of the start (inclusive) and end point (exclusive) in the sequence for the numbering

    An AssertionError is raised when the scheme is unknown or is not implemented for the chain type.
    """
    scheme = scheme.lower()
    is_heavy = chain_type == "H"
    # Compare against the exact chain codes. A substring test against "KL" would fail with a
    # TypeError for None and would wrongly accept "" and "KL".
    is_light = chain_type in ("K", "L")

    if scheme == "imgt":
        return number_imgt(state_vector, sequence)
    if scheme == "aho":
        # requires the chain type to heuristically put the CDR1 gap in position.
        return number_aho(state_vector, sequence, chain_type)

    if scheme == "chothia":
        if is_heavy:
            return number_chothia_heavy(state_vector, sequence)
        if is_light:
            return number_chothia_light(state_vector, sequence)
    elif scheme == "kabat":
        if is_heavy:
            return number_kabat_heavy(state_vector, sequence)
        if is_light:
            return number_kabat_light(state_vector, sequence)
    elif scheme == "martin":
        if is_heavy:
            return number_martin_heavy(state_vector, sequence)
        if is_light:
            return number_martin_light(state_vector, sequence)
    elif scheme == "wolfguy":
        if is_heavy:
            return number_wolfguy_heavy(state_vector, sequence)
        if is_light:
            return number_wolfguy_light(state_vector, sequence)

    raise AssertionError("Unimplemented numbering scheme %s for chain %s" % (scheme, chain_type))
|
594 |
+
|
595 |
+
def number_sequences_from_alignment(sequences, alignments, scheme="imgt", allow=set(["H","K","L","A","B","G","D"]),
                                    assign_germline=False, allowed_species=None):
    '''
    Given a list of sequences and a corresponding list of alignments from run_hmmer apply a numbering scheme.

    @param sequences: List of (name, sequence) tuples.
    @param alignments: List of (hit_table, state_vectors, details) entries in the same order as sequences.
    @param scheme: The numbering scheme to apply.
    @param allow: The chain types that should be numbered.
    @param assign_germline: Whether to also run the germline assignment for each numbered domain.
    @param allowed_species: Passed on to run_germline_assignment.

    @return: A (numbered, alignment_details, hit_tables) tuple of lists in the same order as sequences.
             The numbered and alignment_details entries are None for sequences with no numbered domain.
    '''
    numbered, alignment_details, hit_tables = [], [], []

    # Iteration over the sequence alignments performing the desired numbering
    for i, entry in enumerate(sequences):

        # Unpack. We may have multiple domains per sequence (e.g. single chain fvs).
        hit_table, state_vectors, detailss = alignments[i]

        # Iterate over all the domains in the sequence that have been recognised (typically only 1 with the current hmms available)
        hit_numbered, hit_details = [], []
        for di, state_vector in enumerate(state_vectors):
            details = detailss[di]
            details["scheme"] = scheme
            details["query_name"] = entry[0]

            # Only number things that are allowed. We still keep the alignment details and hit_table
            if not (state_vector and details["chain_type"] in allow):
                continue
            try:
                # Do the numbering and validate (for development purposes)
                domain_numbering = number_sequence_from_alignment(state_vector, entry[1], scheme=scheme,
                                                                  chain_type=details["chain_type"])
                hit_numbered.append(validate_numbering(domain_numbering, entry))
                if assign_germline:
                    details["germlines"] = run_germline_assignment(state_vector, entry[1], details["chain_type"],
                                                                   allowed_species=allowed_species)
                hit_details.append(details)
            except AssertionError as e:  # Handle errors. Those I have implemented should be assertion.
                print(str(e), file=sys.stderr)
                raise  # Validation went wrong. Error message will go to stderr. Want this to be fatal during development.
            except Exception as e:
                print("Error: Something really went wrong that has not been handled", file=sys.stderr)
                print(str(e), file=sys.stderr)
                raise

        numbered.append(hit_numbered if hit_numbered else None)
        alignment_details.append(hit_details if hit_numbered else None)
        hit_tables.append(hit_table)

    return numbered, alignment_details, hit_tables
|
645 |
+
|
646 |
+
def get_identity(state_sequence, germline_sequence):
    """
    Get the partially matched sequence identity between two aligned sequences.
    Partial in the sense that gaps can be in the state_sequence.

    Positions where the germline sequence has a gap ('-') are ignored.

    @return: The fraction of compared positions at which the upper-cased state residue equals the
             germline residue, or 0 when there is no position to compare.
    """
    # Ensure that the sequences are the expected length
    assert len(state_sequence) == len(germline_sequence) == 128

    compared = [(state, germline)
                for state, germline in zip(state_sequence, germline_sequence)
                if germline != "-"]
    if not compared:
        return 0

    matches = sum(1 for state, germline in compared if state.upper() == germline)
    return float(matches) / len(compared)
|
662 |
+
|
663 |
+
|
664 |
+
def run_germline_assignment(state_vector, sequence, chain_type, allowed_species=None):
    """
    Find the closest sequence identity match.

    @param state_vector: List of ((state, state_type), sequence_index) entries for the domain.
    @param sequence: The sequence that the state vector indexes into.
    @param chain_type: The chain type whose germlines should be compared against.
    @param allowed_species: Optional collection of species to limit the comparison to. All known
                            species are used when it is None.

    @return: A dictionary with 'v_gene' and 'j_gene' entries, each a [(species, gene), identity] list,
             or [None, None] when no gene was assigned. An empty dictionary is returned when an allowed
             species has no V germlines for the chain type or when there is no V germline to compare against.
    """
    genes = {'v_gene': [None, None],
             'j_gene': [None, None],
             }

    # Extract the positions that correspond to match (germline) states.
    state_dict = dict(((i, 'm'), None) for i in range(1, 129))
    state_dict.update(dict(state_vector))
    state_sequence = "".join([sequence[state_dict[(i, 'm')]] if state_dict[(i, 'm')] is not None else "-"
                              for i in range(1, 129)])

    def _best_match(candidates):
        """Return [(species, gene), identity] for the candidate germline with the highest identity."""
        seq_ids = {key: get_identity(state_sequence, germline_sequence)
                   for key, germline_sequence in candidates}
        if not seq_ids:
            return None
        best = max(seq_ids, key=lambda x: seq_ids[x])
        return [best, seq_ids[best]]

    # Iterate over the v-germline sequences of the chain type of interest.
    # The maximum sequence identity is used to assign the germline
    if chain_type not in all_germlines["V"]:
        return genes

    v_germlines = all_germlines["V"][chain_type]
    if allowed_species is not None:
        if not all([sp in v_germlines for sp in allowed_species]):  # Made non-fatal
            return {}
    else:
        allowed_species = all_species

    v_match = _best_match(((species, gene), germline_sequence)
                          for species in allowed_species
                          if species in v_germlines  # Previously bug.
                          for gene, germline_sequence in v_germlines[species].items())
    if v_match is None:
        # No germline to compare against (e.g. an empty allowed_species). Non-fatal, like the species check above.
        return {}
    genes['v_gene'] = v_match

    # Use the assigned species for the v-gene for the j-gene.
    # This assumption may affect exotically engineered abs but in general is fair.
    species = genes['v_gene'][0][0]
    if chain_type in all_germlines["J"] and species in all_germlines["J"][chain_type]:
        j_match = _best_match(((species, gene), germline_sequence)
                              for gene, germline_sequence in all_germlines["J"][chain_type][species].items())
        if j_match is not None:
            genes['j_gene'] = j_match

    return genes
|
706 |
+
|
707 |
+
def check_for_j(sequences, alignments, scheme):
    '''
    As the length of CDR3 gets long (over 30ish) an alignment that does not include the J region becomes more favourable.
    This leads to really long CDR3s not being numberable.

    To overcome this problem, when no J region is detected we try without the v region.

    @param sequences: List of (name, sequence) tuples.
    @param alignments: List of (hit_table, state_vectors, details) entries in the same order as sequences.
                       The state vector and the 'query_end' detail of an entry are updated in place when
                       a J region is recovered.
    @param scheme: Not used by this function.
    '''
    for i in range(len(sequences)):
        # Check the alignment for J region
        if len(alignments[i][1]) != 1:  # Only do for single domain chains.
            continue

        # Check whether a J region has been identified. If not check whether there is still a considerable amount of sequence
        # remaining.
        ali = alignments[i][1][0]
        if not ali:
            continue

        # Find the last match position. Deletion states carry no sequence index (None), so take
        # the last state that does have one.
        last_state = ali[-1][0][0]
        aligned_indices = [si for _, si in ali if si is not None]
        if last_state >= 120 or not aligned_indices:  # J region present, or nothing aligned
            continue
        last_si = aligned_indices[-1]
        if not (last_si + 30 < len(sequences[i][1])):
            continue  # Not enough sequence left to suspect a long CDR3

        # Find the position of the conserved cysteine (imgt 104).
        cys_si = dict(ali).get((104, 'm'), None)
        if cys_si is None:
            continue

        # Find the corresponding index in the alignment.
        cys_ai = ali.index(((104, 'm'), cys_si))

        # Try to identify a J region in the remaining sequence after the 104. A low bit score threshold is used.
        _, re_states, re_details = run_hmmer([(sequences[i][0], sequences[i][1][cys_si + 1:])],
                                             bit_score_threshold=10)[0]

        # Check if a J region was detected in the remaining sequence.
        if not (re_states and re_states[0][-1][0][0] >= 126 and re_states[0][0][0][0] <= 117):
            continue

        # Sandwich the presumed CDR3 region between the V and J regions.
        vRegion = ali[:cys_ai + 1]
        # Shift the J indices back into the coordinates of the full sequence. Deletion states keep None.
        jRegion = [(state, index + cys_si + 1 if index is not None else None)
                   for state, index in re_states[0] if state[0] >= 117]
        j_indices = [index for _, index in jRegion if index is not None]
        if not j_indices:
            continue  # The J hit consists of deletions only, so there is nothing to anchor on.

        cdrRegion = []
        next_state = 105
        for si in range(cys_si + 1, j_indices[0]):
            if next_state >= 116:
                # Residues beyond state 115 are recorded as insertions at 116.
                cdrRegion.append(((116, 'i'), si))
            else:
                cdrRegion.append(((next_state, 'm'), si))
                next_state += 1

        # Update the alignment entry.
        alignments[i][1][0] = vRegion + cdrRegion + jRegion
        alignments[i][2][0]['query_end'] = j_indices[-1] + 1
|
757 |
+
|
758 |
+
|
759 |
+
|
760 |
+
##################################
|
761 |
+
# High level numbering functions #
|
762 |
+
##################################
|
763 |
+
|
764 |
+
# Main function for ANARCI
|
765 |
+
# Name conflict with function, module and package is kept for legacy unless issues are reported in future.
|
766 |
+
def anarci(sequences, scheme="imgt", database="ALL", output=False, outfile=None, csv=False, allow=set(["H","K","L","A","B","G","D"]),
           hmmerpath="", ncpu=None, assign_germline=False, allowed_species=None, bit_score_threshold=80):
    """
    The main function for anarci. Identify antibody and TCR domains, number them and annotate their germline and species.

    It is advised to use one of the wrapper functions:
        o run_anarci   - fasta file or sequence list in. Automated multiprocessing for large jobs. Sequences, numbering, details
                         and hit tables out.
        o number       - single sequence in, numbering out

    @param sequences: A list or tuple of (Id, Sequence) pairs
                      e.g. [ ("seq1","EVQLQQSGAEVVRSG ..."),
                             ("seq2","DIVMTQSQKFMSTSV ...") ]
    @param scheme:    The numbering scheme that should be applied. Choose from imgt, chothia, kabat or martin
    @param output:    Boolean flag to say whether the result should be output.
    @param outfile:   The name of the file to output to. If output is True and outfile is None then output is printed
                      to stdout.
    @param csv:       Boolean flag to say whether the csv output alignment format or the vertical anarci format should be used.
                      Requires outfile.
    @param allow:     A set containing the chain types that should be recognised. If chothia, kabat or martin is used
                      as the scheme, anarci will ignore tcr chains. Choose a subset of ["H","K","L","A","B","G","D"]
    @param assign_germline: Using highest sequence identity assign the germline to the chain. Can be more accurate at identifying
                      species than the best HMM hit alone. (Bool)
    @param allowed_species: If assign_germline is true, limit the species that can be assigned to a limited set. Useful when the
                      animal species is known or when performing closest germline experiments. Choose a subset of ['human',
                      'mouse','rat','rabbit','rhesus','pig','alpaca'].
    @param bit_score_threshold: The threshold score from HMMER at which an alignment should be numbered. Lowering the threshold
                      means domain recognition is more permissive and can be useful for numbering heavily engineered molecules.
                      However, too low and false positive recognition of other ig-like molecules will occur.
    @param hmmerpath: The path to hmmscan. If left unspecified then the PATH will be searched.
    @param ncpu:      The number of cpu's that hmmer should be allowed to use. If not specified then the hmmscan
                      default is used. N.B. hmmscan must be compiled with multithreading enabled for this option to have effect.
                      Please consider using the run_anarci function for native multiprocessing with anarci.
    @param database:  The HMMER database that should be used. Normally not changed unless a custom db is created.

    @return: Three lists. Numbered, Alignment_details and Hit_tables.
             Each list is in the same order as the input sequences list.
             A description of each entry in the three lists is as follows.
               o Numbered: will be None if no domain was found for that sequence or a list of domains with their
                           numbering, start and finish indices.
               o Alignment_details: will be None if no domain was found for that sequence or a dictionary for each
                           domain identified containing the details of the alignment (chain type, e-value, species etc).
               o Hit_tables: None if no domain was found for that sequence or a nested list for each domain containing
                           the hit table from hmmscan.

    @raise AssertionError: If the scheme is not recognised, or if csv is requested without a usable outfile.
    """

    # Validate the input scheme. AssertionError is kept as the error type because callers (e.g. number) catch it.
    try:
        scheme = scheme_short_to_long[scheme.lower()]
    except KeyError:
        raise AssertionError("Unrecognised or unimplemented scheme: %s"%scheme)

    # Check we have arguments for output before doing work.
    # Raised explicitly (rather than with `assert`) so the checks still run under `python -O`.
    if csv:
        if not outfile:
            raise AssertionError('If csv output is True then an outfile must be specified')
        _path, _ = os.path.split(outfile)
        if _path and not os.path.exists(_path):
            raise AssertionError('Output directory %s does not exist'%_path)

    # Perform the alignments of the sequences to the hmm database
    alignments = run_hmmer(sequences, hmm_database=database, hmmerpath=hmmerpath, ncpu=ncpu,
                           bit_score_threshold=bit_score_threshold, hmmer_species=allowed_species)

    # Check the numbering for likely very long CDR3s that will have been missed by the first pass.
    # Modifies alignments in-place.
    check_for_j(sequences, alignments, scheme)

    # Apply the desired numbering scheme to all sequences
    numbered, alignment_details, hit_tables = number_sequences_from_alignment(sequences, alignments, scheme=scheme, allow=allow,
                                                                              assign_germline=assign_germline,
                                                                              allowed_species=allowed_species)

    # Output if necessary
    if output:
        if csv:
            # Previously passed the undefined name `details`, which raised NameError on every csv run.
            csv_output(sequences, numbered, alignment_details, outfile)
        elif outfile:
            # The context manager closes the file even if writing fails part-way.
            with open(outfile, 'w') as outto:
                anarci_output(numbered, sequences, alignment_details, outto)
        else:
            anarci_output(numbered, sequences, alignment_details, sys.stdout)

    return numbered, alignment_details, hit_tables
|
855 |
+
|
856 |
+
# Wrapper to run anarci using multiple processes and automate fasta file reading.
|
857 |
+
def run_anarci( seq, ncpu=1, **kwargs):
    '''
    Run the anarci numbering protocol for single or multiple sequences.

    @param seq:       The input to number. One of:
                        o a list or tuple of (Id, Sequence) pairs
                          e.g. [ ("seq1","EVQLQQSGAEVVRSG ..."),
                                 ("seq2","DIVMTQSQKFMSTSV ...") ]
                        o the path of an existing fasta file
                        o a single amino acid sequence string (always processed with one worker)
    @param ncpu:      The number of worker processes to split the sequences over. Values below 1 (or None) are
                      treated as 1. The hmmscan ncpu option passed on to anarci is always forced to 1.

    The remaining keyword arguments are forwarded to the anarci function:

    @param scheme:    The numbering scheme that should be applied. Choose from imgt, chothia, kabat or martin
    @param output:    Boolean flag to say whether the result should be output. The compiled result is written here,
                      not by the individual anarci calls.
    @param outfile:   The name of the file to output to. If output is True and outfile is None then output is printed
                      to stdout.
    @param csv:       Boolean flag to say whether the csv output format should be used. Requires outfile.
    @param allow:     A set containing the chain types that should be recognised. If chothia, kabat or martin is used
                      as the scheme, anarci will ignore tcr chains. Choose a subset of ["H","K","L","A","B","G","D"]
    @param assign_germline: Using highest sequence identity assign the germline to the chain. Can be more accurate at identifying
                      species than the best HMM hit alone. (Bool)
    @param allowed_species: If assign_germline is true, limit the species that can be assigned to a limited set. Useful when the
                      animal species is known or when performing closest germline experiments. Choose a subset of ['human',
                      'mouse','rat','rabbit','rhesus','pig','alpaca'].
    @param bit_score_threshold: The threshold score from HMMER at which an alignment should be numbered. Lowering the threshold
                      means domain recognition is more permissive and can be useful for numbering heavily engineered molecules.
                      However, too low and false positive recognition of other ig-like molecules will occur.
    @param hmmerpath: The path to hmmscan. If left unspecified then the PATH will be searched.
    @param database:  The HMMER database that should be used. Normally not changed unless a custom db is created.

    @return: Four lists. Sequences, Numbered, Alignment_details and Hit_tables.
             Each list is in the same order.
             A description of each entry in the four lists is as follows.
               o Sequences: The list of sequences formatted as [(Id,sequence), ...].
               o Numbered: will be None if no domain was found for that sequence or a list of domains with their
                           numbering, start and finish indices.
               o Alignment_details: will be None if no domain was found for that sequence or a dictionary for each
                           domain identified containing the details of the alignment (chain type, e-value, species etc).
               o Hit_tables: None if no domain was found for that sequence or a nested list for each domain containing
                           the hit table from hmmscan.

    @raise AssertionError: If a list/tuple entry is not a pair, or csv is requested without a usable outfile.
    @raise TypeError: If seq is not a list, tuple, existing file path or string.
    '''
    # Parse the input sequence or fasta file.
    if isinstance(seq, (list, tuple)): # A list (or tuple) of (name,sequence) sequences
        if not all( len(_) == 2 for _ in seq ):
            raise AssertionError("If list or tuple supplied as input format must be [ ('ID1','seq1'), ('ID2', 'seq2'), ... ]")
        sequences = seq
    elif os.path.isfile( seq ): # Fasta file.
        # Read the sequences. All are read into memory currently...
        sequences = read_fasta( seq )
    elif isinstance(seq, str): # Single sequence
        validate_sequence( seq )
        ncpu = 1
        sequences = [ ["Input sequence", seq ]]
    else:
        # Previously fell through and failed later with an unbound `sequences`.
        raise TypeError("seq must be a list or tuple of (Id, Sequence) pairs, a fasta file path or a sequence string")

    # Sanitise the worker count for every input kind (formerly only done for fasta input), so that
    # ncpu=0 or ncpu=None cannot break the chunk size calculation below.
    ncpu = 1 if ncpu is None else int(max(1, ncpu))

    # Handle the arguments to anarci.
    output = kwargs.get('output', False )
    outfile = kwargs.get('outfile', False )
    csv = kwargs.get( 'csv', False )
    if csv: # Check output arguments before doing work.
        if not outfile:
            raise AssertionError('If csv output is True then an outfile must be specified')
        _path, _ = os.path.split(outfile)
        if _path and not os.path.exists(_path):
            raise AssertionError('Output directory %s does not exist'%_path)

    kwargs['ncpu'] = 1 # Set hmmscan ncpu to 1. HMMER has to be compiled appropriately for this to have an effect.
    kwargs['output'] = False # Override and write the compiled results here.

    anarci_partial = partial( anarci, **kwargs )
    chunksize = math.ceil( float( len(sequences) )/ncpu )

    # Run the anarci function using a pool of workers. Using the map_async to get over the KeyboardInterrupt bug in python2.7
    if ncpu > 1:
        pool = Pool( ncpu )
        try:
            results = pool.map_async( anarci_partial, grouper( chunksize, sequences ) ).get()
        finally:
            # Always release the workers, including when a chunk raises.
            pool.close()
    else:
        results = list(map( anarci_partial, grouper( chunksize, sequences ) ))

    # Reformat the per-chunk results to flat lists (one pass each, preserving input order).
    numbered = [ entry for chunk in results for entry in chunk[0] ]
    alignment_details = [ entry for chunk in results for entry in chunk[1] ]
    hit_tables = [ entry for chunk in results for entry in chunk[2] ]

    # Output if necessary
    if output:
        if csv:
            csv_output(sequences, numbered, alignment_details, outfile)
        elif outfile:
            # The context manager closes the file even if writing fails part-way.
            with open(outfile, 'w') as outto:
                anarci_output(numbered, sequences, alignment_details, outto)
        else:
            anarci_output(numbered, sequences, alignment_details, sys.stdout)

    # Return the results
    return sequences, numbered, alignment_details, hit_tables
|
952 |
+
|
953 |
+
|
954 |
+
|
955 |
+
# Wrapper function for simple sequence in numbering and chain type out behaviour.
|
956 |
+
def number(sequence, scheme="imgt", database="ALL", allow=set(["H","K","L","A","B","G","D"])):
    """
    Given a sequence string, use anarci to number it using the scheme of choice.
    Only the first domain will be recognised and numbered

    For multiple sequences it is advised to use run_anarci instead of iterative use of this function.

    @param sequence: An amino acid sequence string
    @param scheme: The numbering scheme that should be applied. Choose from imgt, chothia, kabat or martin
    @param database: The HMMER database that should be used. Normally not changed unless a custom db is created.
    @param allow: A set containing the chain types that should be recognised. If chothia, kabat or martin is used
                  as the scheme, anarci will ignore tcr chains.

    @return: If the sequence can be numbered, a list containing the numbering and sequence; and the chain type.
             Otherwise both are False. (False, False) is also returned for sequences shorter than 70 residues
             and when anarci raises an AssertionError.

    @raise AssertionError: If the scheme is not recognised.
    """

    try:
        validate_sequence( sequence )
        scheme = scheme_short_to_long[scheme.lower()]
    except KeyError:
        # Message corrected from "Unrecognised to unimplemented" so it matches the one raised by anarci.
        raise AssertionError("Unrecognised or unimplemented scheme: %s"%scheme)

    if len(sequence) < 70: # Length check. ANARCI can number fragments of chains well. Encourage full domain numbering.
        return False, False

    try:
        numbered, alignment_details, _ = anarci( [("sequence_0", sequence)], scheme=scheme, database=database, output=False, allow=allow )
    except AssertionError: # Catch where the user has tried to number a TCR with an antibody scheme
        return False, False

    # numbered[0] is falsy when no domain was found for the single input sequence.
    if not numbered[0]:
        return False, False

    # We return the numbering list and the chain type where kappa and lambda chains are both "L" for light
    return numbered[0][0][0], chain_type_to_class[alignment_details[0][0]["chain_type"]]
|
994 |
+
|
995 |
+
if __name__ == "__main__":
    # Test and example usage of the anarci function.
    # Four inputs: a heavy chain, a light chain, a single-chain Fv holding two domains, and a non-antibody protein.
    sequences = [ ("12e8:H","EVQLQQSGAEVVRSGASVKLSCTASGFNIKDYYIHWVKQRPEKGLEWIGWIDPEIGDTEYVPKFQGKATMTADTSSNTAYLQLSSLTSEDTAVYYCNAGHDYDRGRFPYWGQGTLVTVSAAKTTPPSVYPLAP"),
                  ("12e8:L","DIVMTQSQKFMSTSVGDRVSITCKASQNVGTAVAWYQQKPGQSPKLMIYSASNRYTGVPDRFTGSGSGTDFTLTISNMQSEDLADYFCQQYSSYPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASV"),
                  ("scfv:A","DIQMTQSPSSLSASVGDRVTITCRTSGNIHNYLTWYQQKPGKAPQLLIYNAKTLADGVPSRFSGSGSGTQFTLTISSLQPEDFANYYCQHFWSLPFTFGQGTKVEIKRTGGGGSGGGGSGGGGSGGGGSEVQLVESGGGLVQPGGSLRLSCAASGFDFSRYDMSWVRQAPGKRLEWVAYISSGGGSTYFPDTVKGRFTISRDNAKNTLYLQMNSLRAEDTAVYYCARQNKKLTWFDYWGQGTLVTVSSHHHHHH"),
                  ("lysozyme:A","KVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL")]

    results = anarci(sequences, scheme="imgt", output=True)
    numbering, alignment_details, hit_tables = results

    # One entry per input sequence, in input order.
    expect_one_VH_domain_numbering, expect_one_VL_domain_numbering, expect_VH_then_VL_numbering, expect_None = numbering
    assert len(expect_one_VH_domain_numbering) == 1
    assert len(expect_one_VL_domain_numbering) == 1
    assert len(expect_VH_then_VL_numbering) == 2
    # Identity check: the entry is None when no domain is found.
    assert expect_None is None
|
1010 |
+
|
1011 |
+
|
1012 |
+
|
1013 |
+
|
anarci/dat/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
anarci/dat/HMMs/ALL.hmm
ADDED
The diff for this file is too large to render.
See raw diff
|
|
anarci/dat/HMMs/ALL.hmm.h3f
ADDED
Binary file (449 kB). View file
|
|
anarci/dat/HMMs/ALL.hmm.h3i
ADDED
Binary file (1.12 kB). View file
|
|
anarci/dat/HMMs/ALL.hmm.h3m
ADDED
Binary file (729 kB). View file
|
|
anarci/dat/HMMs/ALL.hmm.h3p
ADDED
Binary file (843 kB). View file
|
|
anarci/germlines.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
anarci/schemes.py
ADDED
@@ -0,0 +1,1691 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ANARCI - Antibody Numbering and Antigen Receptor ClassIfication
|
2 |
+
# Copyright (C) 2016 Oxford Protein Informatics Group (OPIG)
|
3 |
+
#
|
4 |
+
# This program is free software: you can redistribute it and/or modify
|
5 |
+
# it under the terms of the GNU General Public License as published by
|
6 |
+
# the Free Software Foundation, either version 3 of the License, or
|
7 |
+
# (at your option) any later version.
|
8 |
+
#
|
9 |
+
# This program is distributed in the hope that it will be useful,
|
10 |
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
11 |
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
12 |
+
# GNU General Public License for more details.#
|
13 |
+
#
|
14 |
+
# You should have received a copy of the GNU General Public License
|
15 |
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
16 |
+
|
17 |
+
'''
|
18 |
+
Module containing functions to convert hmm alignment to a numbering scheme.
|
19 |
+
|
20 |
+
Currently implemented
|
21 |
+
|
22 |
+
For IG's
|
23 |
+
IMGT
|
24 |
+
Chothia
|
25 |
+
Kabat
|
26 |
+
Martin (Extended Chothia)
|
27 |
+
Aho
|
28 |
+
Wolfguy
|
29 |
+
|
30 |
+
For TR's
|
31 |
+
IMGT
|
32 |
+
(Aho)
|
33 |
+
|
34 |
+
---------------------------------------------------------------------------------------------------------------------
|
35 |
+
Functions are written to a template:
|
36 |
+
|
37 |
+
There are 128 match states in the HMMs (these are the IMGT states). The alignment to these states must be converted to
|
38 |
+
correspond to the scheme of choice.
|
39 |
+
|
40 |
+
We define:
|
41 |
+
- a state string consisting of 'X' and 'I' where:
|
42 |
+
X means that for the state there is an equivalent position in the numbering scheme.
|
43 |
+
I means that for the state there is not an equivalent position in the numbering scheme. It should therefore be
|
44 |
+
considered as an insertion in the scheme.
|
45 |
+
|
46 |
+
- a region string consisting of characters (integers in the currently implemented schemes). Each character
|
47 |
+
corresponds to a contiguous region. Therefore each state can be assigned a region according to the scheme.
|
48 |
+
|
49 |
+
- a mapping between region characters and region indices as a dictionary. e.g. the first region character maps
|
50 |
+
to 0, second to 1 ...
|
51 |
+
|
52 |
+
- a dictionary containing the difference between state number (imgt) and scheme number at the *beginning* of
|
53 |
+
each region using the region indices as keys and the difference as values.
|
54 |
+
|
55 |
+
- the number of regions defined
|
56 |
+
|
57 |
+
- a list for which delete states should not be included in the numbering (typically those for the cdrs). This
|
58 |
+
will allow the length of the region to be the number of residues found instead of the number of possible states plus
|
59 |
+
insertions.
|
60 |
+
|
61 |
+
|
62 |
+
This all goes into the _number_regions function along with the sequence and the state_vector (the alignment from the
|
63 |
+
HMM).
|
64 |
+
|
65 |
+
_number_regions will then divide the aligned part of the sequence into as many regions as defined above. Within each
|
66 |
+
region it will give a numbering according to the input parameters. A list of lists will be returned containing the
|
67 |
+
numbered sequence for each region.
|
68 |
+
|
69 |
+
Some of the regions will not be numbered correctly according to the scheme. For example the insertions for the CDRs
|
70 |
+
will not necessarily be on the correct residue. For each different scheme these regions are then modified (see code
|
71 |
+
for implementation)
|
72 |
+
|
73 |
+
Finally the full numbered sequence is compiled and returned to the calling function.
|
74 |
+
---------------------------------------------------------------------------------------------------------------------
|
75 |
+
|
76 |
+
Other schemes can be implemented following the template above.
|
77 |
+
|
78 |
+
|
79 |
+
'''
|
80 |
+
|
81 |
+
# Alphabet used for insertion codes (the last entry, index -1, is a blank space meaning no insertion)
|
82 |
+
# Built programmatically: "A".."Z", then the doubled letters "AA".."ZZ", then a single space as the final entry.
alphabet = (
    list(map(chr, range(ord("A"), ord("Z") + 1)))
    + list(map(lambda code: chr(code) * 2, range(ord("A"), ord("Z") + 1)))
    + [" "]
)
|
83 |
+
|
84 |
+
# Blosum62 matrix. Used in some annotation methods to recognise pre-defined motifs
|
85 |
+
blosum62 = {('B', 'N'): 3, ('W', 'L'): -2, ('G', 'G'): 6, ('X', 'S'): 0, ('X', 'D'): -1, ('K', 'G'): -2, ('S', 'E'): 0, ('X', 'M'): -1, ('Y', 'E'): -2, ('W', 'R'): -3, ('I', 'R'): -3, ('X', 'Z'): -1, ('H', 'E'): 0, ('V', 'M'): 1, ('N', 'R'): 0, ('I', 'D'): -3, ('F', 'D'): -3, ('W', 'C'): -2, ('N', 'A'): -2, ('W', 'Q'): -2, ('L', 'Q'): -2, ('S', 'N'): 1, ('Z', 'K'): 1, ('V', 'N'): -3, ('Q', 'N'): 0, ('M', 'K'): -1, ('V', 'H'): -3, ('G', 'E'): -2, ('S', 'L'): -2, ('P', 'R'): -2, ('D', 'A'): -2, ('S', 'C'): -1, ('E', 'D'): 2, ('Y', 'G'): -3, ('W', 'P'): -4, ('X', 'X'): -1, ('Z', 'L'): -3, ('Q', 'A'): -1, ('V', 'Y'): -1, ('W', 'A'): -3, ('G', 'D'): -1, ('X', 'P'): -2, ('K', 'D'): -1, ('T', 'N'): 0, ('Y', 'F'): 3, ('W', 'W'): 11, ('Z', 'M'): -1, ('L', 'D'): -4, ('M', 'R'): -1, ('Y', 'K'): -2, ('F', 'E'): -3, ('M', 'E'): -2, ('S', 'S'): 4, ('X', 'C'): -2, ('Y', 'L'): -1, ('H', 'R'): 0, ('P', 'P'): 7, ('K', 'C'): -3, ('S', 'A'): 1, ('P', 'I'): -3, ('Q', 'Q'): 5, ('L', 'I'): 2, ('P', 'F'): -4, ('B', 'A'): -2, ('Z', 'N'): 0, ('M', 'Q'): 0, ('V', 'I'): 3, ('Q', 'C'): -3, ('I', 'H'): -3, ('Z', 'D'): 1, ('Z', 'P'): -1, ('Y', 'W'): 2, ('T', 'G'): -2, ('B', 'P'): -2, ('P', 'A'): -1, ('C', 'D'): -3, ('Y', 'H'): 2, ('X', 'V'): -1, ('B', 'B'): 4, ('Z', 'F'): -3, ('M', 'L'): 2, ('F', 'G'): -3, ('S', 'M'): -1, ('M', 'G'): -3, ('Z', 'Q'): 3, ('S', 'Q'): 0, ('X', 'A'): 0, ('V', 'T'): 0, ('W', 'F'): 1, ('S', 'H'): -1, ('X', 'N'): -1, ('B', 'Q'): 0, ('K', 'A'): -1, ('I', 'Q'): -3, ('X', 'W'): -2, ('N', 'N'): 6, ('W', 'T'): -2, ('P', 'D'): -1, ('B', 'C'): -3, ('I', 'C'): -1, ('V', 'K'): -2, ('X', 'Y'): -1, ('K', 'R'): 2, ('Z', 'R'): 0, ('W', 'E'): -3, ('T', 'E'): -1, ('B', 'R'): -1, ('L', 'R'): -2, ('Q', 'R'): 1, ('X', 'F'): -1, ('T', 'S'): 1, ('B', 'D'): 4, ('Z', 'A'): -1, ('M', 'N'): -2, ('V', 'D'): -3, ('F', 'A'): -2, ('X', 'E'): -1, ('F', 'H'): -1, ('M', 'A'): -1, ('K', 'Q'): 1, ('Z', 'S'): 0, ('X', 'G'): -1, ('V', 'V'): 4, ('W', 'D'): -4, ('X', 'H'): -1, ('S', 'F'): -2, ('X', 'L'): 
-1, ('B', 'S'): 0, ('S', 'G'): 0, ('P', 'M'): -2, ('Y', 'M'): -1, ('H', 'D'): -1, ('B', 'E'): 1, ('Z', 'B'): 1, ('I', 'E'): -3, ('V', 'E'): -2, ('X', 'T'): 0, ('X', 'R'): -1, ('R', 'R'): 5, ('Z', 'T'): -1, ('Y', 'D'): -3, ('V', 'W'): -3, ('F', 'L'): 0, ('T', 'C'): -1, ('X', 'Q'): -1, ('B', 'T'): -1, ('K', 'N'): 0, ('T', 'H'): -2, ('Y', 'I'): -1, ('F', 'Q'): -3, ('T', 'I'): -1, ('T', 'Q'): -1, ('P', 'L'): -3, ('R', 'A'): -1, ('B', 'F'): -3, ('Z', 'C'): -3, ('M', 'H'): -2, ('V', 'F'): -1, ('F', 'C'): -2, ('L', 'L'): 4, ('M', 'C'): -1, ('C', 'R'): -3, ('D', 'D'): 6, ('E', 'R'): 0, ('V', 'P'): -2, ('S', 'D'): 0, ('E', 'E'): 5, ('W', 'G'): -2, ('P', 'C'): -3, ('F', 'R'): -3, ('B', 'G'): -1, ('C', 'C'): 9, ('I', 'G'): -4, ('V', 'G'): -3, ('W', 'K'): -3, ('G', 'N'): 0, ('I', 'N'): -3, ('Z', 'V'): -2, ('A', 'A'): 4, ('V', 'Q'): -2, ('F', 'K'): -3, ('T', 'A'): 0, ('B', 'V'): -3, ('K', 'L'): -2, ('L', 'N'): -3, ('Y', 'N'): -2, ('F', 'F'): 6, ('L', 'G'): -4, ('B', 'H'): 0, ('Z', 'E'): 4, ('Q', 'D'): 0, ('X', 'B'): -1, ('Z', 'W'): -3, ('S', 'K'): 0, ('X', 'K'): -1, ('V', 'R'): -3, ('K', 'E'): 1, ('I', 'A'): -1, ('P', 'H'): -2, ('B', 'W'): -4, ('K', 'K'): 5, ('H', 'C'): -3, ('E', 'N'): 0, ('Y', 'Q'): -1, ('H', 'H'): 8, ('B', 'I'): -3, ('C', 'A'): 0, ('I', 'I'): 4, ('V', 'A'): 0, ('W', 'I'): -3, ('T', 'F'): -2, ('V', 'S'): -2, ('T', 'T'): 5, ('F', 'M'): 0, ('L', 'E'): -3, ('M', 'M'): 5, ('Z', 'G'): -2, ('D', 'R'): -2, ('M', 'D'): -3, ('W', 'H'): -2, ('G', 'C'): -3, ('S', 'R'): -1, ('S', 'I'): -2, ('P', 'Q'): -1, ('Y', 'A'): -2, ('X', 'I'): -1, ('E', 'A'): -1, ('B', 'Y'): -3, ('K', 'I'): -3, ('H', 'A'): -2, ('P', 'G'): -2, ('F', 'N'): -3, ('H', 'N'): 1, ('B', 'K'): 0, ('V', 'C'): -1, ('T', 'L'): -1, ('P', 'K'): -1, ('W', 'S'): -3, ('T', 'D'): -1, ('T', 'M'): -1, ('P', 'N'): -2, ('K', 'H'): -1, ('T', 'R'): -1, ('Y', 'R'): -2, ('L', 'C'): -1, ('B', 'L'): -4, ('Z', 'Y'): -2, ('W', 'N'): -4, ('G', 'A'): 0, ('S', 'P'): -1, ('E', 'Q'): 2, ('C', 'N'): -3, ('H', 'Q'): 0, ('D', 'N'): 1, 
('Y', 'C'): -2, ('L', 'H'): -3, ('E', 'C'): -4, ('Z', 'H'): 0, ('H', 'G'): -2, ('P', 'E'): -1, ('Y', 'S'): -2, ('G', 'R'): -2, ('B', 'M'): -3, ('Z', 'Z'): 4, ('W', 'M'): -1, ('Y', 'T'): -2, ('Y', 'P'): -3, ('Y', 'Y'): 7, ('T', 'K'): -1, ('Z', 'I'): -3, ('T', 'P'): -1, ('V', 'L'): 1, ('F', 'I'): 0, ('G', 'Q'): -2, ('L', 'A'): -1, ('M', 'I'): 1}
|
86 |
+
|
87 |
+
|
88 |
+
def _realign_buffered_states(state_buffer, reg, enforced_patterns):
    '''
    Resolve one buffer of boundary states collected by smooth_insertions.

    @param state_buffer: List of ((state_id, state_type), sequence_index) entries for one boundary stretch.
    @param reg: -1 for the stretch before state 23, otherwise the index into enforced_patterns.
    @param enforced_patterns: The list of enforced (state_id, state_type) patterns, one per boundary.

    @return: A new list of ((state_id, state_type), sequence_index) entries to append to the smoothed state vector.
    '''
    # Find the number of insertions in the buffer
    nins = sum(1 for (_, _type), _ in state_buffer if _type == 'i')

    if nins == 0:  # Nothing to do - either all match or deletion states.
        return list(state_buffer)

    if reg == -1:  # FW1, only adjust if there are the same or more N terminal deletions than insertions
        nt_dels = state_buffer[0][0][0] - 1  # Missing states
        for (_id, _type), _si in state_buffer:  # Explicit deletion states.
            if _type == 'd' or _si is None:
                nt_dels += 1
            else:  # First residue found
                break

        if nt_dels < nins:
            # The insertions may be incorrect but unknown what to do. Let the alignment place.
            return list(state_buffer)

        # More n terminal deletions than insertions found. Likely misalignment.
        # Preserve the deleted states structure by using the same match annotations
        new_states = [s for s, _ in state_buffer if s[1] == 'm']
        _first = new_states[0][0]

        # Remove the deletions so that only residue positions are included
        residues = [s for s in state_buffer if s[0][1] != 'd']

        # Extend N terminal states backwards from the first match states
        _add = len(residues) - len(new_states)
        assert _add >= 0, 'Implementation logic error'  # Should be adding a positive number of positions
        new_states = [(_id, 'm') for _id in range(_first - _add, _first)] + new_states
        assert len(new_states) == len(residues), 'Implementation logic error'  # Should have the same length
    else:
        # Remove any deletions in the buffer. Unlikely to happen but do anyway
        residues = [s for s in state_buffer if s[0][1] != 'd']
        pattern = enforced_patterns[reg]
        surplus = max(0, len(residues) - 3)

        # Define the new states defined by the enforced pattern and the length of the buffer
        if reg % 2:  # nterm fw
            new_states = [pattern[0]] * surplus + pattern[max(4 - len(residues), 1):]
        else:  # cterm fw
            new_states = pattern[:3] + [pattern[2]] * surplus

    # Assign them preserving the order of the sequence.
    return [(new_states[i], residues[i][1]) for i in range(len(residues))]


def smooth_insertions(state_vector):
    '''
    The function aims to correct to the expected imgt alignment. Renumbering functions then translate from the imgt scheme to the
    appropriate scheme.

    Handle insertions made by HMMER that we suspect may be in the wrong position.
    Edge cases include:
        - Insertions at the C terminal of fw1, fw2 and fw3 regions. Can occur when 'conserved' residues have been mutated and the
          same amino acid appears in the following CDR (e.g. mutate cysteine at 104 but the CDR3 has one or more cysteines)
        - Same as above possible (but not observed in structure seqs) for N terminal of fw2, fw3 and fw4... TODO
        - Heavily mutated N terminal regions that are partially recognised (e.g. 3gk8 chain H). Insertions should not be allowed
          before N terminal deletions have been used. Preserve deletion locations that are not N terminal (e.g. 10 in IMGT H) if
          the gap has been placed by the alignment.

    @param state_vector: Iterable of ((state_id, state_type), sequence_index) entries, state_type being 'm', 'i' or 'd'.

    @return: A new list of entries in the same format. States buffered at a boundary are always emitted, including a buffer
             that is still open when the state vector ends (e.g. an alignment that stops at state 119).
    '''
    # Small overhead doing these corrections but worth it for reducing edge cases.

    # Enforce insertion patterns as below. The CDRs are renumbered in each case so that insertions are placed according to the scheme.
    # Any insertions at the end and beginning of framework regions are moved into the CDR region for renumbering:
    # even entries ('mmmi') cover the states just before a CDR, odd entries ('immm') the states just after one.
    enforced_patterns = [[(25, 'm'), (26, 'm'), (27, 'm'), (28, 'i')],
                         [(38, 'i'), (38, 'm'), (39, 'm'), (40, 'm')],
                         [(54, 'm'), (55, 'm'), (56, 'm'), (57, 'i')],
                         [(65, 'i'), (65, 'm'), (66, 'm'), (67, 'm')],
                         [(103, 'm'), (104, 'm'), (105, 'm'), (106, 'i')],
                         [(117, 'i'), (117, 'm'), (118, 'm'), (119, 'm')]]

    # Insertions in FW1 are only allowed if there are a fewer number of n-terminal deletions made.

    state_buffer = []
    sv = []
    reg = None  # Identifier of the boundary stretch currently held in state_buffer
    for (state_id, state_type), si in state_vector:
        if state_id < 23:  # Everything before the cysteine at 23.
            state_buffer.append(((state_id, state_type), si))
            reg = -1
        elif 25 <= state_id < 28:  # Add to the buffer
            state_buffer.append(((state_id, state_type), si))
            reg = 0
        elif 37 < state_id <= 40:  # Add to the buffer
            state_buffer.append(((state_id, state_type), si))
            reg = 1
        elif 54 <= state_id < 57:  # Add to the buffer
            state_buffer.append(((state_id, state_type), si))
            reg = 2
        elif 64 < state_id <= 67:  # Add to the buffer
            state_buffer.append(((state_id, state_type), si))
            reg = 3
        elif 103 <= state_id < 106:  # Add to the buffer
            state_buffer.append(((state_id, state_type), si))
            reg = 4
        elif 116 < state_id <= 119:  # Add to the buffer
            state_buffer.append(((state_id, state_type), si))
            reg = 5
        elif state_buffer:  # Add the (possibly adjusted) buffer, then the current state, and reset
            sv += _realign_buffered_states(state_buffer, reg, enforced_patterns)
            sv.append(((state_id, state_type), si))
            state_buffer = []
        else:  # Simply append
            sv.append(((state_id, state_type), si))

    # The state vector can end while a buffer is still open (truncated alignment). Flush it with the
    # same rules instead of silently discarding the buffered residues.
    if state_buffer:
        sv += _realign_buffered_states(state_buffer, reg, enforced_patterns)

    return sv
|
206 |
+
|
207 |
+
|
208 |
+
# General function to give annotations for regions that have direct mappings onto the hmm alignment (imgt states)
|
209 |
+
def _number_regions(sequence, state_vector, state_string , region_string, region_index_dict, rels, n_regions, exclude_deletions):
    """
    Number a sequence from its aligned state vector and split the result into regions.

    @param sequence: The sequence string
    @param state_vector: The list of states from the aligned hmm
    @param state_string: A string of states for the scheme relative to IMGT (X for a direct equivalence, I if the state
                         needs to be treated as an insertion)
    @param region_string: A string of characters saying which region each hmm state belongs to for this scheme
    @param region_index_dict: A dictionary converting the characters in region_string to a region index
    @param rels: The difference of the numbering integer at the *start* of each region. Updated in place while numbering.
    @param n_regions: The number of regions
    @param exclude_deletions: A list of region indices (typically the CDRs) that are reannotated by the scheme function.
                              Inside these regions the insertion counter is allowed to wrap round.

    @return: (regions, start_index, end_index) where regions is a list of lists of ((number, insertion_code), residue)
             and the indices are the first and last sequence positions that were numbered. Some regions still need
             renumbering by the caller.
    """
    state_vector = smooth_insertions( state_vector )

    _regions = [ [] for _ in range(n_regions) ]

    # Insertion index (-1 selects the blank code) and the previously seen state.
    insertion = -1
    previous_state_id = 1
    previous_state_type = 'd'
    start_index = end_index = None
    region = None

    for (state_id, state_type ), si in state_vector:
        column = state_id - 1  # 0-based index into the per-state strings

        # An insertion stays in the current region; it may only open a region when none is set yet.
        if region is None or state_type != "i":
            region = region_index_dict[region_string[column]]

        if state_type in ("m", "i"):
            if state_type == "m":
                if state_string[column] == "I":
                    # The scheme has no position of its own here: treat as an insertion,
                    # unless a deletion came just before, in which case it takes the real position.
                    if previous_state_type != 'd':
                        insertion += 1
                    rels[region] -= 1
                else:
                    insertion = -1
                number = state_id + rels[region]
                previous_state_id = state_id
            else:
                insertion += 1
                number = previous_state_id + rels[region]

            # Record the residue under its scheme number in the current region.
            _regions[region].append( ( (number, alphabet[insertion]), sequence[si] ) )
            if start_index is None:
                start_index = si
            end_index = si
            previous_state_type = state_type

        else:  # Deletion state
            previous_state_type = state_type
            if state_string[column] == "I":
                # Position does not exist in the scheme; only the offset changes.
                rels[region] -= 1
                continue
            insertion = -1
            previous_state_id = state_id

        # Wrap the insertion index where the codes are meaningless anyway (region gets reannotated).
        if insertion >= 25 and region in exclude_deletions:
            insertion = 0

        assert insertion < 25, "Too many insertions for numbering scheme to handle" # We ran out of letters.

    return _regions, start_index, end_index
|
297 |
+
|
298 |
+
|
299 |
+
# Functions to perform the numbering and the corrections for each of the implemented schemes.
|
300 |
+
# These have been written fairly verbosely so that the template of how to generate a function for a new scheme is more clear.
|
301 |
+
# They have two stages: Perform the mapping between imgt and the scheme; Renumber those regions that do not map nicely onto imgt (e.g. CDR insertions)
|
302 |
+
|
303 |
+
|
304 |
+
|
305 |
+
########
|
306 |
+
# IMGT #
|
307 |
+
########
|
308 |
+
# - Renumbering of the CDR 1 and 2 regions in IMGT has now been implemented to ensure consistency with the gapping rules of the
|
309 |
+
# scheme. Previously gaps were defined using the HMM alignment as the underlying model was already based on the IMGT scheme. This
|
310 |
+
# worked well in original test cases but appears to give inaccurate annotations in a significant number of cases in NGS size
|
311 |
+
# sequence sets. We therefore now explicitly renumber the CDR 1 and 2 as with all the other schemes.
|
312 |
+
|
313 |
+
def number_imgt(state_vector, sequence):
    """
    Number an aligned heavy or light chain sequence with the IMGT scheme.

    The rules are expressed with two strings over the 128 HMM states: the state string (X = the IMGT position exists
    in the scheme, I = treat as insertion; all X for IMGT) and the region string.

    XXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXX XXXXXXXXXXXXXXXXX XXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXX
    11111111111111111111111111 222222222222 33333333333333333 4444444444 555555555555555555555555555555555555555 6666666666666 77777777777

    Regions - (N.B These do not match up with any particular definition of CDR)
     1. All positions before CDR1
     2. CDR1 positions
     3. Positions between CDR1/2
     4. CDR2 positions
     5. Positions between CDR2/3
     6. CDR positions 105 (inc) to 118 (exc)
     7. Positions after CDR3

    @param state_vector: The list of states from the aligned hmm
    @param sequence: The sequence string

    @return: (numbering, startindex, endindex). numbering is an empty list when CDR3 is too long to annotate.
    """

    # 'X' means the imgt position exists in the scheme, 'I' that it is an insertion of the previous number.
    state_string =  'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

    # Regions that are treated separately when the numbering is put together.
    region_string = '11111111111111111111111111222222222222333333333333333334444444444555555555555555555555555555555555555555666666666666677777777777'

    region_index_dict = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4, "6": 5, "7": 6}

    # Offset of the scheme's numbering from IMGT at the start of each region (all zero for IMGT itself).
    rels = {index: 0 for index in range(8)}

    n_regions = 7
    exclude_deletions = [1, 3, 5]

    _regions, startindex, endindex = _number_regions(sequence, state_vector, state_string , region_string, region_index_dict, rels, n_regions, exclude_deletions)

    def _loop_residues(region):
        # Residue letters of a region, skipping gap characters.
        return "".join([ entry[1] for entry in region if entry[1] != "-" ])

    def _renumber_loop(loopseq, maxlength, start, end):
        # Lay the loop residues onto the annotations from get_imgt_cdr, writing explicit gaps for unused positions.
        renumbered = []
        consumed = 0
        last_number = start - 1
        for ann in get_imgt_cdr(len(loopseq), maxlength, start, end):
            if ann is None:
                last_number += 1
                renumbered.append( ((last_number, ' '), '-') )
            else:
                renumbered.append( (ann, loopseq[consumed]) )
                last_number = ann[0]
                consumed += 1
        return renumbered

    ###############
    # Renumbering #
    ###############

    # Frameworks are taken as numbered; the three CDR slots are filled in below.
    # FW3: the HMM is allowed to place insertions as there is no specification of where they should go.
    _numbering = [ _regions[0], # Fw1
                   [],          # CDR1
                   _regions[2], # Fw2
                   [],          # CDR2
                   _regions[4], # Fw3
                   [],          # CDR3
                   _regions[6], # Fw4
                 ]

    # The HMMER alignment is not always right for CDRs 1 and 2, so they are renumbered 'manually' like the other schemes.

    # CDR1 has a range from 27 (inc.) to 39 (exc.) and has a theoretical maximum length of 12.
    _numbering[1] = _renumber_loop(_loop_residues(_regions[1]), 12, 27, 39)

    # CDR2 has a range from 56 (inc.) to 66 (exc.) and has a theoretical length of 10.
    _numbering[3] = _renumber_loop(_loop_residues(_regions[3]), 10, 56, 66)

    # CDR3 has a range from 105 (inc.) to 118 (exc.). Insertions are placed on 112 and 111 symmetrically.
    cdr3seq = _loop_residues(_regions[5])
    if len(cdr3seq) > 117:
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    _numbering[5] = _renumber_loop(cdr3seq, 13, 105, 118)

    # Return the full vector and the start and end indices of the numbered region of the sequence
    return gap_missing( _numbering ), startindex, endindex
|
440 |
+
|
441 |
+
def get_imgt_cdr(length, maxlength, start, end):
    """
    Build the symmetric IMGT annotations for a CDR loop (e.g. CDRL1/CDRH2 for IMGT)

    @param length: The length of the target CDR
    @param maxlength: The theoretical limit (e.g. L1 = 12 for the IMGT scheme)
    @param start, end: Start (inclusive) and end (exclusive) position numbers

    @return: A list of max(length, maxlength) entries in sequence order. Each entry is a (number, insertion_code)
             tuple, or None for a position that is left as a gap.
    """
    annotations = [None] * max(length, maxlength)

    if length == 0:
        return annotations
    if length == 1:
        annotations[0] = (start, ' ')
        return annotations

    # Insertion codes in forward order and in reverse order (the last entry of alphabet is left out).
    az = alphabet[:-1]
    za = az[::-1]

    # Fill the regular positions alternately from the front and from the back.
    front, back = 0, -1
    for i in range(min(length, maxlength)):
        if i % 2 == 0:
            annotations[front] = (start + front, " ")
            front += 1
        else:
            annotations[back] = (end + back, " ")
            back -= 1

    # Whatever is still unset sits in the middle of the loop.
    unset = [ index for index, value in enumerate(annotations) if value is None ]
    if not unset:
        return annotations

    centre_left = annotations[unset[0] - 1][0]    # Number just before the first unset entry
    centre_right = annotations[unset[-1] + 1][0]  # Number just after the last unset entry

    # The front half takes the extra position when maxlength is odd.
    backfactor = maxlength // 2
    frontfactor = backfactor + 1 if maxlength % 2 else backfactor

    # Add insertions around the centre point, alternating right then left.
    for i in range(max(0, length - maxlength)):
        if i % 2 == 0:
            annotations[back] = (centre_right, za[back + backfactor])
            back -= 1
        else:
            annotations[front] = (centre_left, az[front - frontfactor])
            front += 1

    return annotations
|
494 |
+
|
495 |
+
|
496 |
+
#######
|
497 |
+
# Aho #
|
498 |
+
#######
|
499 |
+
# Heuristic regapping based on the AHo specification as detailed on AAAAA website. Gap order depends on the chain type
|
500 |
+
def number_aho(state_vector, sequence, chain_type):
    """
    Apply the Aho numbering scheme

    Rules should be implemented using two strings - the state string and the region string.

    There are 128 states in the HMMs. Treat X as a direct match in the scheme, I is an insertion. (All X's here)

    XXXXXXX XXX XXXXXXXXXXXXXX XXXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXX
    AAAAAAA BBB CCCCCCCCCCCCCC DDDDDDDDDDDDDDDD EEEEEEEEEEEEEEE FFFFFFFFFFFFFFFFFFFF HHHHHHHHHHHHHHHH IIIIIIIIIIIII JJJJJJJJJJJJJ KKKKKKKKKKK

    Regions - (N.B These do not match up with any particular definition of CDR)
     A. EMPTY (now included in B)
     B. 1-10 inclusive. Indel occurs at 8
     C. 11-24 inclusive.
     D. 25-42 inclusive (deletion surround 28) 32-42 inclusive (deletions surround 36)
     E. 43-57 inclusive
     F. 58-77 inclusive (deletions surround 63). Alpha chains have deletions at 74,75
     G. EMPTY (now included in H)
     H. 78-93 inclusive  gaps on 86 then 85, insertions on 85 linearly
     I. 94-106 inclusive
     J. 107-138 inclusive gaps on 123 symmetrically.
     K. 139-149 inclusive.

    @param state_vector: The list of states from the aligned hmm
    @param sequence: The sequence string
    @param chain_type: Chain type letter, one of 'L', 'K', 'H', 'A', 'B', 'D', 'G'. Selects the CDR1 gap order;
                       'A' also selects the alternative CDR2 gap order.

    @return: (numbering, startindex, endindex). numbering is an empty list when a region has more than 26
             insertions and the scheme cannot be applied.
    """

    # Set up the numbering

    # State string - 'X' means the imgt position exists in the scheme. 'I' means that it should be treated as an insertion of the previous number
    state_string =  'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

    # Region string - regions that should be treated separately in putting the numbering together
    # (region indices used below: B=1, C=2, D=3, E=4, F=5, H=7, I=8, J=9, K=10)
    region_string = 'BBBBBBBBBBCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFFFFFHHHHHHHHHHHHHHHHIIIIIIIIIIIIIJJJJJJJJJJJJJKKKKKKKKKKK'

    region_index_dict = {letter: index for index, letter in enumerate("ABCDEFGHIJK")}

    # Define how the scheme's numbering differs from IMGT at the start of each region.
    # This is updated in place by _number_regions.
    rels = {0:0,
            1:0,
            2:0,
            3:0,
            4:2,
            5:2,
            6:2,
            7:2,
            8:2,
            9:2,
            10:21}

    n_regions = 11

    exclude_deletions = [1,3,4,5,7,9]

    _regions, startindex, endindex = _number_regions(sequence, state_vector, state_string , region_string, region_index_dict, rels, n_regions, exclude_deletions)

    ###############
    # Renumbering #
    ###############

    # Regions 1, 3, 5, 7 and 9 are regapped below; the others keep the numbering given by _number_regions.
    _numbering = [ _regions[0], _regions[1], _regions[2], [], _regions[4], [], _regions[6], [], _regions[8], _regions[9], _regions[10] ]

    ##################################
    # Move the indel in fw 1 onto 8  #
    ##################################

    # Place indels on 8
    # Find the first recognised residue and change the expected length of the stretch given the starting point.
    # This prevents n terminal deletions being placed at 8 incorrectly.
    length = len( _regions[1] )
    if length > 0:
        start = _regions[1][0][0][0]
        stretch_len = 10 - (start - 1)
        if length > stretch_len: # Insertions are present. Place on 8
            annotations = [ (_, " ") for _ in range(start, 9) ] + [ (8, alphabet[_]) for _ in range( length - stretch_len ) ] + [(9, " "), (10, " ")]
        else:
            # Position 8 is only a candidate when the stretch actually reaches it. Listing it for a stretch that
            # starts at 9 or 10 would add a spurious entry and shift every residue down by one position.
            ordered_deletions = [(8, " ")] if start <= 8 else []
            ordered_deletions += [ (_, " ") for _ in range(start, 11) if _ != 8 ]
            annotations = sorted( ordered_deletions[max(stretch_len - length, 0):] )
        _numbering[1] = [ (annotations[i], _regions[1][i][1]) for i in range(length) ]

    #########
    # CDR 1 # - divided in two parts in the Aho scheme.
    ######### - gaps at 28 depending on the chain type.

    # "VH domains, as well as the majority of the VA domains, have a one-residue gap in position 28, VK and VB domains a two-residue
    # gap in position 27 and 28."

    # We use the link below as the reference for the scheme.
    # https://www.bioc.uzh.ch/plueckthun/antibody/Numbering/Alignment.html

    # Some of the header lines in these images are offset by one (VH)! The gaps really are centered at 28 and 36
    # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VK.html
    # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VL.html
    # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VH.html
    # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VA.html
    # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VB.html
    # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VG.html
    # https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VD.html

    # We gap the CDR1 in a heuristic way using the gaps.
    # This means that CDR1 gapping will not always be correct. For example if one grafts a Kappa CDR1 loop onto a Lambda framework
    # the gapping pattern might now be incorrect.
    # Not a fan of being so prescriptive.

    # The CDR1 region included here ranges from AHo 25 to AHo 42 inclusive

    # The order in which the two loops are gapped is dependent on the chain type (see alignments in URLs above).
    # Not all lengths are defined as not all lengths were crystallised in 2001 (or today). Where no example of the length was
    # available the rule followed is to continue gapping the C terminal 'loop', then the N terminal 'loop', then 31 then the fw.
    # The tail of each order below is therefore undefined by AHo. Note that for alpha chains the gapping rules are inconsistent.

    _L = 28,36,35,37,34,38,27,29,33,39,32,40,26,30,25,31,41,42
    # Tail undefined by AHo. Gapping C terminal loop then N terminal then 31, then fw.
    _K = 28,27,36,35,37,34,38,33,39,32,40,29,26,30,25,31,41,42
    # Tail undefined by AHo. Gapping C terminal loop then N terminal then fw.
    _H = 28,36,35,37,34,38,27,33,39,32,40,29,26,30,25,31,41,42
    # Tail undefined by AHo. Gapping C terminal loop then N terminal then fw.
    # N.B. The header on the alignment image for PDB_VH is offset by 1!
    _A = 28,36,35,37,34,38,33,39,27,32,40,29,26,30,25,31,41,42
    # Tail undefined by AHo. Gapping C terminal loop then N terminal then fw.
    # N.B The gapping is inconsistent for alpha chains. I follow the paper's statement that most VA have
    # one gap at 28 and remove 28 and 27 before removing 40.
    _B = 28,36,35,37,34,38,33,39,27,32,40,29,26,30,25,31,41,42
    # Tail undefined by AHo. Gapping C terminal loop then N terminal then 31, then fw.
    _D = 28,36,35,37,34,38,27,33,39,32,40,29,26,30,25,31,41,42
    # Tail undefined by AHo. Gapping C terminal loop then N terminal then 31, then fw.
    # N.B only two sequence patterns available.
    _G = 28,36,35,37,34,38,27,33,39,32,40,29,26,30,25,31,41,42
    # Tail undefined by AHo. Gapping C terminal loop then N terminal then 31, then fw.
    # N.B only one sequence patterns available. Delta copied.

    ordered_deletions = { 'L':_L, 'K':_K, 'H':_H, 'A':_A, 'B':_B, 'D':_D, 'G':_G }

    length = len( _regions[3] )

    # Drop the first (18 - length) positions of the gap order; the remaining positions are the ones occupied.
    annotations = [ (i, ' ') for i in sorted( ordered_deletions[chain_type][ max(18-length, 0): ] ) ]

    # Insertions are not described in the AHo scheme but must be included as there is a significant number of CDRH1s that are
    # longer than the number of positions.
    insertions = max( length-18 , 0 )
    if insertions > 26:
        return [], startindex, endindex # Too many insertions. Do not apply numbering.
    elif insertions > 0:
        # They are placed on residue 36 alphabetically.
        insertat = annotations.index( (36, ' ') )+1 # Always 12
        assert insertat == 12, 'AHo numbering failed'
        annotations = annotations[:insertat] + [ (36, alphabet[a]) for a in range( insertions ) ] + annotations[insertat:]

    _numbering[3] = [ (annotations[i], _regions[3][i][1]) for i in range(length) ]

    #########
    # CDR 2 #
    #########
    # Gaps are placed symmetrically at 63.
    # For VA a second gap is placed at 74 and 75 according to the text in the paper. However, all the reference sequences show a
    # gap at 73 and 74 see:
    #      https://www.bioc.uzh.ch/plueckthun/antibody/Sequences/Rearranged/PDB_VA.html
    # and
    #      https://www.bioc.uzh.ch/plueckthun/antibody/Numbering/Alignment.html
    # Either I am mis-interpreting the text in the paper or there is something a little inconsistent here...
    # Given that *all* the numbered examples show the VA gap at 73 and 74 on the AAAAA website I have decided to implement this.
    #

    # This region describes 58 to 77 inclusive

    if chain_type == 'A':
        ordered_deletions = [74,73,63,62,64,61,65,60,66,59,67,58,68,69,70,71,72,75,76,77]
    else:
        ordered_deletions = [63,62,64,61,65,60,66,59,67,58,68,69,70,71,72,73,74,75,76,77]

    length = len(_regions[5])

    annotations = [ (i, ' ') for i in sorted( ordered_deletions[ max(20-length, 0): ] ) ]

    # Insertions are not described in the AHo scheme but must be included.
    insertions = max( length-20 , 0 )
    if insertions > 26:
        return [], startindex, endindex # Too many insertions. Do not apply numbering.
    elif insertions > 0:
        # They are placed on residue 63 alphabetically.
        insertat = annotations.index( (63, ' ') )+1 # Always 6
        assert insertat == 6, 'AHo numbering failed'
        annotations = annotations[:insertat] + [ (63, alphabet[a]) for a in range( insertions ) ] + annotations[insertat:]

    _numbering[5] = [ (annotations[i], _regions[5][i][1]) for i in range(length) ]

    #########
    # FW3   ############################################
    # Move deletions onto 86 then 85. Insertions on 85 #
    ####################################################
    ordered_deletions = [86,85,87,84,88,83,89,82,90,81,91,80,92,79,93,78]
    length = len( _regions[7] )

    annotations = [ (i, ' ') for i in sorted( ordered_deletions[ max(16-length, 0): ] ) ]

    # Insertions are not described in the AHo scheme but must be included.
    insertions = max( length-16 , 0 )
    if insertions > 26:
        return [], startindex, endindex # Too many insertions. Do not apply numbering.
    elif insertions > 0:
        # They are placed on residue 85 alphabetically.
        insertat = annotations.index( (85, ' ') )+1 # Always 8
        assert insertat == 8, 'AHo numbering failed'
        annotations = annotations[:insertat] + [ (85, alphabet[a]) for a in range( insertions ) ] + annotations[insertat:]

    _numbering[7] = [ (annotations[i], _regions[7][i][1]) for i in range(length) ]

    #########
    # CDR 3 #
    #########
    # Deletions on 123.
    # Point of the Aho scheme is that they have accounted for all possible positions.
    # Assumption is that no more insertions will occur....
    # We'll put insertions on 123 linearly.(i.e.ABCDEF...) if they ever do.

    ordered_deletions = [123,124,122,125,121,126,120,127,119,128,118,129,117,130,116,131,115,132,114,133,113,134,112,135,111,
                         136,110,137,109,138,108,107]

    length = len( _regions[9] )

    annotations = [ (i, ' ') for i in sorted( ordered_deletions[ max(32-length, 0): ] ) ]

    # Insertions are not described in the AHo scheme but must be included.
    insertions = max( length-32 , 0 )
    if insertions > 26:
        return [], startindex, endindex # Too many insertions. Do not apply numbering.
    elif insertions > 0:
        # They are placed on residue 123 alphabetically.
        insertat = annotations.index( (123, ' ') )+1 # Always 17
        assert insertat == 17, 'AHo numbering failed'
        annotations = annotations[:insertat] + [ (123, alphabet[a]) for a in range( insertions ) ] + annotations[insertat:]

    _numbering[9] = [ (annotations[i], _regions[9][i][1]) for i in range(length) ]

    # AHo includes one extra position than IMGT in what it considers the variable domain for light chains.
    # If the last state is 148 and there is at least one more residue left, then add the residue to the numbering.
    numbering = gap_missing( _numbering )
    if len(numbering) > 0:
        if numbering[-1][0] == (148, ' ') and numbering[-1][1] != '-' and endindex+1 < len(sequence):
            numbering.append( ( (149, ' '), sequence[endindex+1]) )
            endindex += 1

    return numbering, startindex, endindex
|
747 |
+
|
748 |
+
|
749 |
+
###########
|
750 |
+
# Chothia #
|
751 |
+
###########
|
752 |
+
|
753 |
+
# Heavy chains
|
754 |
+
def number_chothia_heavy(state_vector, sequence):
    """
    Number a heavy chain with the Chothia scheme.

    The aligned states are split into eight regions by ``_number_regions`` using
    the state string and region string defined below. Regions with an odd index
    (1, 3, 5, 7) keep the numbering produced from the alignment. Regions with an
    even index are relabelled here:

        index 0 - if the HMM reported insertions, they are all placed on position 6
        index 2 - CDRH1 region, positions 23-33, insertions placed on 31
        index 4 - CDRH2 region, positions 50-57, insertions placed on 52
        index 6 - CDRH3 region, labels supplied by ``get_cdr3_annotations``

    FW3 (index 5) is deliberately left to the alignment: the first three insertions
    land on 82 automatically and any further ones are placed wherever the alignment
    puts them (Martin, by contrast, renumbers this region onto 72).

    Returns a tuple ``(numbering, startindex, endindex)`` where ``numbering`` is the
    output of ``gap_missing`` and the two indices are those reported by
    ``_number_regions``. If the CDRH3 region holds more than 36 residues the
    numbering element is an empty list.
    """

    # 'X' - the IMGT position exists in the scheme.
    # 'I' - the position is treated as an insertion on the previous number.
    state_string = 'XXXXXXXXXIXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXXXXXXXXXXIXIIXXXXXXXXXXXIXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXX'

    # One character per state, naming the region that state is assigned to.
    region_string = '11111111112222222222222333333333333333444444444444444455555555555666666666666666666666666666666666666666777777777777788888888888'

    region_index_dict = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4, "6": 5, "7": 6, "8": 7}

    # Offset between the scheme's numbering and IMGT at the start of each region.
    rels = {0: 0, 1: -1, 2: -1, 3: -5, 4: -5, 5: -8, 6: -12, 7: -15}

    n_regions = 8

    # Regions in which deletions are not placed by the alignment step.
    exclude_deletions = [0, 2, 4, 6]

    _regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, n_regions, exclude_deletions)

    def relabel(labels, region):
        # Attach the k-th label to the residue held in the k-th entry of the region.
        return [(labels[k], region[k][1]) for k in range(len(region))]

    # Even slots are filled in below; odd slots are taken from the alignment as is.
    _numbering = [[], _regions[1], [], _regions[3], [], _regions[5], [], _regions[7]]

    # --- Region index 0: move every HMM-reported insertion onto position 6 ---
    head = _regions[0]
    n_inserted = sum(1 for entry in head if entry[0][1] != " ")
    if n_inserted:
        # The region may begin after position 1 (e.g. at 2), so start from what the HMM found.
        # The original author flagged this as a possible source of bugs in very unusual cases.
        first = head[0][0][0]
        labels = [(pos, " ") for pos in range(first, 7)]
        labels += [(6, alphabet[k]) for k in range(n_inserted)]
        labels += [(7, " "), (8, " "), (9, " ")]
        _numbering[0] = relabel(labels, head)
    else:
        _numbering[0] = head

    # --- Region index 2: CDRH1, insertions on 31 ---
    # The region is pulled back to the cysteine (23) because heavily engineered CDR1s
    # were not handled well otherwise; eleven positions (23-33) are available.
    cdr1 = _regions[2]
    size = len(cdr1)
    extra = max(size - 11, 0)
    if extra:
        labels = [(pos, " ") for pos in range(23, 32)]
        labels += [(31, alphabet[k]) for k in range(extra)]
        labels += [(32, " "), (33, " ")]
    else:
        labels = [(pos, " ") for pos in range(23, 32)][:size - 2] + [(32, " "), (33, " ")][:size]
    _numbering[2] = relabel(labels, cdr1)

    # --- Region index 4: CDRH2, positions 50-57, insertions on 52 ---
    # Deletion order is 52, 51, 50, 53, 54, 55, 56, 57.
    cdr2 = _regions[4]
    size = len(cdr2)
    extra = max(size - 8, 0)
    labels = [(50, " "), (51, " "), (52, " ")][:max(0, size - 5)]
    labels += [(52, alphabet[k]) for k in range(extra)]
    labels += [(pos, " ") for pos in range(53, 58)][max(0, 5 - size):]
    _numbering[4] = relabel(labels, cdr2)

    # --- Region index 6: CDRH3, insertions on 100 ---
    cdr3 = _regions[6]
    size = len(cdr3)
    if size > 36:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    labels = get_cdr3_annotations(size, scheme="chothia", chain_type="heavy")
    _numbering[6] = relabel(labels, cdr3)

    # Full numbering plus the start and end indices of the numbered part of the sequence.
    return gap_missing(_numbering), startindex, endindex
|
869 |
+
|
870 |
+
# Light chains
|
871 |
+
def number_chothia_light(state_vector, sequence):
    """
    Number a light chain with the Chothia scheme.

    The aligned states are split into seven regions by ``_number_regions`` using
    the state string and region string defined below. Regions with index 0, 2 and 6
    keep the numbering produced from the alignment. The others are handled here:

        index 1 - CDRL1 region, positions 24-34, insertions placed on 30
        index 3 - CDRL2 region, positions 51-54, insertions placed on 52
                  (when there are no insertions the alignment's numbering is kept)
        index 4 - FW3, positions 55-88, insertions placed on 68 and the first
                  deletion taken at 68; further deletions are left to the alignment
        index 5 - CDRL3 region, labels supplied by ``get_cdr3_annotations``

    Returns a tuple ``(numbering, startindex, endindex)`` where ``numbering`` is the
    output of ``gap_missing`` and the two indices are those reported by
    ``_number_regions``. If the CDRL3 region holds more than 35 residues the
    numbering element is an empty list.
    """

    # 'X' - the IMGT position exists in the scheme.
    # 'I' - the position is treated as an insertion on the previous number.
    state_string = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIIIIXXXXXXXXXXXXXXXXXXXXXXIIIIIIIXXXXXXXXIXXXXXXXIIXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXX'

    # One character per state, naming the region that state is assigned to.
    region_string = '11111111111111111111111222222222222222223333333333333333444444444445555555555555555555555555555555555555666666666666677777777777'

    region_index_dict = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4, "6": 5, "7": 6}

    # Offset between the scheme's numbering and IMGT at the start of each region.
    rels = {0: 0, 1: 0, 2: -6, 3: -6, 4: -13, 5: -16, 6: -20}

    n_regions = 7

    # Regions in which deletions are not placed by the alignment step.
    exclude_deletions = [1, 3, 4, 5]

    _regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, n_regions, exclude_deletions)

    def relabel(labels, region):
        # Attach the k-th label to the residue held in the k-th entry of the region.
        return [(labels[k], region[k][1]) for k in range(len(region))]

    # Slots 1, 3 and 5 are filled in below; slot 4 is overwritten in the FW3 step.
    _numbering = [_regions[0], [], _regions[2], [], _regions[4], [], _regions[6]]

    # --- Region index 1: CDRL1, insertions on 30, deletions taken forward from 31 ---
    cdr1 = _regions[1]
    size = len(cdr1)
    extra = max(size - 11, 0)  # eleven positions are available, the rest are insertions
    labels = [(pos, " ") for pos in range(24, 31)][:max(0, size)]
    labels += [(30, alphabet[k]) for k in range(extra)]
    labels += [(pos, " ") for pos in range(31, 35)][max(0, 11 - size):]
    _numbering[1] = relabel(labels, cdr1)

    # --- Region index 3: CDRL2, insertions on 52 ---
    cdr2 = _regions[3]
    size = len(cdr2)
    extra = max(size - 4, 0)
    if extra > 0:
        labels = [(51, " "), (52, " ")]
        labels += [(52, alphabet[k]) for k in range(extra)]
        labels += [(53, " "), (54, " ")]
        _numbering[3] = relabel(labels, cdr2)
    else:
        # How to gap L2 in Chothia/Kabat/Martin is unclear, so the alignment decides.
        _numbering[3] = cdr2

    # --- Region index 4: FW3, insertions on 68, first deletion at 68 ---
    fw3 = _regions[4]
    size = len(fw3)
    extra = max(size - 34, 0)
    if extra > 0:
        labels = [(pos, " ") for pos in range(55, 69)]
        labels += [(68, alphabet[k]) for k in range(extra)]
        labels += [(pos, " ") for pos in range(69, 89)]
        _numbering[4] = relabel(labels, fw3)
    elif size == 33:
        # Exactly one residue short: drop position 68.
        labels = [(pos, " ") for pos in range(55, 68)] + [(pos, " ") for pos in range(69, 89)]
        _numbering[4] = relabel(labels, fw3)
    else:
        # More deletions: let the alignment place them.
        _numbering[4] = fw3

    # --- Region index 5: CDRL3, insertions on 95 ---
    cdr3 = _regions[5]
    size = len(cdr3)
    if size > 35:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    labels = get_cdr3_annotations(size, scheme="chothia", chain_type="light")
    _numbering[5] = relabel(labels, cdr3)

    # Full numbering plus the start and end indices of the numbered part of the sequence.
    return gap_missing(_numbering), startindex, endindex
|
979 |
+
|
980 |
+
|
981 |
+
#########
|
982 |
+
# Kabat #
|
983 |
+
#########
|
984 |
+
|
985 |
+
# Heavy chains
|
986 |
+
def number_kabat_heavy(state_vector, sequence):
    """
    Number a heavy chain with the Kabat scheme.

    The aligned states are split into eight regions by ``_number_regions`` using
    the state string and region string defined below. Regions with an odd index
    (1, 3, 5, 7) keep the numbering produced from the alignment. Regions with an
    even index are relabelled here:

        index 0 - if the HMM reported insertions, they are all placed on position 6
        index 2 - CDRH1 region, positions 23-35, insertions placed on 35 and
                  deletions taken backwards from 35
        index 4 - CDRH2 region, positions 50-57, insertions placed on 52
        index 6 - CDRH3 region, labels supplied by ``get_cdr3_annotations``

    FW3 (index 5) is deliberately left to the alignment: the first three insertions
    land on 82 automatically and any further ones are placed wherever the alignment
    puts them (Martin, by contrast, renumbers this region onto 72).

    Returns a tuple ``(numbering, startindex, endindex)`` where ``numbering`` is the
    output of ``gap_missing`` and the two indices are those reported by
    ``_number_regions``. If the CDRH3 region holds more than 36 residues the
    numbering element is an empty list.
    """

    # 'X' - the IMGT position exists in the scheme.
    # 'I' - the position is treated as an insertion on the previous number.
    state_string = 'XXXXXXXXXIXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXXXXXXXXXXIXIIXXXXXXXXXXXIXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXX'

    # One character per state, naming the region that state is assigned to.
    region_string = '11111111112222222222222333333333333333334444444444444455555555555666666666666666666666666666666666666666777777777777788888888888'

    region_index_dict = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4, "6": 5, "7": 6, "8": 7}

    # Offset between the scheme's numbering and IMGT at the start of each region.
    rels = {0: 0, 1: -1, 2: -1, 3: -5, 4: -5, 5: -8, 6: -12, 7: -15}

    n_regions = 8

    # Regions in which deletions are not placed by the alignment step.
    exclude_deletions = [2, 4, 6]

    _regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, n_regions, exclude_deletions)

    def relabel(labels, region):
        # Attach the k-th label to the residue held in the k-th entry of the region.
        return [(labels[k], region[k][1]) for k in range(len(region))]

    # Even slots are filled in below; odd slots are taken from the alignment as is.
    _numbering = [[], _regions[1], [], _regions[3], [], _regions[5], [], _regions[7]]

    # --- Region index 0: move every HMM-reported insertion onto position 6 ---
    head = _regions[0]
    n_inserted = sum(1 for entry in head if entry[0][1] != " ")
    if n_inserted:
        # The region may begin after position 1 (e.g. at 2), so start from what the HMM found.
        # The original author flagged this as a possible source of bugs in very unusual cases.
        first = head[0][0][0]
        labels = [(pos, " ") for pos in range(first, 7)]
        labels += [(6, alphabet[k]) for k in range(n_inserted)]
        labels += [(7, " "), (8, " "), (9, " ")]
        _numbering[0] = relabel(labels, head)
    else:
        _numbering[0] = head

    # --- Region index 2: CDRH1, insertions on 35, deletions backwards from 35 ---
    cdr1 = _regions[2]
    size = len(cdr1)
    extra = max(0, size - 13)
    labels = [(pos, ' ') for pos in range(23, 36)][:size]
    labels += [(35, alphabet[k]) for k in range(extra)]
    _numbering[2] = relabel(labels, cdr1)

    # --- Region index 4: CDRH2, positions 50-57, insertions on 52 ---
    # Deletion order is 52, 51, 50, 53, 54, 55, 56, 57.
    cdr2 = _regions[4]
    size = len(cdr2)
    extra = max(size - 8, 0)
    labels = [(50, " "), (51, " "), (52, " ")][:max(0, size - 5)]
    labels += [(52, alphabet[k]) for k in range(extra)]
    labels += [(pos, " ") for pos in range(53, 58)][max(0, 5 - size):]
    _numbering[4] = relabel(labels, cdr2)

    # --- Region index 6: CDRH3, insertions on 100 (Chothia and Kabat agree here) ---
    cdr3 = _regions[6]
    size = len(cdr3)
    if size > 36:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    labels = get_cdr3_annotations(size, scheme="kabat", chain_type="heavy")
    _numbering[6] = relabel(labels, cdr3)

    # Full numbering plus the start and end indices of the numbered part of the sequence.
    return gap_missing(_numbering), startindex, endindex
|
1099 |
+
|
1100 |
+
# Light chains
|
1101 |
+
def number_kabat_light(state_vector, sequence):
    """
    Number a light chain with the Kabat scheme.

    The aligned states are split into seven regions by ``_number_regions`` using
    the state string and region string defined below. Regions with an even index
    (0, 2, 4, 6) keep the numbering produced from the alignment. The others are
    handled here:

        index 1 - CDRL1 region, positions 24-34, insertions placed on 27
        index 3 - CDRL2 region, positions 51-54, insertions placed on 52
                  (when there are no insertions the alignment's numbering is kept)
        index 5 - CDRL3 region, labels supplied by ``get_cdr3_annotations``

    FW3 (index 4) is left entirely to the alignment. The Kabat scheme was defined
    from a sequence alignment alone, so insertions there follow the HMM rather than
    being forced onto 68 as in Martin and Chothia.

    Returns a tuple ``(numbering, startindex, endindex)`` where ``numbering`` is the
    output of ``gap_missing`` and the two indices are those reported by
    ``_number_regions``. If the CDRL3 region holds more than 35 residues the
    numbering element is an empty list.
    """

    # 'X' - the IMGT position exists in the scheme.
    # 'I' - the position is treated as an insertion on the previous number.
    state_string = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIIIIXXXXXXXXXXXXXXXXXXXXXXIIIIIIIXXXXXXXXIXXXXXXXIIXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXX'

    # One character per state, naming the region that state is assigned to.
    region_string = '11111111111111111111111222222222222222223333333333333333444444444445555555555555555555555555555555555555666666666666677777777777'

    region_index_dict = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4, "6": 5, "7": 6}

    # Offset between the scheme's numbering and IMGT at the start of each region.
    rels = {0: 0, 1: 0, 2: -6, 3: -6, 4: -13, 5: -16, 6: -20}

    n_regions = 7

    # Regions in which deletions are not placed by the alignment step.
    exclude_deletions = [1, 3, 5]

    _regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, n_regions, exclude_deletions)

    def relabel(labels, region):
        # Attach the k-th label to the residue held in the k-th entry of the region.
        return [(labels[k], region[k][1]) for k in range(len(region))]

    # Odd slots are filled in below; even slots are taken from the alignment as is.
    _numbering = [_regions[0], [], _regions[2], [], _regions[4], [], _regions[6]]

    # --- Region index 1: CDRL1, insertions on 27, deletions taken forward from 28 ---
    cdr1 = _regions[1]
    size = len(cdr1)
    extra = max(size - 11, 0)  # eleven positions are available, the rest are insertions
    labels = [(pos, " ") for pos in range(24, 28)][:max(0, size)]
    labels += [(27, alphabet[k]) for k in range(extra)]
    labels += [(pos, " ") for pos in range(28, 35)][max(0, 11 - size):]
    _numbering[1] = relabel(labels, cdr1)

    # --- Region index 3: CDRL2, insertions on 52 ---
    cdr2 = _regions[3]
    size = len(cdr2)
    extra = max(size - 4, 0)
    if extra > 0:
        labels = [(51, " "), (52, " ")]
        labels += [(52, alphabet[k]) for k in range(extra)]
        labels += [(53, " "), (54, " ")]
        _numbering[3] = relabel(labels, cdr2)
    else:
        # How to gap L2 in Chothia/Kabat/Martin is unclear, so the alignment decides.
        _numbering[3] = cdr2

    # --- Region index 5: CDRL3, insertions on 95 ---
    cdr3 = _regions[5]
    size = len(cdr3)
    if size > 35:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    labels = get_cdr3_annotations(size, scheme="kabat", chain_type="light")
    _numbering[5] = relabel(labels, cdr3)

    return gap_missing(_numbering), startindex, endindex
|
1196 |
+
|
1197 |
+
|
1198 |
+
|
1199 |
+
|
1200 |
+
#############################
|
1201 |
+
# Martin (extended Chothia) #
|
1202 |
+
#############################
|
1203 |
+
|
1204 |
+
# Heavy chains
|
1205 |
+
def number_martin_heavy(state_vector, sequence):
    """
    Apply the Martin (extended Chothia) numbering scheme for heavy chains.

    The aligned states are split into eight regions by ``_number_regions`` using
    the state string and region string defined below. Regions with index 1, 3 and 7
    keep the numbering produced from the alignment. The others are handled here:

        index 0 - if the HMM reported insertions, they are all placed on position 8
        index 2 - CDRH1 region, positions 23-33, insertions placed on 31
        index 4 - CDRH2 region, positions 50-57, insertions placed on 52
        index 5 - FW3, positions 58-92, insertions placed on 72; when there are no
                  insertions the alignment's numbering (and gaps) is kept
        index 6 - CDRH3 region, labels supplied by ``get_cdr3_annotations``

    Returns a tuple ``(numbering, startindex, endindex)`` where ``numbering`` is the
    output of ``gap_missing`` and the two indices are those reported by
    ``_number_regions``. If the CDRH3 region holds more than 36 residues the
    numbering element is an empty list.
    """

    # State string - 'X' means the imgt position exists in the scheme.
    # 'I' means that it should be treated as an insertion of the previous number.
    state_string = 'XXXXXXXXXIXXXXXXXXXXXXXXXXXXXXIIIIXXXXXXXXXXXXXXXXXXXXXXXIXIIXXXXXXXXXXXIXXXXXXXXIIIXXXXXXXXXXXXXXXXXXXXXXXXXXXXIIIXXXXXXXXXXXXX'

    # Region string - regions that should be treated separately in putting the numbering together.
    region_string = '11111111112222222222222333333333333333444444444444444455555555555666666666666666666666666666666666666666777777777777788888888888'

    region_index_dict = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4, "6": 5, "7": 6, "8": 7}

    # Define how the scheme's numbering differs from IMGT at the start of each region.
    rels = {0: 0,
            1: -1,
            2: -1,
            3: -5,
            4: -5,
            5: -8,
            6: -12,
            7: -15}

    n_regions = 8

    # Regions in which deletions are not placed by the alignment step.
    exclude_deletions = [2, 4, 5, 6]

    _regions, startindex, endindex = _number_regions(
        sequence, state_vector, state_string, region_string,
        region_index_dict, rels, n_regions, exclude_deletions)

    ###############
    # Renumbering #
    ###############

    # Slots 0, 2, 4 and 6 are filled in below; slot 5 (FW3) starts out as the alignment's
    # numbering and is replaced only when FW3 carries insertions.
    _numbering = [[], _regions[1], [], _regions[3], [], _regions[5], [], _regions[7]]

    # Region 1 (index 0)
    # Insertions are placed at position 8.
    # Count how many were recognised as insertions by the HMM.
    insertions = len([1 for _ in _regions[0] if _[0][1] != " "])
    if insertions:
        # The starting number as found by the HMM (could easily start from 2 for example).
        # The original author flagged this as a possible source of bugs in very unusual cases.
        start = _regions[0][0][0][0]
        length = len(_regions[0])
        annotations = ([(_, " ") for _ in range(start, 9)]
                       + [(8, alphabet[_]) for _ in range(insertions)]
                       + [(9, " ")])
        _numbering[0] = [(annotations[i], _regions[0][i][1]) for i in range(length)]
    else:
        _numbering[0] = _regions[0]

    # CDR1 - region 3 (index 2): put insertions onto 31.
    # Pulled back to the cysteine as heavily engineered CDR1s are not playing nicely.
    length = len(_regions[2])
    insertions = max(length - 11, 0)
    if insertions:
        annotations = ([(_, " ") for _ in range(23, 32)]
                       + [(31, alphabet[i]) for i in range(insertions)]
                       + [(32, " "), (33, " ")])
    else:
        annotations = [(_, " ") for _ in range(23, 32)][:length - 2] + [(32, " "), (33, " ")][:length]
    _numbering[2] = [(annotations[i], _regions[2][i][1]) for i in range(length)]

    # CDR2 - region 5 (index 4): positions 50 to 57 inclusive, insertions onto 52.
    length = len(_regions[4])
    insertions = max(length - 8, 0)  # Eight positions can be accounted for, the remainder are insertions
    # Delete in the order 52, 51, 50, 53, 54, 55, 56, 57
    annotations = [(50, " "), (51, " "), (52, " ")][:max(0, length - 5)]
    annotations += [(52, alphabet[i]) for i in range(insertions)]
    annotations += [(53, " "), (54, " "), (55, " "), (56, " "), (57, " ")][abs(min(0, length - 5)):]
    _numbering[4] = [(annotations[i], _regions[4][i][1]) for i in range(length)]

    # FW3 - region 6 (index 5)
    # Place all insertions on 72 explicitly.
    # This is in contrast to the Chothia implementation where 3 insertions are on 82 and then
    # further insertions are placed by the alignment.
    # Gaps are placed according to the alignment.
    length = len(_regions[5])
    insertions = max(length - 35, 0)
    if insertions > 0:  # Insertions on 72
        annotations = ([(i, ' ') for i in range(58, 73)]
                       + [(72, alphabet[i]) for i in range(insertions)]
                       + [(i, ' ') for i in range(73, 93)])
        _numbering[5] = [(annotations[i], _regions[5][i][1]) for i in range(length)]
    else:  # Deletions - allow the alignment to place them.
        # FW3 is index 5 in the heavy chain. Writing to index 4 here would discard the
        # CDR2 renumbering computed above.
        _numbering[5] = _regions[5]

    # CDR3 - region 7 (index 6): put insertions onto 100.
    length = len(_regions[6])
    if length > 36:
        # Too many insertions. Do not apply numbering.
        return [], startindex, endindex
    annotations = get_cdr3_annotations(length, scheme="chothia", chain_type="heavy")
    _numbering[6] = [(annotations[i], _regions[6][i][1]) for i in range(length)]

    # Return the full vector and the start and end indices of the numbered region of the sequence
    return gap_missing(_numbering), startindex, endindex
|
1330 |
+
|
1331 |
+
# Light chains
|
1332 |
+
def number_martin_light(state_vector, sequence):
    """
    Apply the Martin numbering scheme for light chains.

    The Martin and Chothia specifications for light chains are very similar. Martin is more
    explicit about the location of indels but, unlike for the heavy chain, these are additions
    to the Chothia scheme rather than changes to it. Chothia light is therefore implemented as
    Martin light, and this function simply delegates to number_chothia_light.

    Layout of the 128 HMM states as documented for this scheme ("X" is treated as a direct
    match, "I" as an insertion):

    XXXXXXXXXXXXXXXXXXXXXXXXXXXXX IIIIIIX XXXXXXXXXXXXXXXXXXXX XIIIIIIIXXX XXXXXIXXXXXXXIIXXXXXXXXXXXXXXXXXXXXXX XXXXXIIIIXX XXXXXXXXXXXXX
    11111111111111111111111111111 2222222 33333333333333333333 44444444444 5555555555555555555555555555555555555 66666666666 7777777777777

    Regions - (N.B These do not match up with any particular definition of CDR)
     1 - Simple mapping (treat "I" states as inserts and not own match states)
     2 - CDRL1 - 30 (inc) to 31 (exc) put insertions on 30
     3 - Simple mapping (treat "I" states as inserts and not own match states)
     4 - CDRL2 - 51 (inc) 55 (exc) put insertions on 52
     5 - Simple mapping (treat "I" states as inserts and not own match states)
     6 - CDRL3 89 (inc) to 96 (exc) put insertion on 95
     7 - Simple mapping (treat "I" states as inserts and not own match states)

    The actual renumbering is performed by number_chothia_light; see that function for which
    regions are rewritten.

    @param state_vector: forwarded unchanged to number_chothia_light
    @param sequence: forwarded unchanged to number_chothia_light
    @return: whatever number_chothia_light returns for the same arguments
    """
    chothia_light_result = number_chothia_light(state_vector, sequence)
    return chothia_light_result
|
1360 |
+
|
1361 |
+
|
1362 |
+
###########
|
1363 |
+
# Wolfguy #
|
1364 |
+
###########
|
1365 |
+
# The Wolfguy numbering scheme is an in-house scheme used at Roche. It has been described publicly in the paper:
|
1366 |
+
# Prediction of VH-VL domain orientation for antibody variable domain modeling. Bujotzek A. et al. Proteins 2015 83(4) 681-95
|
1367 |
+
#
|
1368 |
+
# Its gapping is similar to that of IMGT, and it is defined only for heavy and light antibody chains.
|
1369 |
+
# Unlike other schemes the numbering denotes both the chain (heavy 101-499, light 501-799) and the region (less than -50 framework
|
1370 |
+
# greater than -50 CDR). All CDRs of length less than 50 can be handled without the need for insertion codes. Numbering of the
|
1371 |
+
# framework behaves similarly to IMGT in that all positions are assumed to be accounted for. Framework insertions are placed by
|
1372 |
+
# the alignment.
|
1373 |
+
#
|
1374 |
+
# Numbering of all CDRs is performed symmetrically with the exception of CDRL1. In this case the CDR is numbered according to a
|
1375 |
+
# pattern specific to the canonical class. This is recognised by length and by sequence similarity to a consensus sequence. If a
|
1376 |
+
# length has not been observed it is numbered symmetrically.
|
1377 |
+
|
1378 |
+
|
1379 |
+
def number_wolfguy_heavy(state_vector, sequence):
    """
    Apply the wolfguy numbering scheme for heavy chains

    The scheme numbers the sequence using different segments so that the numbering tells you
    where in the antibody the sequence is describing.

    XXXXXXXXXIXXXXXXXXXXXXXXXX XXXXXXXXXXXXXX XXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXIX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXX
    11111111111111111111111111 22222222222222 33333333333333 44444444444444444444 555555555555555555555555555555 6666666666666 77777777777

    Regions - (N.B These do not match up with any particular definition of CDR)
     1 - Simple mapping (treat "I" states as inserts and not own match states)
     2 - CDRH1 - positions taken from 151-199 (inc). Gapped symmetrically about 175-176.
     3 - Simple mapping (treat "I" states as inserts and not own match states)
     4 - CDRH2 - positions taken from 251-299 (inc). Gapped about 271 first, 291-299 are gapped last.
     5 - Simple mapping (treat "I" states as inserts and not own match states)
     6 - CDRH3 - positions 331, 332 and 351-399 (inc). Gapped according to the explicit
         priority list built below.
     7 - Simple mapping (treat "I" states as inserts and not own match states)

    For each renumbered region a priority list of positions is built. The first `length`
    positions of that list are used (in sorted order), so the positions at the END of the
    list are the first ones to be left out when the region is short.

    @param state_vector: alignment state vector, forwarded to _number_regions
    @param sequence: the sequence being numbered, forwarded to _number_regions
    @return: (numbering, startindex, endindex). numbering is a flat list of
             ((position, insertion_code), residue) entries, or an empty list when a
             renumbered region is longer than the scheme can number.
    """
    # Set up the numbering

    # State string - 'X' means the imgt position exists in the scheme. 'I' means that it should be treated as an insertion of the previous number
    state_string = 'XXXXXXXXXIXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXIXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

    # Region string - regions that should be treated separately in putting the numbering together
    region_string = '11111111111111111111111111222222222222223333333333333344444444444444444444555555555555555555555555555555666666666666677777777777'

    region_index_dict = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4, "6": 5, "7": 6}

    # Offsets describing how the scheme's numbering differs from IMGT at the start of each
    # region; consumed by _number_regions.
    rels = {0: 100,
            1: 124,
            2: 160,
            3: 196,
            4: 226,
            5: 244,
            6: 283}

    n_regions = 7

    exclude_deletions = [1, 3, 5]

    _regions, startindex, endindex = _number_regions(sequence, state_vector, state_string, region_string, region_index_dict, rels, n_regions, exclude_deletions)

    ###############
    # Renumbering #
    ###############

    # Regions with index 1, 3 and 5 (the CDRs) are renumbered below; the others are used as given.
    _numbering = [_regions[0], [], _regions[2], [], _regions[4], [], _regions[6]]

    # CDRH1
    # Keep 151 first, then alternate 152,199,153,198,...,175,176 so that short loops are
    # gapped symmetrically about 175-176 (176 is dropped before 175).
    priority = [151]
    for low, high in zip(range(152, 176), range(199, 175, -1)):
        priority += [low, high]
    length = len(_regions[1])
    if length > len(priority):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(priority[:length])
    _numbering[1] = [((position, " "), entry[1]) for position, entry in zip(annotations, _regions[1])]

    # CDRH2
    # 299..291 are kept first, then 251, then alternating 252,290,...,270,272 and finally 271.
    # Short loops therefore lose 271 first, then positions either side of it, and 291-299 last.
    priority = [251]
    for low, high in zip(range(252, 271), range(290, 271, -1)):
        priority += [low, high]
    priority.append(271)
    priority = list(range(299, 290, -1)) + priority
    length = len(_regions[3])
    if length > len(priority):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(priority[:length])
    _numbering[3] = [((position, " "), entry[1]) for position, entry in zip(annotations, _regions[3])]

    # CDRH3
    # The bulk alternates 356,391,...,373,374 (gapped symmetrically about 373-374).
    # Two explicit prefixes change the pattern for short loops.
    priority = []
    for low, high in zip(range(356, 374), range(391, 373, -1)):
        priority += [low, high]
    priority = [354, 394, 355, 393, 392] + priority
    priority = [331, 332] + [399, 398, 351, 352, 397, 353, 396, 395] + priority
    length = len(_regions[5])

    if length > len(priority):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(priority[:length])
    _numbering[5] = [((position, " "), entry[1]) for position, entry in zip(annotations, _regions[5])]

    # Return the full vector and the start and end indices of the numbered region of the sequence
    return [entry for region in _numbering for entry in region], startindex, endindex
|
1469 |
+
|
1470 |
+
|
1471 |
+
def number_wolfguy_light(state_vector, sequence):
    """
    Apply the wolfguy numbering scheme for light chains

    The scheme numbers the sequence using different segments so that the numbering tells you
    where in the antibody the sequence is describing.

    XXXXXXX XXX XXXXXXXXXXXXX XXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXX XXXXXXXXXXXXXX XXXIXXXXXXX XXXX XXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXX XXXXXXXXXXX
    1111111 AAA BBBBBBBBBBBBB 22222222222222222 333333333333333 44444444444444 55555555555 6666 77777777777777777777 8888888888888 99999999999

    Regions - (N.B These do not match up with any particular definition of CDR)
     1 - Simple mapping (treat "I" states as inserts and not own match states)
     A - Move indels onto 508
     B - Simple mapping (treat "I" states as inserts and not own match states)
     2 - CDRL1 - 551-599 (inc). Assign via the matching consensus sequence and length.
     3 - Simple mapping (treat "I" states as inserts and not own match states)
     4 - CDRL2 - 651-699 (inc). Gap about 673, then 695-699, 651 is gapped last.
     5 - Simple mapping (treat "I" states as inserts and not own match states)
     6 - Positions 711-714; deletions come off the 714 end, insertions are placed on 714
     7 - Simple mapping (treat "I" states as inserts and not own match states)
     8 - CDRL3 751-799 (inc). Gap symmetrically about 775
     9 - Simple mapping (treat "I" states as inserts and not own match states)

    For the CDR regions a priority list of positions is built. The first `length` positions
    of that list are used (in sorted order), so positions at the END of the list are the
    first ones to be left out when the region is short.

    @param state_vector: alignment state vector, forwarded to _number_regions
    @param sequence: the sequence being numbered, forwarded to _number_regions
    @return: (numbering, startindex, endindex). numbering is a flat list of
             ((position, insertion_code), residue) entries, or an empty list when a
             renumbered region is longer than the scheme can number.
    """
    # Set up the numbering

    # State string - 'X' means the imgt position exists in the scheme. 'I' means that it should be treated as an insertion of the previous number
    state_string = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXIXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

    # Region string - regions that should be treated separately in putting the numbering together
    region_string = '1111111AAABBBBBBBBBBBBB222222222222222223333333333333334444444444444455555555555666677777777777777777777888888888888899999999999'

    region_index_dict = {"1": 0, "A": 1, "B": 2, "2": 3, "3": 4, "4": 5, "5": 6, "6": 7, "7": 8, "8": 9, "9": 10}

    # Offsets describing how the scheme's numbering differs from IMGT at the start of each
    # region; consumed by _number_regions.
    rels = {0: 500,
            1: 500,
            2: 500,
            3: 527,
            4: 560,
            5: 595,
            6: 631,
            7: 630,
            8: 630,
            9: 646,
            10: 683}

    n_regions = 11

    exclude_deletions = [1, 3, 5, 7, 9]

    _regions, startindex, endindex = _number_regions(sequence, state_vector, state_string, region_string, region_index_dict, rels, n_regions, exclude_deletions)

    ###############
    # Renumbering #
    ###############

    # Regions with index 1, 3, 5, 7 and 9 are renumbered below; the others are used as given.
    _numbering = [_regions[0], [], _regions[2], [], _regions[4], [], _regions[6], [], _regions[8], [], _regions[10]]

    # Region A: gaps in the first section go on 508 instead of the imgt 510 equivalent.
    # 510 is kept first, then 509, then 508; anything beyond three residues is an insertion on 508.
    length = len(_regions[1])
    annotations = sorted([(510, ' '), (509, ' '), (508, ' ')][:length] + [(508, a) for a in alphabet[:max(0, length - 3)]])
    if length > len(annotations):
        return [], startindex, endindex  # Ran out of insertion codes. Do not apply numbering.
    _numbering[1] = [(annotation, entry[1]) for annotation, entry in zip(annotations, _regions[1])]

    # CDRL1
    # Number by predicting the canonical form (see _get_wolfguy_L1)
    length = len(_regions[3])
    annotations = _get_wolfguy_L1(_regions[3], length)
    if length > len(annotations):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    _numbering[3] = [((position, " "), entry[1]) for position, entry in zip(annotations, _regions[3])]

    # CDRL2
    # Delete about 673. Finally delete right from 694. Maintain 651 as the last deletion
    priority = []
    for low, high in zip(range(652, 673), range(694, 672, -1)):
        priority += [high, low]
    priority = [651] + list(range(699, 694, -1)) + priority + [673]

    length = len(_regions[5])
    if length > len(priority):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(priority[:length])
    _numbering[5] = [((position, " "), entry[1]) for position, entry in zip(annotations, _regions[5])]

    # Region 6: the placement of the indel in wolfguy is different to that in imgt.
    # Up to four residues take 711-714 in order; further residues are insertions on 714.
    length = len(_regions[7])
    insertions = max(0, length - 4)
    annotations = [(711, ' '), (712, ' '), (713, ' '), (714, ' ')][:length] + [(714, a) for a in alphabet[:insertions]]
    if length > len(annotations):
        return [], startindex, endindex  # Ran out of insertion codes. Do not apply numbering.
    _numbering[7] = [(annotation, entry[1]) for annotation, entry in zip(annotations, _regions[7])]

    # CDRL3
    # Alternate 751,799,752,798,...,774,776 and finish with 775, so short loops are gapped
    # symmetrically about 775 (775 is dropped first).
    priority = []
    for low, high in zip(range(751, 775), range(799, 775, -1)):
        priority += [low, high]
    priority.append(775)

    length = len(_regions[9])
    if length > len(priority):
        return [], startindex, endindex  # Too many insertions. Do not apply numbering.
    annotations = sorted(priority[:length])
    _numbering[9] = [((position, " "), entry[1]) for position, entry in zip(annotations, _regions[9])]

    # Return the full vector and the start and end indices of the numbered region of the sequence
    return [entry for region in _numbering for entry in region], startindex, endindex
|
1574 |
+
|
1575 |
+
|
1576 |
+
def _get_wolfguy_L1(seq, length):
|
1577 |
+
"""
|
1578 |
+
Wolfguy's L1 annotation is based on recognising the length and the sequence pattern defined
|
1579 |
+
by a set of rules. If the length has not been characterised, we number symmetrically about the
|
1580 |
+
middle of the loop.
|
1581 |
+
"""
|
1582 |
+
|
1583 |
+
# These are the annotations for different lengths of L1 according to the wolfguy definitions.
|
1584 |
+
L1_sequences = {
|
1585 |
+
9: [['9', 'XXXXXXXXX', [551, 552, 554, 556, 563, 572, 597, 598, 599]]],
|
1586 |
+
10: [['10', 'XXXXXXXXXX', [551, 552, 553, 556, 561, 562, 571, 597, 598, 599]]],
|
1587 |
+
11: [['11a', 'RASQDISSYLA', [551, 552, 553, 556, 561, 562, 571, 596, 597, 598, 599]],
|
1588 |
+
['11b', 'GGNNIGSKSVH', [551, 552, 554, 556, 561, 562, 571, 572, 597, 598, 599]],
|
1589 |
+
['11b.2','SGDQLPKKYAY', [551, 552, 554, 556, 561, 562, 571, 572, 597, 598, 599]]],
|
1590 |
+
12: [['12a', 'TLSSQHSTYTIE', [551, 552, 553, 554, 555, 556, 561, 563, 572, 597, 598, 599]],
|
1591 |
+
['12b', 'TASSSVSSSYLH', [551, 552, 553, 556, 561, 562, 571, 595, 596, 597, 598, 599]],
|
1592 |
+
['12c', 'RASQSVxNNYLA', [551, 552, 553, 556, 561, 562, 571, 581, 596, 597, 598, 599]],
|
1593 |
+
['12d', 'rSShSIrSrrVh', [551, 552, 553, 556, 561, 562, 571, 581, 596, 597, 598, 599]]],
|
1594 |
+
13: [['13a', 'SGSSSNIGNNYVS', [551, 552, 554, 555, 556, 557, 561, 562, 571, 572, 597, 598, 599]],
|
1595 |
+
['13b', 'TRSSGSLANYYVQ', [551, 552, 553, 554, 556, 561, 562, 563, 571, 572, 597, 598, 599]]],
|
1596 |
+
14: [['14a', 'RSSTGAVTTSNYAN', [551, 552, 553, 554, 555, 561, 562, 563, 564, 571, 572, 597, 598, 599]],
|
1597 |
+
['14b', 'TGTSSDVGGYNYVS', [551, 552, 554, 555, 556, 557, 561, 562, 571, 572, 596, 597, 598, 599]]],
|
1598 |
+
15: [['15', 'XXXXXXXXXXXXXXX', [551, 552, 553, 556, 561, 562, 563, 581, 582, 594, 595, 596, 597, 598, 599]]],
|
1599 |
+
16: [['16', 'XXXXXXXXXXXXXXXX', [551, 552, 553, 556, 561, 562, 563, 581, 582, 583, 594, 595, 596, 597, 598, 599]]],
|
1600 |
+
17: [['17', 'XXXXXXXXXXXXXXXXX', [551, 552, 553, 556, 561, 562, 563, 581, 582, 583, 584, 594, 595, 596, 597, 598, 599]]]
|
1601 |
+
}
|
1602 |
+
|
1603 |
+
if length in L1_sequences: # Use the pre-defined motif
|
1604 |
+
# Find the maximum scoring canonical form for this length.
|
1605 |
+
curr_max = None, -10000
|
1606 |
+
for canonical in L1_sequences[length]:
|
1607 |
+
sub_score = 0
|
1608 |
+
for i in range( length ):
|
1609 |
+
try:
|
1610 |
+
sub_score += blosum62[ (seq[i][1].upper(), canonical[1][i].upper() ) ]
|
1611 |
+
except KeyError:
|
1612 |
+
sub_score += blosum62[ (canonical[1][i].upper(), seq[i][1].upper() ) ]
|
1613 |
+
if sub_score > curr_max[1]:
|
1614 |
+
curr_max = canonical, sub_score
|
1615 |
+
|
1616 |
+
# return the annotations
|
1617 |
+
return curr_max[0][2]
|
1618 |
+
else: # Use a symmetric numbering about the anchors.
|
1619 |
+
ordered_deletions = []
|
1620 |
+
for p1,p2 in zip( list(range(551,575)), list(range(599, 575,-1))): ordered_deletions += [ p2,p1 ]
|
1621 |
+
ordered_deletions.append(575)
|
1622 |
+
return sorted( ordered_deletions[:length] )
|
1623 |
+
|
1624 |
+
def gap_missing( numbering ):
    '''
    Flatten the per-region numbering and place gaps where a number is missing.

    Positions are expected to start at 1 and be continuous; for every integer position that
    is skipped an entry ((position, ' '), '-') is inserted. All schemes except wolfguy are
    continuously numbered.

    @param numbering: a list of regions, each a list of ((position, insertion_code), residue)
    @return: a single flat list of entries with the gap entries filled in
    '''
    filled = []
    previous = 0  # number of the last position emitted; counting starts before position 1
    for region in numbering:
        for position, residue in region:
            # Emit a gap for every number skipped between the previous entry and this one
            for missing in range(previous + 1, position[0]):
                filled.append(((missing, ' '), '-'))
            filled.append((position, residue))
            previous = position[0]
    return filled
|
1636 |
+
|
1637 |
+
|
1638 |
+
######################
|
1639 |
+
# Annotation of CDR3 #
|
1640 |
+
######################
|
1641 |
+
|
1642 |
+
def get_cdr3_annotations(length, scheme="imgt", chain_type=""):
    """
    Given a length of a cdr3 give back a list of the annotations that should be applied to the sequence.

    This function should be deprecated

    @param length: number of residues in the CDR3
    @param scheme: "imgt", "chothia" or "kabat"
    @param chain_type: "heavy" or "light"; only consulted for chothia and kabat
    @return: a list of (position, insertion_code) tuples in sequence order. For imgt the list
             always has at least 13 entries; when length is below 13 the unoccupied middle
             entries are None.
    @raise AssertionError: if there are more insertions than the scheme can label, or if the
             scheme / chain_type combination is not implemented. The checks are raised
             explicitly (not via assert statements) so they still run under "python -O".
    """
    az = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    za = "ZYXWVUTSRQPONMLKJIHGFEDCBA"
    too_many = "Too many insertions for numbering scheme to handle"

    if scheme == "imgt":
        start, end = 105, 118  # start (inclusive) end (exclusive)
        if (length - 13) >= 50:  # We ran out of letters.
            raise AssertionError(too_many)
        annotations = [None for _ in range(max(length, 13))]
        front = 0
        back = -1
        # Fill the 13 regular positions alternately from the front (105, 106, ...) and
        # from the back (117, 116, ...).
        for i in range(min(length, 13)):
            if i % 2:
                annotations[back] = (end + back, " ")
                back -= 1
            else:
                annotations[front] = (start + front, " ")
                front += 1
        # add insertions onto 111 and 112 in turn
        for i in range(max(0, length - 13)):
            if i % 2:
                # back starts at -7 here, so za is read from its end: 'A', 'B', ...
                annotations[back] = (112, za[back + 6])
                back -= 1
            else:
                annotations[front] = (111, az[front - 7])
                front += 1
        return annotations

    elif scheme in ["chothia", "kabat"] and chain_type == "heavy":  # For chothia and kabat
        # Positions 93-102, insertions on 100
        insertions = max(length - 10, 0)
        if insertions >= 27:  # We ran out of letters.
            raise AssertionError(too_many)
        # Listed from first-deleted to last-deleted; short loops drop the leading entries.
        ordered_deletions = [(100, ' '), (99, ' '), (98, ' '), (97, ' '), (96, ' '), (95, ' '), (101, ' '), (102, ' '), (94, ' '), (93, ' ')]
        annotations = sorted(ordered_deletions[max(0, 10 - length):] + [(100, a) for a in az[:insertions]])
        return annotations

    elif scheme in ["chothia", "kabat"] and chain_type == "light":
        # Positions 89-97, insertions on 95
        insertions = max(length - 9, 0)
        if insertions >= 27:  # We ran out of letters.
            raise AssertionError(too_many)
        # Listed from first-deleted to last-deleted; short loops drop the leading entries.
        ordered_deletions = [(95, ' '), (94, ' '), (93, ' '), (92, ' '), (91, ' '), (96, ' '), (97, ' '), (90, ' '), (89, ' ')]
        annotations = sorted(ordered_deletions[max(0, 9 - length):] + [(95, a) for a in az[:insertions]])
        return annotations

    else:
        raise AssertionError("Unimplemented scheme")
|
1691 |
+
|
app.py
CHANGED
@@ -194,7 +194,7 @@ def main():
|
|
194 |
)
|
195 |
|
196 |
if uploaded_file is None:
|
197 |
-
with st.expander('
|
198 |
with open('./data/examples/7DK2_AB_C.pdb', 'r') as f:
|
199 |
st.download_button(
|
200 |
'RBD + Antibody Complex',
|
|
|
194 |
)
|
195 |
|
196 |
if uploaded_file is None:
|
197 |
+
with st.expander("Don't know what to upload? Try these examples", expanded=True):
|
198 |
with open('./data/examples/7DK2_AB_C.pdb', 'r') as f:
|
199 |
st.download_button(
|
200 |
'RBD + Antibody Complex',
|
requirements.txt
CHANGED
@@ -2,8 +2,6 @@
|
|
2 |
torch
|
3 |
torchvision
|
4 |
biopython==1.79
|
5 |
-
git+https://github.com/oxpig/ANARCI.git
|
6 |
-
git+https://github.com/prihoda/AbNumber.git
|
7 |
joblib
|
8 |
lmdb
|
9 |
tqdm
|
|
|
2 |
torch
|
3 |
torchvision
|
4 |
biopython==1.79
|
|
|
|
|
5 |
joblib
|
6 |
lmdb
|
7 |
tqdm
|