Spaces:
Runtime error
Runtime error
File size: 5,594 Bytes
6d34920 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import sys
from typing import List, Tuple
import re
import numpy as np
from abnumber.exceptions import ChainParseError
try:
from anarci.anarci import anarci
except ImportError:
# Only print the error without failing - required to import
print('ANARCI module not available. Please install it separately or install AbNumber through Bioconda')
print('See: https://abnumber.readthedocs.io/')
sys.exit(1)
POS_REGEX = re.compile(r'([HL]?)(\d+)([A-Z]?)')
WHITESPACE = re.compile(r'\s+')
def _validate_chain_type(chain_type):
assert chain_type in ['H', 'L', 'K'], \
f'Invalid chain type "{chain_type}", it should be "H" (heavy), "L" (lambda light chian) or "K" (kappa light chain)'
def _anarci_align(sequence, scheme, allowed_species, assign_germline=False) -> List[Tuple]:
from abnumber.position import Position
sequence = re.sub(WHITESPACE, '', sequence)
all_numbered, all_ali, all_hits = anarci(
[('id', sequence)],
scheme=scheme,
allowed_species=allowed_species,
assign_germline=assign_germline
)
seq_numbered = all_numbered[0]
seq_ali = all_ali[0]
if seq_numbered is None:
raise ChainParseError(f'Variable chain sequence not recognized: "{sequence}"')
assert len(seq_numbered) == len(seq_ali), 'Unexpected ANARCI output'
results = []
for (positions, start, end), ali in zip(seq_numbered, seq_ali):
chain_type = ali['chain_type']
species = ali['species']
v_gene = ali['germlines']['v_gene'][0][1] if assign_germline else None
j_gene = ali['germlines']['j_gene'][0][1] if assign_germline else None
aa_dict = {Position(chain_type=chain_type, number=num, letter=letter, scheme=scheme): aa
for (num, letter), aa in positions if aa != '-'}
tail = sequence[end+1:]
results.append((aa_dict, chain_type, tail, species, v_gene, j_gene))
return results
def _get_unique_chains(chains):
seqs = set()
chains_filtered = []
for chain in chains:
if chain.seq in seqs:
continue
seqs.add(chain.seq)
chains_filtered.append(chain)
return chains_filtered
# Based on positive score in Blosum62
SIMILAR_PAIRS = {'AA', 'AS', 'CC', 'DD', 'DE', 'DN', 'ED', 'EE', 'EK', 'EQ', 'FF', 'FW', 'FY', 'GG', 'HH', 'HN', 'HY',
'II', 'IL', 'IM', 'IV', 'KE', 'KK', 'KQ', 'KR', 'LI', 'LL', 'LM', 'LV', 'MI', 'ML', 'MM', 'MV', 'ND',
'NH', 'NN', 'NS', 'PP', 'QE', 'QK', 'QQ', 'QR', 'RK', 'RQ', 'RR', 'SA', 'SN', 'SS', 'ST', 'TS', 'TT',
'VI', 'VL', 'VM', 'VV', 'WF', 'WW', 'WY', 'YF', 'YH', 'YW', 'YY'}
def is_similar_residue(a, b):
if a == '-' or b == '-':
return a == b
return a+b in SIMILAR_PAIRS
def is_integer(object):
return isinstance(object, int) or isinstance(object, np.integer)
SUPPORTED_SCHEMES = ['imgt', 'aho', 'chothia', 'kabat']
SUPPORTED_CDR_DEFINITIONS = ['imgt', 'chothia', 'kabat', 'north']
SCHEME_BORDERS = {
# Start coordinates
# CDR1, FR2, CDR2, FR3, CDR3, FR4
'imgt': [27, 39, 56, 66, 105, 118, 129],
'kabat_H': [31, 36, 50, 66, 95, 103, 114],
'kabat_K': [24, 35, 50, 57, 89, 98, 108],
'kabat_L': [24, 35, 50, 57, 89, 98, 108],
'chothia_H': [26, 33, 52, 57, 95, 103, 114],
'chothia_K': [24, 35, 50, 57, 89, 98, 108],
'chothia_L': [24, 35, 50, 57, 89, 98, 108],
'north_H': [23, 36, 50, 59, 93, 103, 114],
'north_K': [24, 35, 49, 57, 89, 98, 108],
'north_L': [24, 35, 49, 57, 89, 98, 108],
}
# { scheme -> { region -> list of position numbers } }
SCHEME_REGIONS = {
scheme: {
'FR1': list(range(1, borders[0])),
'CDR1': list(range(borders[0], borders[1])),
'FR2': list(range(borders[1], borders[2])),
'CDR2': list(range(borders[2], borders[3])),
'FR3': list(range(borders[3], borders[4])),
'CDR3': list(range(borders[4], borders[5])),
'FR4': list(range(borders[5], borders[6])),
} for scheme, borders in SCHEME_BORDERS.items()
}
# { scheme -> { position number -> region } }
SCHEME_POSITION_TO_REGION = {
scheme: {pos_num: region for region, positions in regions.items() for pos_num in positions} \
for scheme, regions in SCHEME_REGIONS.items()
}
# { scheme -> set of vernier position numbers }
SCHEME_VERNIER = {
# 'imgt_H': frozenset([2, 52, 53, 54, 76, 78, 80, 82, 87, 118]),
# 'chothia_H': frozenset([2, 47, 48, 49, 67, 69, 71, 73, 78, 93, 94, 103]),
# 'north_H': frozenset([2, 47, 48, 49, 67, 69, 71, 73, 78, 93, 94, 103]),
'kabat_H': frozenset([2, 27, 28, 29, 30, 47, 48, 49, 67, 69, 71, 73, 78, 93, 94, 103]),
# 'imgt_K': frozenset([2, 4, 41, 42, 52, 53, 54, 55, 78, 80, 84, 85, 87, 118]),
# 'imgt_L': frozenset([2, 4, 41, 42, 52, 53, 54, 55, 78, 80, 84, 85, 87, 118]),
# 'chothia_K': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
# 'chothia_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
# 'north_K': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
# 'north_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
'kabat_K': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
'kabat_L': frozenset([2, 4, 35, 36, 46, 47, 48, 49, 64, 66, 68, 69, 71, 98]),
}
#'kabat_H': 31-35, 50-65, 95-102
#'kabat_K': 24-34, 50-56, 89-97
|