# Page-status text captured together with this file (not code): "Spaces: Runtime error"
import numpy as np
from Bio import pairwise2
from Bio.Align import substitution_matrices
from Bio.PDB import PDBParser, Selection
from Bio.PDB.Polypeptide import three_to_one

from diffab.tools.eval.base import EvalTask


def reslist_rmsd(res_list1, res_list2):
    """
    Return the best C-alpha RMSD between two residue lists that may differ
    in length.

    The shorter list is matched, in order, onto residues of the longer list
    by dynamic programming over squared CA-CA distances. The smallest total
    is divided by the length of the shorter list and square-rooted.
    Coordinates are compared as given: no superposition is performed here.

    Args:
        res_list1: sequence of residue-like objects for which
            ``res['CA'].get_coord()`` yields the CA coordinate.
        res_list2: same kind of sequence; the order of the two arguments
            only decides which list counts as "short" when lengths are equal.

    Returns:
        The minimal RMSD as a NumPy floating-point scalar.

    Raises:
        ValueError: if either list is empty.
    """
    if len(res_list1) < len(res_list2):
        res_short, res_long = res_list1, res_list2
    else:
        res_short, res_long = res_list2, res_list1
    M, N = len(res_short), len(res_long)
    if M == 0:
        raise ValueError('reslist_rmsd requires two non-empty residue lists.')

    # Fetch every CA coordinate once and build the full table of squared
    # distances: sq_dist[i, j] = |CA(res_short[i]) - CA(res_long[j])|^2.
    ca_short = np.array([res['CA'].get_coord() for res in res_short], dtype=np.float64)
    ca_long = np.array([res['CA'].get_coord() for res in res_long], dtype=np.float64)
    delta = ca_short[:, None, :] - ca_long[None, :, :]
    sq_dist = (delta ** 2).sum(axis=-1)

    # SD[i, j]: best summed squared distance for res_short[i:] when
    # res_short[i] is placed at res_long[j] or later. Cells that would not
    # leave enough room for the remaining residues stay at +inf.
    SD = np.full((M, N), np.inf)
    offset = N - M

    # Boundary diagonal: res_short[i:] fills the tail of res_long exactly,
    # so each cell is a suffix sum along the diagonal shifted by `offset`.
    rows = np.arange(M)
    tail = np.diagonal(sq_dist, offset=offset)
    SD[rows, rows + offset] = np.cumsum(tail[::-1])[::-1]

    # NOTE(review): the last row stores the plain distance rather than a
    # running minimum over later columns, so the final two residues of the
    # short list always land on adjacent residues of the long list while
    # earlier residues may skip positions. Kept as-is; confirm it is intended.
    SD[M - 1, :] = sq_dist[M - 1, :]

    for i in range(M - 2, -1, -1):
        for j in range(offset + i - 1, -1, -1):
            SD[i, j] = min(
                sq_dist[i, j] + SD[i + 1, j + 1],  # place res_short[i] at j
                SD[i, j + 1],                      # or start one step later
            )

    min_SD = SD[0, :offset + 1].min()
    best_RMSD = np.sqrt(min_SD / M)
    return best_RMSD


def entity_to_seq(entity):
    """
    Convert the residues of a Biopython entity into a one-letter sequence.

    The entity is unfolded to residue level; every residue whose name
    ``three_to_one`` can translate contributes one letter. Residues for which
    ``three_to_one`` raises ``KeyError`` are skipped.

    Args:
        entity: anything accepted by ``Selection.unfold_entities(entity, 'R')``.

    Returns:
        (seq, mapping): ``seq`` is the one-letter string and ``mapping[k]`` is
        the ``get_id()`` of the residue that produced ``seq[k]``.
    """
    # NOTE(review): three_to_one appears to have been deprecated and later
    # removed in newer Biopython releases - confirm against the pinned version.
    letters = []
    mapping = []
    for res in Selection.unfold_entities(entity, 'R'):
        try:
            letter = three_to_one(res.get_resname())
        except KeyError:
            # Residue name has no one-letter code: leave it out of both lists.
            continue
        letters.append(letter)
        mapping.append(res.get_id())
    return ''.join(letters), mapping


def reslist_seqid(res_list1, res_list2):
    """
    Return the sequence identity (in percent) between two residue lists.

    Both lists are converted to one-letter sequences with ``entity_to_seq``
    and aligned with ``align_sequences``; only the identity value of that
    alignment is returned.
    """
    seq1, _ = entity_to_seq(res_list1)
    seq2, _ = entity_to_seq(res_list2)
    _, seq_id = align_sequences(seq1, seq2)
    return seq_id


def align_sequences(sequence_A, sequence_B, **kwargs):
    """
    Performs a global pairwise alignment between two sequences
    using the BLOSUM62 matrix and the Needleman-Wunsch algorithm
    as implemented in Biopython. Returns the aligned sequences and
    their sequence identity.

    Keyword Args:
        matrix: substitution matrix to use instead of BLOSUM62.
        gap_open: gap opening score (default -10.0).
        gap_extend: gap extension score (default -0.5).

    Returns:
        ((aligned_A, aligned_B), seq_id): the two aligned strings of the first
        alignment returned by Biopython, and the percentage of alignment
        columns in which both strings carry the same character. Columns
        containing a gap count towards the total.

    Raises:
        ValueError: if Biopython returns no alignment (presumably the case
            when one of the sequences is empty).
    """
    # Only load the default matrix when the caller did not provide one.
    if 'matrix' in kwargs:
        matrix = kwargs['matrix']
    else:
        matrix = substitution_matrices.load("BLOSUM62")
    gap_open = kwargs.get('gap_open', -10.0)
    gap_extend = kwargs.get('gap_extend', -0.5)

    alns = pairwise2.align.globalds(
        sequence_A, sequence_B,
        matrix, gap_open, gap_extend,
        penalize_end_gaps=(False, False),
    )
    if not alns:
        raise ValueError(
            'No alignment could be produced for the given sequences '
            '(lengths %d and %d).' % (len(sequence_A), len(sequence_B))
        )

    # Each alignment is (aligned_A, aligned_B, score, begin, end); only the
    # two aligned strings of the first one are needed here.
    aligned_A, aligned_B = alns[0][:2]

    # Calculate sequence identity over all alignment columns.
    matches = sum(a == b for a, b in zip(aligned_A, aligned_B))
    seq_id = (100 * matches) / len(aligned_A)
    return (aligned_A, aligned_B), seq_id


def extract_reslist(model, residue_first, residue_last):
    """
    Collect the residues of one chain whose position lies in a closed range.

    Args:
        model: object indexable by chain id (``model[chain_id]``) whose chain
            can be unfolded to residues by ``Selection.unfold_entities``.
        residue_first: sequence whose first item is the chain id; the
            remaining items form the lower bound that is compared against
            ``(res.id[1], res.id[2])``.
        residue_last: same layout, giving the upper bound (inclusive).

    Returns:
        List of residues of that chain, in unfolding order, whose
        ``(res.id[1], res.id[2])`` lies between the two bounds.

    Raises:
        ValueError: if the two endpoints name different chains.
    """
    residue_first, residue_last = tuple(residue_first), tuple(residue_last)
    if residue_first[0] != residue_last[0]:
        # Raised explicitly so the check also runs under ``python -O``.
        raise ValueError(
            'residue_first and residue_last must be on the same chain, got '
            '%r and %r.' % (residue_first[0], residue_last[0])
        )

    chain_id = residue_first[0]
    pos_first, pos_last = residue_first[1:], residue_last[1:]
    chain = model[chain_id]
    # Tuple comparison orders positions by res.id[1] first, then res.id[2].
    return [
        res
        for res in Selection.unfold_entities(chain, 'R')
        if pos_first <= (res.id[1], res.id[2]) <= pos_last
    ]


def eval_similarity(task: EvalTask):
    """
    Score the generated structure of a task against its reference.

    The residue range ``task.residue_first`` .. ``task.residue_last`` is
    extracted from both the generated and the reference Biopython model, and
    two entries are written into ``task.scores``:

    * ``'rmsd'``: result of ``reslist_rmsd`` on the two residue lists.
    * ``'seqid'``: result of ``reslist_seqid`` on the two residue lists.

    Returns:
        The same ``task`` object, with ``task.scores`` updated in place.
    """
    model_gen = task.get_gen_biopython_model()
    model_ref = task.get_ref_biopython_model()

    first, last = task.residue_first, task.residue_last
    reslist_gen = extract_reslist(model_gen, first, last)
    reslist_ref = extract_reslist(model_ref, first, last)

    task.scores.update({
        'rmsd': reslist_rmsd(reslist_gen, reslist_ref),
        'seqid': reslist_seqid(reslist_gen, reslist_ref),
    })
    return task