import numpy as np
from config import DATA_DIR
[docs]
def _align_chain(chain_entries, reference_positions):
    r"""Returns the one-letter residues of a chain laid out on a reference list of positions, with '-' for gaps."""
    if reference_positions is None:
        # No reference numbering available: emit the chain as it is, without gaps.
        return [entry[0] for entry in chain_entries]
    residue_at = {}
    for entry in chain_entries:
        # Key by residue label so the alignment does not depend on ordering;
        # the first occurrence wins if a label is repeated.
        residue_at.setdefault(entry[2:].strip(), entry[0])
    # ``dict.fromkeys`` forces unique reference positions while preserving order.
    return [residue_at.get(position, '-') for position in dict.fromkeys(reference_positions)]


def get_sequence(list_of_residues, max_res_list_h=None, max_res_list_l=None):
    r"""Returns an amino acid sequence from an ANTIPASTI list of residues. It contains gaps for the antibody.

    The result has the form ``<heavy>:<light>:<antigen>``. Each antibody entry is read as
    ``<amino acid><chain id><residue label>`` (first character, second character, remainder).
    The heavy chain is the chain of the first entry after 'START-Ab'; every other antibody
    entry is treated as light chain. For entries after 'END-Ab' only the first character is used.

    Parameters
    ----------
    list_of_residues: list
        Residues numbered according to the Chothia scheme with presence of 'START-Ab' and 'END-Ab' labels.
    max_res_list_h: list
        Heavy chain residues of all data. If ``None``, the heavy chain is returned without gaps.
    max_res_list_l: list
        Light chain residues of all data. If ``None``, the light chain is returned without gaps.

    Returns
    -------
    str
        Heavy chain, light chain and antigen residues separated by ':'; positions of the reference
        lists that are absent from this entry are written as '-'.

    Raises
    ------
    ValueError
        If 'END-Ab' is not present in ``list_of_residues``.
    """
    end_idx = list_of_residues.index('END-Ab')
    # Only look between the 'START-Ab' and 'END-Ab' markers so that neither the
    # markers themselves nor antigen entries can be mistaken for heavy chain residues.
    antibody_entries = list_of_residues[1:end_idx]
    antigen_entries = list_of_residues[end_idx+1:]

    h_chain = antibody_entries[0][1] if antibody_entries else None
    heavy_entries = [entry for entry in antibody_entries if entry[1] == h_chain]
    light_entries = [entry for entry in antibody_entries if entry[1] != h_chain]

    sequence = _align_chain(heavy_entries, max_res_list_h)
    sequence += [':']  # Separating chains
    if light_entries:
        # Without a light chain the segment is left empty (no gap padding).
        sequence += _align_chain(light_entries, max_res_list_l)
    # NOTE(review): the second separator is emitted unconditionally — confirm against upstream.
    sequence += [':']  # Separating chains
    sequence += [entry[0] for entry in antigen_entries]

    return ''.join(map(str, sequence))
[docs]
def antibody_sequence_identity(seq1, seq2):
    r"""Computes the sequence identity of two antibodies as a fraction between 0 and 1.

    Everything from the last ':' onwards is discarded from both inputs before comparing.
    The remaining strings are compared position by position (up to the length of the shorter one).
    Positions where both strings hold a gap ('-') or a separator (':') are ignored.

    Parameters
    ----------
    seq1: str
        First sequence.
    seq2: str
        Second sequence.

    Returns
    -------
    float
        Number of positions holding the same residue divided by the number of compared positions.
        ``0.0`` when there is no position to compare.
    """
    seq1 = seq1[:seq1.rfind(':')]
    seq2 = seq2[:seq2.rfind(':')]

    ignored = ('-', ':')
    matching = 0
    total = 0
    for ch1, ch2 in zip(seq1, seq2):
        if ch1 in ignored and ch2 in ignored:
            continue
        total += 1
        # Equal characters here are necessarily residues: the pair would have
        # been skipped above if both were gaps or separators.
        if ch1 == ch2:
            matching += 1

    # Avoid dividing by zero when neither sequence has a residue to compare.
    return matching / total if total else 0.0
[docs]
def antigen_identity(seq1, seq2):
    r"""Tests whether two antibodies are bound to the same antigen.

    The antigen part of each sequence is whatever follows its last ':'.

    Parameters
    ----------
    seq1: str
        First sequence.
    seq2: str
        Second sequence.

    Returns
    -------
    bool
        ``True`` if both sequences end with the same antigen part.
    """
    first_antigen = seq1.rsplit(':', 1)[1]
    second_antigen = seq2.rsplit(':', 1)[1]
    return first_antigen == second_antigen
[docs]
def check_train_test_identity(training_set_ids, test_set_ids, max_res_list_h=None, max_res_list_l=None, threshold=0.9, residues_path=DATA_DIR+'lists_of_residues/', verbose=False):
    r"""Tests the sequence identity of the training and test sets.

    Every test element is compared with every training element. The check fails as soon as a pair
    has an antibody sequence identity above ``threshold`` or shares the same antigen sequence.

    Parameters
    ----------
    training_set_ids: list
        Contains the PDB identifiers of the training set elements.
    test_set_ids: list
        Contains the PDB identifiers of the test set elements.
    max_res_list_h: list
        Heavy chain residues of all data.
    max_res_list_l: list
        Light chain residues of all data.
    threshold: float
        Highest accepted sequence identity value.
    residues_path: str
        Path to the folder containing the list of residues per entry.
    verbose: bool
        If ``True``, the first offending train/test pair is printed before returning.

    Returns
    -------
    bool
        ``True`` if all train/test pairs pass the check, ``False`` otherwise.
    """
    def load_sequence(pdb_id):
        residues = list(np.load(residues_path+pdb_id+'.npy'))
        return get_sequence(residues, max_res_list_h=max_res_list_h, max_res_list_l=max_res_list_l)

    # Training sequences are built on first use and reused for the following test
    # elements, instead of being reloaded from disk for every train/test pair.
    training_sequences = {}

    for test_element in test_set_ids:
        test_seq = load_sequence(test_element)
        for training_element in training_set_ids:
            if training_element not in training_sequences:
                training_sequences[training_element] = load_sequence(training_element)
            tr_seq = training_sequences[training_element]

            identity = antibody_sequence_identity(tr_seq, test_seq)
            if identity > threshold:
                if verbose:
                    print(f'{training_element} (train) and {test_element} (test) have a sequence identity of {identity:.2%}')
                return False
            if antigen_identity(tr_seq, test_seq):
                if verbose:
                    print(f'{training_element} (train) and {test_element} (test) share the same antigen sequence')
                return False

    print(f'All train/test pairs passed the similarity check (Identity <= {threshold:.2%})')
    return True
# NOTE: ``build_weights`` below is disabled. The whole definition is wrapped in a
# triple-quoted string literal, so it is never defined or executed at import time.
# For every PDB code it would count how many sequences (itself included) exceed
# ``threshold`` in antibody sequence identity.
'''
def build_weights(pdb_codes, max_res_list_h=None, max_res_list_l=None, threshold=0.9, residues_path=DATA_DIR+'lists_of_residues/'):
r"""Generates a vector that, for each sequence, keeps track of the number of other sequences (including the current one) having a sequence identity higher than a specified threshold.
Parameters
----------
pdb_codes: list
Contains PDB identifiers.
max_res_list_h: list
Heavy chain residues of all data.
max_res_list_l: list
Light chain residues of all data.
threshold: float
Highest accepted sequence identity value.
residues_path: str
Path to the folder containing the list of residues per entry.
"""
weights = np.zeros((len(pdb_codes)))
for i, pdb_code in enumerate(pdb_codes):
main_seq = get_sequence(list(np.load(residues_path+pdb_code+'.npy')), max_res_list_h=max_res_list_h, max_res_list_l=max_res_list_l)
for pdb_code_ in pdb_codes:
other_seq = get_sequence(list(np.load(residues_path+pdb_code_+'.npy')), max_res_list_h=max_res_list_h, max_res_list_l=max_res_list_l)
identity = antibody_sequence_identity(main_seq, other_seq)
if identity > threshold:
weights[i] += 1
return weights
'''
[docs]
def remove_nanobodies(pdb_codes, representations, embedding=None, labels=None, numerical_values=None):
    r"""Returns PDB codes and embeddings without the presence of nanobodies.

    An entry is treated as a nanobody when the bottom-right 40x40 corner of its map contains
    only zeros. When no such entry is found, all inputs are returned unchanged.

    Parameters
    ----------
    pdb_codes: list
        The PDB codes of the antibodies.
    representations: numpy.ndarray
        Normal mode correlation maps (or transformed maps) from which it can be inferred whether a given antibody is a nanobody.
    embedding: numpy.ndarray
        Low-dimensional version of ``representations``.
    labels: list
        Data point labels. ``None`` (the default) is treated as an empty list.
    numerical_values: list
        If data is numerical (e.g., affinity values), it is necessary to include a list here. In this way, values associated to nanobodies can be removed.

    Returns
    -------
    tuple
        ``(pdb_codes, representations, embedding, labels, numerical_values)`` with the nanobody
        entries removed. Filtered items are NumPy arrays; ``embedding`` and ``numerical_values``
        stay ``None`` if not given, and empty ``labels`` are returned as they are.
    """
    if labels is None:
        labels = []

    input_shape = representations.shape[-1]
    # NOTE(review): 40 is presumably the size of the light chain region of the map — confirm.
    corner = 40

    # Collect all indices first so that each array is filtered with a single
    # ``np.delete`` call rather than one call (and one copy) per removed item.
    nanobody_indices = [
        i for i in range(len(pdb_codes))
        if np.count_nonzero(representations[i].reshape(input_shape, input_shape)[-corner:, -corner:]) == 0
    ]
    if not nanobody_indices:
        return pdb_codes, representations, embedding, labels, numerical_values

    pdb_codes = np.delete(pdb_codes, nanobody_indices, axis=0)
    representations = np.delete(representations, nanobody_indices, axis=0)
    if embedding is not None:
        embedding = np.delete(embedding, nanobody_indices, axis=0)
    if len(labels):
        labels = np.delete(labels, nanobody_indices, axis=0)
    if numerical_values is not None:
        numerical_values = np.delete(numerical_values, nanobody_indices, axis=0)

    return pdb_codes, representations, embedding, labels, numerical_values