Source code for antipasti.utils.biology_utils

import numpy as np

from config import DATA_DIR
    

[docs]
def extract_mean_region_lengths(pdb_codes, data_path=DATA_DIR):
    r"""Computes the mean FR and CDR lengths over a collection of antibodies.

    For every PDB code, the residue list stored in
    ``<data_path>lists_of_residues/<pdb_code>.npy`` is loaded, truncated to the
    entries between the first element and the ``'END-Ab'`` marker, and split
    into framework (FR) and CDR segments by looking up fixed position labels
    (e.g. ``' 26 '``, ``' 33 '``, ``' 95 '``, ``'103 '`` for the heavy chain).
    The length of each segment is averaged over ``pdb_codes``.

    Parameters
    ----------
    pdb_codes: list
        The antibody PDB codes. Its length is used as the averaging denominator.
    data_path: str
        Path to the data folder. It is concatenated directly with
        ``'lists_of_residues/'``, so it is expected to end with a path separator.

    Returns
    -------
    numpy.ndarray
        Vector of 14 mean lengths. Indices 0, 2, 4, 6 hold the four heavy chain
        FR segments and 1, 3, 5 the three heavy chain CDRs; indices 7, 9, 11, 13
        hold the four light chain FR segments and 8, 10, 12 the three light chain
        CDRs. Light chain entries receive zero contribution from structures whose
        first and last residues share the same chain identifier.

    Raises
    ------
    ValueError
        If one of the expected position labels is missing from a residue list
        (raised by ``list.index``).

    """
    region_lengths = np.zeros((14))

    for pdb_code in pdb_codes:
        res_l = list(np.load(data_path+f'lists_of_residues/{pdb_code}.npy'))
        # Keep only the antibody entries: skip element 0 and stop at the 'END-Ab' marker
        res_l = res_l[1:res_l.index('END-Ab')]
        res_l = [el[1:] for el in res_l] # Drop the leading amino acid letter; entries now start with the chain identifier
        # Chain identifiers are read from the first and last antibody residues
        h = res_l[0][0]
        l = res_l[-1][0]
        
        # CDR-H1 start: first label found among heavy chain positions 26 to 30
        if h+' 26 ' in res_l:
            cdrh1_b = res_l.index(h+' 26 ')
        elif h+' 27 ' in res_l:
            cdrh1_b = res_l.index(h+' 27 ')
        elif h+' 28 ' in res_l:
            cdrh1_b = res_l.index(h+' 28 ')
        elif h+' 29 ' in res_l:
            cdrh1_b = res_l.index(h+' 29 ')
        else:
            cdrh1_b = res_l.index(h+' 30 ')
            
        # CDR-H2 start: position 52, falling back to 53 when 52 is absent
        if h+' 52 ' in res_l:
            cdrh2_b = res_l.index(h+' 52 ')
        else:
            cdrh2_b = res_l.index(h+' 53 ')

        # Beginning of FR1 (light chain): index of the first entry starting with the light chain identifier
        cfr1l_b = res_l.index(next((item for item in res_l if item.startswith(l)), None))
            
        # CDR-L2 start: position 50, then 51, then 52; three PDB entries are hard-coded to an empty CDR-L2
        # NOTE(review): these lookups also run when l == h, in which case the
        # final fallback can raise ValueError if position 52 is absent -- confirm.
        if l+' 50 ' in res_l:
            cdrl2_b = res_l.index(l+' 50 ')
        elif l+' 51 ' in res_l:
            cdrl2_b = res_l.index(l+' 51 ')
        elif pdb_code in ['4hkx', '5d70', '5d71']:
            cdrl2_b = 0
            cdrl2_e = 0
        else:
            cdrl2_b = res_l.index(l+' 52 ')
        
        # CDR-L2 end: position 57 when present
        # NOTE(review): when position 57 is absent and the entry is not one of the
        # hard-coded codes above, ``cdrl2_e`` keeps its value from a previous
        # iteration (or is unbound on the first one) -- confirm intended handling.
        if l+' 57 ' in res_l:
            cdrl2_e = res_l.index(l+' 57 ')

        # Heavy chain FR segments; the first one starts at index 1, so the first residue is not counted
        frh_parts = [len(res_l[1:cdrh1_b]), len(res_l[res_l.index(h+' 33 '):cdrh2_b]), len(res_l[res_l.index(h+' 57 '):res_l.index(h+' 95 ')]), len(res_l[res_l.index(h+'103 '):cfr1l_b])]
        # Heavy chain CDR segments: [CDR-H1 start, 33), [CDR-H2 start, 57), [95, 103)
        cdrh_parts = [len(res_l[cdrh1_b:res_l.index(h+' 33 ')]), len(res_l[cdrh2_b:res_l.index(h+' 57 ')]), len(res_l[res_l.index(h+' 95 '):res_l.index(h+'103 ')])]
        if l != h:
            # Light chain CDR segments: [24, 35), [CDR-L2 start, CDR-L2 end), [89, 98)
            cdrl_parts = [len(res_l[res_l.index(l+' 24 '):res_l.index(l+ ' 35 ')]), len(res_l[cdrl2_b:cdrl2_e]), len(res_l[res_l.index(l+' 89 '):res_l.index(l+ ' 98 ')])]
            # Light chain FR segments; the last slice ends at -1, so the final residue is not counted
            frl_parts = [len(res_l[cfr1l_b:res_l.index(l+' 24 ')]), len(res_l[res_l.index(l+ ' 35 '):cdrl2_b]), len(res_l[cdrl2_e:res_l.index(l+' 89 ')]), len(res_l[res_l.index(l+ ' 98 '):-1])]
        else:
            # Same chain identifier at both ends: no light chain contribution
            cdrl_parts = [0, 0, 0]
            frl_parts = [0, 0, 0, 0]

        # Accumulate the running mean; FR and CDR segments are interleaved in the output vector
        for i in range(4):
            region_lengths[2*i] += frh_parts[i] / len(pdb_codes)
            region_lengths[2*i+7] += frl_parts[i] / len(pdb_codes)

            # There are only three CDRs per chain
            if i != 3:
                region_lengths[2*i+1] += cdrh_parts[i] / len(pdb_codes)
                region_lengths[2*i+8] += cdrl_parts[i] / len(pdb_codes)

    return region_lengths



[docs]
def get_sequence(list_of_residues, max_res_list_h=None, max_res_list_l=None):
    r"""Returns an amino acid sequence from an ANTIPASTI list of residues. It contains gaps for the antibody.

    The result has the form ``<heavy>:<light>:<rest>``, where ``<heavy>`` and
    ``<light>`` have one character per entry of the (deduplicated) reference
    position lists, with ``'-'`` for positions absent from this antibody, and
    ``<rest>`` is the first character of every entry after ``'END-Ab'``.

    Parameters
    ----------
    list_of_residues: list
        Residues numbered according to the Chothia scheme with presence of 'START-Ab' and 'END-Ab' labels.
        Each residue entry is read as ``<amino acid><chain><position label>``.
    max_res_list_h: list
        Heavy chain residues of all data. Despite the ``None`` default, a list must be provided.
    max_res_list_l: list
        Light chain residues of all data. Despite the ``None`` default, a list must be provided.

    Returns
    -------
    str
        The gapped sequence, with chains separated by ``':'``.

    Raises
    ------
    ValueError
        If ``'END-Ab'`` is not present in ``list_of_residues``.
    TypeError
        If ``max_res_list_h`` or ``max_res_list_l`` is ``None``.

    """
    # First we force unique elements (first occurrence wins, order is kept)
    max_res_list_h = list(dict.fromkeys(max_res_list_h))
    max_res_list_l = list(dict.fromkeys(max_res_list_l))

    end_ab = list_of_residues.index('END-Ab')
    antibody_residues = list_of_residues[1:end_ab]

    # The heavy chain is the leading run of residues sharing the chain of the first one.
    # Only antibody residues are counted: scanning the whole list would also match the
    # 'START-Ab'/'END-Ab' markers (second characters 'T' and 'N') and anything after 'END-Ab'.
    h_chain = list_of_residues[1][1]
    n_heavy = sum(1 for residue in antibody_residues if residue[1] == h_chain)
    list_of_residues_h = antibody_residues[:n_heavy]
    list_of_residues_l = antibody_residues[n_heavy:]

    # Position labels present in this antibody, as sets for O(1) membership tests
    present_h = {residue[2:].strip() for residue in list_of_residues_h}
    present_l = {residue[2:].strip() for residue in list_of_residues_l}

    # Residues are consumed in order: each reference position that exists in this
    # antibody takes the next residue letter, every other position becomes a gap.
    residues_h = iter(list_of_residues_h)
    sequence = [next(residues_h, '-')[0] if position in present_h else '-' for position in max_res_list_h]
    sequence += [':'] # Separating chains
    if list_of_residues_l:
        residues_l = iter(list_of_residues_l)
        sequence += [next(residues_l, '-')[0] if position in present_l else '-' for position in max_res_list_l]
    sequence += [':'] # Separating chains
    # Everything after the antibody is appended without gaps
    sequence += [residue[0] for residue in list_of_residues[end_ab+1:]]

    return ''.join(map(str, sequence))



[docs]
def antibody_sequence_identity(seq1, seq2):
    r"""Computes the sequence identity of the antibody part of two sequences.

    Everything from the last ``':'`` onwards is discarded before comparing.
    The remaining strings are compared position by position (up to the length
    of the shorter one). Positions where both strings hold a gap (``'-'``) or a
    chain separator (``':'``) are ignored.

    Parameters
    ----------
    seq1: str
        First sequence.
    seq2: str
        Second sequence.

    Returns
    -------
    float
        Fraction (between 0 and 1) of compared positions holding the same
        residue. ``0.0`` when there is no position to compare.

    """
    def _antibody_part(seq):
        # Without a ':' there is nothing to strip; ``rfind`` returning -1 must
        # not be used as a slice bound, as it would drop the last character.
        cut = seq.rfind(':')
        return seq if cut == -1 else seq[:cut]

    ignored = ('-', ':')
    matching = 0
    total = 0
    for ch1, ch2 in zip(_antibody_part(seq1), _antibody_part(seq2)):
        if ch1 in ignored and ch2 in ignored:
            continue
        total += 1
        # Equal characters here are necessarily residues, since at least one is not ignored
        if ch1 == ch2:
            matching += 1

    # Avoid a ZeroDivisionError for empty or all-gap inputs
    if total == 0:
        return 0.0
    return matching / total

    


[docs]
def antigen_identity(seq1, seq2):
    r"""Checks whether two sequences end with the same antigen segment.

    The antigen segment is whatever follows the last ``':'`` of each sequence.

    Parameters
    ----------
    seq1: str
        First sequence.
    seq2: str
        Second sequence.

    Returns
    -------
    bool
        ``True`` when both trailing segments are equal.

    """
    antigen_1, antigen_2 = (seq.rsplit(':', 1)[1] for seq in (seq1, seq2))
    return antigen_1 == antigen_2



[docs]
def check_train_test_identity(training_set_ids, test_set_ids, max_res_list_h=None, max_res_list_l=None, threshold=0.9, residues_path=DATA_DIR+'lists_of_residues/', verbose=False):
    r"""Tests the sequence identity of the training and test sets.

    Every test entry is compared with every training entry. The check fails as
    soon as one pair has an antibody sequence identity above ``threshold`` or
    shares the same antigen sequence.

    Parameters
    ----------
    training_set_ids: list
        Contains the PDB identifiers of the training set elements.
    test_set_ids: list
        Contains the PDB identifiers of the test set elements.
    max_res_list_h: list
        Heavy chain residues of all data.
    max_res_list_l: list
        Light chain residues of all data.
    threshold: float
        Highest accepted sequence identity value.
    residues_path: str
        Path to the folder containing the list of residues per entry.
    verbose: bool
        If ``True``, the first offending train/test pair is printed before returning ``False``.

    Returns
    -------
    bool
        ``True`` if all pairs pass the check, ``False`` otherwise.

    """
    def _load_sequence(pdb_id):
        # Build the gapped sequence of one entry from its stored residue list
        return get_sequence(list(np.load(residues_path+pdb_id+'.npy')), max_res_list_h=max_res_list_h, max_res_list_l=max_res_list_l)

    # Training sequences are built lazily and reused for every test element,
    # instead of being reloaded from disk in each pass of the inner loop.
    training_seqs = {}

    for test_element in test_set_ids:
        test_seq = _load_sequence(test_element)
        for training_element in training_set_ids:
            if training_element not in training_seqs:
                training_seqs[training_element] = _load_sequence(training_element)
            tr_seq = training_seqs[training_element]

            identity = antibody_sequence_identity(tr_seq, test_seq)
            if identity > threshold:
                if verbose:
                    print(f'Train/test pair ({training_element}, {test_element}) exceeds the identity threshold (Identity = {identity:.2%})')
                return False
            if antigen_identity(tr_seq, test_seq):
                if verbose:
                    print(f'Train/test pair ({training_element}, {test_element}) shares the same antigen sequence')
                return False

    print(f'All train/test pairs passed the similarity check (Identity <= {threshold:.2%})')
    return True


# NOTE: ``build_weights`` is disabled. The triple-quoted block below is a bare
# string literal, so the function is never defined when the module is imported.
# As written it rebuilds the sequence of every entry inside the inner loop.
'''
def build_weights(pdb_codes, max_res_list_h=None, max_res_list_l=None, threshold=0.9, residues_path=DATA_DIR+'lists_of_residues/'):
    r"""Generates a vector that, for each sequence, keeps track of the number of other sequences (including the current one) having a sequence identity higher than a specified threshold.

    Parameters
    ----------
    pdb_codes: list
        Contains PDB identifiers.
    max_res_list_h: list
        Heavy chain residues of all data.
    max_res_list_l: list
        Light chain residues of all data.
    threshold: float
        Highest accepted sequence identity value.
    residues_path: str
        Path to the folder containing the list of residues per entry.

    """

    weights = np.zeros((len(pdb_codes)))

    for i, pdb_code in enumerate(pdb_codes):
        main_seq = get_sequence(list(np.load(residues_path+pdb_code+'.npy')), max_res_list_h=max_res_list_h, max_res_list_l=max_res_list_l)
        for pdb_code_ in pdb_codes:
            other_seq = get_sequence(list(np.load(residues_path+pdb_code_+'.npy')), max_res_list_h=max_res_list_h, max_res_list_l=max_res_list_l)
            identity = antibody_sequence_identity(main_seq, other_seq)
            if identity > threshold:
                weights[i] += 1

    return weights
'''

[docs]
def remove_nanobodies(pdb_codes, representations, embedding=None, labels=None, numerical_values=None):
    r"""Returns PDB codes and embeddings without the presence of nanobodies.

    An entry is treated as a nanobody when the bottom-right 40x40 corner of its
    square map contains only zeros. The matching rows are removed from every
    provided collection.

    Parameters
    ----------
    pdb_codes: list
        The PDB codes of the antibodies.
    representations: numpy.ndarray
        Normal mode correlation maps (or transformed maps) from which it can be inferred whether a given antibody is a nanobody.
    embedding: numpy.ndarray
        Low-dimensional version of ``representations``.
    labels: list
        Data point labels. Defaults to an empty list.
    numerical_values: list
        If data is numerical (e.g., affinity values), it is necessary to include a list here. In this way, values associated to nanobodies can be removed.

    Returns
    -------
    tuple
        ``(pdb_codes, representations, embedding, labels, numerical_values)``.
        Inputs are returned unchanged when no nanobody is found; otherwise each
        filtered collection is a new ``numpy.ndarray``. ``embedding`` and
        ``numerical_values`` stay ``None`` when not provided.

    """
    # A fresh list per call: a shared mutable default would be handed back to
    # the caller and could leak modifications into later calls.
    if labels is None:
        labels = []

    input_shape = representations.shape[-1]

    # Locate all nanobodies first, then delete them in one go; this avoids the
    # index shifting needed when deleting while iterating.
    nanobody_indices = [
        i for i in range(len(pdb_codes))
        if np.count_nonzero(representations[i].reshape(input_shape, input_shape)[-40:, -40:]) == 0
    ]

    if nanobody_indices:
        pdb_codes = np.delete(pdb_codes, nanobody_indices, axis=0)
        representations = np.delete(representations, nanobody_indices, axis=0)
        if embedding is not None:
            embedding = np.delete(embedding, nanobody_indices, axis=0)
        if len(labels):
            labels = np.delete(labels, nanobody_indices, axis=0)
        if numerical_values is not None:
            numerical_values = np.delete(numerical_values, nanobody_indices, axis=0)

    return pdb_codes, representations, embedding, labels, numerical_values