Source code for antipasti.utils.explaining_utils

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import warnings
warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*")
import umap

from copy import deepcopy
from matplotlib.colors import CenteredNorm
import matplotlib.patches as patches
from scipy.stats import chi2
from sklearn.preprocessing import StandardScaler

from antipasti.utils.biology_utils import remove_nanobodies, extract_mean_region_lengths

def get_maps_of_interest(preprocessed_data, learnt_filter, affinity_thr=-8):
    r"""Post-processes both raw data and results to obtain maps of interest.

    Parameters
    ----------
    preprocessed_data: antipasti.preprocessing.preprocessing.Preprocessing
        The ``Preprocessing`` class.
    learnt_filter: numpy.ndarray
        Filters that express the learnt features during training.
    affinity_thr: float
        Affinity value separating antibodies considered to have high affinity from those considered to have low affinity.

    Returns
    -------
    mean_learnt: numpy.ndarray
        A resized version of ``learnt_filter`` to match the shape of the input normal mode correlation maps.
    mean_image: numpy.ndarray
        The mean of all the input normal mode correlation maps.
    mean_diff_image: numpy.ndarray
        Map resulting from the subtraction of the mean of the high affinity correlation maps and the mean of the low affinity correlation maps.

    """
    high_aff = []
    low_aff = []
    train_x = preprocessed_data.train_x
    train_y = preprocessed_data.train_y
    input_shape = train_x.shape[-1]

    for i in range(train_y.shape[0]):
        if train_y[i] < affinity_thr:
            high_aff.append(train_x[i])
        elif train_y[i] > affinity_thr:
            low_aff.append(train_x[i])

    # Obtaining the maps
    mean_learnt = cv2.resize(-learnt_filter, dsize=(input_shape, input_shape))
    mean_image = np.mean(train_x, axis=0).reshape(input_shape, input_shape)
    mean_diff_image = np.mean(high_aff, axis=0) - np.mean(low_aff, axis=0)

    return mean_learnt, mean_image, mean_diff_image
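# Hypothetical usage sketch (not part of the original module); assumes a fitted
# ``Preprocessing`` instance ``preprocessed_data`` and a ``learnt_filter`` array
# produced during training:
#
#     mean_learnt, mean_image, mean_diff_image = get_maps_of_interest(
#         preprocessed_data, learnt_filter, affinity_thr=-8)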
def get_output_representations(preprocessed_data, model):
    r"""Returns maps that reveal the important residue interactions for the binding affinity. We call them 'output layer representations'.

    Parameters
    ----------
    preprocessed_data: antipasti.preprocessing.preprocessing.Preprocessing
        The ``Preprocessing`` class.
    model: antipasti.model.model.ANTIPASTI
        The model class, i.e., ``ANTIPASTI``.

    """
    input_shape = preprocessed_data.test_x.shape[-1]
    each_img_enl = np.zeros((preprocessed_data.train_x.shape[0], input_shape**2))
    size_le = int(np.sqrt(model.fc1.weight.data.numpy().shape[-1] / model.n_filters))
    offset = np.zeros((input_shape**2))

    # Border effects
    preprocessed_data.train_x[:,:,0] = 0
    preprocessed_data.train_x[:,0,:] = 0
    preprocessed_data.train_x[:,:,len(preprocessed_data.max_res_list_h)-1:len(preprocessed_data.max_res_list_h)+1] = 0
    preprocessed_data.train_x[:,len(preprocessed_data.max_res_list_h)-1:len(preprocessed_data.max_res_list_h)+1,:] = 0
    preprocessed_data.train_x[:,:,-1] = 0
    preprocessed_data.train_x[:,-1,:] = 0

    inter_filter_off = model(torch.from_numpy(np.zeros((input_shape, input_shape)).reshape(1, 1, input_shape, input_shape).astype(np.float32)))[1].detach().numpy()
    for i in range(model.n_filters):
        offset += cv2.resize(np.multiply(inter_filter_off[0,i], model.fc1.weight.data.numpy().reshape(model.n_filters, size_le**2)[i].reshape(size_le, size_le)), dsize=(input_shape, input_shape)).reshape((input_shape**2))

    for j in range(preprocessed_data.train_x.shape[0]):
        inter_filter_item = model(torch.from_numpy(preprocessed_data.train_x[j].reshape(1, 1, input_shape, input_shape).astype(np.float32)))[1].detach().numpy()
        for i in range(model.n_filters):
            each_img_enl[j] += (size_le**2/input_shape**2) * cv2.resize(np.multiply(inter_filter_item[0,i], model.fc1.weight.data.numpy().reshape(model.n_filters, size_le**2)[i].reshape(size_le, size_le)), dsize=(input_shape, input_shape)).reshape((input_shape**2))
        each_img_enl[j] -= offset

    return each_img_enl
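# Hypothetical usage sketch (not part of the original module); assumes a trained
# ``ANTIPASTI`` model and a ``Preprocessing`` instance:
#
#     reps = get_output_representations(preprocessed_data, model)
#     # One flattened map of length input_shape**2 per training antibody
#     first_map = reps[0].reshape(preprocessed_data.test_x.shape[-1], -1)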
def get_test_contribution(preprocessed_data, model):
    r"""Returns a map that reveals the important residue interactions for the binding affinity.

    Parameters
    ----------
    preprocessed_data: antipasti.preprocessing.preprocessing.Preprocessing
        The ``Preprocessing`` class.
    model: antipasti.model.model.ANTIPASTI
        The model class, i.e., ``ANTIPASTI``.

    """
    test_x = preprocessed_data.test_x
    input_shape = preprocessed_data.test_x.shape[-1]
    n_filters = model.n_filters
    each_img_enl = np.zeros((input_shape, input_shape))
    size_le = int(np.sqrt(model.fc1.weight.data.numpy().shape[-1] / n_filters))

    inter_filter_item = model(torch.from_numpy(test_x.reshape(1, 1, input_shape, input_shape).astype(np.float32)))[1].detach().numpy()
    for i in range(n_filters):
        each_img_enl -= cv2.resize(np.multiply(inter_filter_item[0,i], model.fc1.weight.data.numpy().reshape(n_filters, size_le**2)[i].reshape(size_le, size_le)), dsize=(input_shape, input_shape))

    return each_img_enl
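# Hypothetical usage sketch (not part of the original module); assumes
# ``preprocessed_data.test_x`` holds a single test correlation map:
#
#     contribution = get_test_contribution(preprocessed_data, model)
#     plot_map_with_regions(preprocessed_data, contribution,
#                           title='Test antibody contribution')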
def plot_map_with_regions(preprocessed_data, map, title='Normal mode correlation map', interactive=False):
    r"""Maps the residues to the antibody regions and plots the normal mode correlation map.

    Parameters
    ----------
    preprocessed_data: antipasti.preprocessing.preprocessing.Preprocessing
        The ``Preprocessing`` class.
    map: numpy.ndarray
        A normal mode correlation map.
    title: str
        The image title.
    interactive: bool
        Set to ``True`` when running a script or ``pytest``.

    """
    # Font sizes
    title_size = 42
    font_size = 32

    # Defining the region boundaries
    mrlh = preprocessed_data.max_res_list_h
    mrll = preprocessed_data.max_res_list_l
    subgroup_boundaries_h = [mrlh.index('1'), mrlh.index('26'), mrlh.index('33'), mrlh.index('52'), mrlh.index('57'), mrlh.index('95'), mrlh.index('103'), mrlh.index('113')+1]
    subgroup_boundaries_l = [mrll.index('1'), mrll.index('24'), mrll.index('35'), mrll.index('50'), mrll.index('57'), mrll.index('89'), mrll.index('98'), mrll.index('107')+1]
    labels_h = ['FR-H1', 'CDR-H1', 'FR-H2', 'CDR-H2', 'FR-H3', 'CDR-H3', 'FR-H4']
    labels_l = ['FR-L1', 'CDR-L1', 'FR-L2', 'CDR-L2', 'FR-L3', 'CDR-L3', 'FR-L4']
    subgroup_boundaries = subgroup_boundaries_h[:-1] + [x+mrlh.index('113')+1 for x in subgroup_boundaries_l]
    labels = labels_h + labels_l

    fig = plt.figure(figsize=(20, 20))

    # Plotting the normal mode correlation map
    plt.imshow(map, origin='lower', cmap='seismic', norm=CenteredNorm())
    cbar = plt.colorbar(fraction=0.045)

    # Set the font size of the colorbar
    cbar.ax.tick_params(labelsize=18)

    for i in range(len(subgroup_boundaries) - 1):
        start_index = subgroup_boundaries[i]
        end_index = subgroup_boundaries[i+1]
        label_position = (start_index + end_index) / 2 - 0.5

        # Choosing the colours
        if labels[i].startswith('CDR'):
            c = 'deeppink'
        else:
            c = 'orange'

        # Adding rectangles for the regions
        rect = plt.Rectangle((start_index - 0.5, -6.5), end_index - start_index, 6, edgecolor='black', facecolor=c, alpha=0.7)
        plt.gca().add_patch(rect)
        rect = plt.Rectangle((-12.5, start_index - 0.5), 12, end_index - start_index, edgecolor='black', facecolor=c, alpha=0.7)
        plt.gca().add_patch(rect)

        # Add labels for the regions on the y-axis
        plt.text(-6, label_position-0.25, labels[i], ha='center', va='center', color='black', size=10)

        # Add labels for the regions on the x-axis
        plt.text(label_position, -3.7, labels[i], ha='center', va='center', color='black', size=9)

    # Adding rectangles for the chains
    rect = plt.Rectangle((-0.5, -10.5), subgroup_boundaries_h[-1], 4, edgecolor='black', facecolor='white')
    plt.gca().add_patch(rect)
    rect = plt.Rectangle((subgroup_boundaries_h[-1]-0.5, -10.5), subgroup_boundaries_l[-1], 4, edgecolor='black', facecolor='white')
    plt.gca().add_patch(rect)
    rect = plt.Rectangle((-16.5, -0.5), 4, subgroup_boundaries_h[-1], edgecolor='black', facecolor='white')
    plt.gca().add_patch(rect)
    rect = plt.Rectangle((-16.5, subgroup_boundaries_h[-1]-0.5), 4, subgroup_boundaries_l[-1], edgecolor='black', facecolor='white')
    plt.gca().add_patch(rect)

    # Adding labels for the chains on the y-axis
    plt.text(-14, subgroup_boundaries_h[-1]/2, 'Heavy chain', ha='center', va='center', color='black', rotation='vertical', size=14)
    plt.text(-14, subgroup_boundaries_h[-1]+subgroup_boundaries_l[-1]/2, 'Light chain', ha='center', va='center', color='black', rotation='vertical', size=14)

    # Adding labels for the chains on the x-axis
    plt.text(subgroup_boundaries_h[-1]/2, -9, 'Heavy chain', ha='center', va='center', color='black', size=12)
    plt.text(subgroup_boundaries_h[-1]+subgroup_boundaries_l[-1]/2, -9, 'Light chain', ha='center', va='center', color='black', size=12)

    # Adjusting the axis limits and labels
    plt.xlim(-16.5, 290.5)
    plt.ylim(-10.5, 290.5)
    plt.xticks([])
    plt.yticks([])

    # Adding labels and title
    plt.xlabel('Residues', size=font_size)
    plt.ylabel('Residues', size=font_size)
    plt.title(title, size=title_size)
    plt.show(block=False)
    if interactive:
        plt.pause(3)
        plt.close('all')
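# Hypothetical usage sketch (not part of the original module); plots the
# difference map returned by ``get_maps_of_interest`` with the region
# annotations defined above:
#
#     plot_map_with_regions(preprocessed_data, mean_diff_image,
#                           title='High-affinity minus low-affinity map',
#                           interactive=True)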
def compute_umap(preprocessed_data, model, scheme='heavy_species', categorical=True, include_ellipses=False, numerical_values=None, external_cdict=None, interactive=False, exclude_nanobodies=False):
    r"""Performs UMAP dimensionality reduction calculations.

    Parameters
    ----------
    preprocessed_data: antipasti.preprocessing.preprocessing.Preprocessing
        The ``Preprocessing`` class.
    model: antipasti.model.model.ANTIPASTI
        The model class, i.e., ``ANTIPASTI``.
    scheme: str
        Category of the labels or values appearing in the UMAP representation.
    categorical: bool
        ``True`` if ``scheme`` is categorical.
    include_ellipses: bool
        ``True`` if ellipses comprising 85% of the points of a given class are included.
    numerical_values: list
        A list of values or entries should be provided if data external to SAbDab is used.
    external_cdict: dictionary
        Option to provide an external dictionary of the UMAP labels.
    interactive: bool
        Set to ``True`` when running a script or ``pytest``.
    exclude_nanobodies: bool
        Set to ``True`` to exclude nanobodies from the UMAP plot.

    """
    train_x = preprocessed_data.train_x
    input_shape = preprocessed_data.test_x.shape[-1]
    reducer = umap.UMAP(random_state=32, min_dist=0.1, n_neighbors=90) # Paired-HL
    labels = []
    colours = []
    pdb_codes = preprocessed_data.labels
    db = pd.read_csv(preprocessed_data.data_path+'sabdab_summary_all.tsv', sep='\t')
    if scheme in db.columns:
        db = db.loc[:,['pdb', scheme]]

    # Obtaining the labels and the output layer representations
    for j in range(train_x.shape[0]):
        if scheme in db.columns:
            labels.append(str(db[db['pdb'] == pdb_codes[j]].iloc[-1][scheme]))
            # if pdb_codes[j] in nanobodies:
            #     labels.pop()
            #     labels.append('nanobodies')
    each_img_enl = get_output_representations(preprocessed_data, model)
    # each_img_enl = each_img_enl.reshape(-1, train_x.shape[-1], train_x.shape[-1])[:, :len(preprocessed_data.max_res_list_h), :len(preprocessed_data.max_res_list_h)].reshape(-1, len(preprocessed_data.max_res_list_h)**2)

    # UMAP fitting
    scaled_each_img = StandardScaler().fit_transform(each_img_enl)
    embedding = reducer.fit_transform(scaled_each_img)

    if exclude_nanobodies:
        pdb_codes, _, embedding, labels, numerical_values = remove_nanobodies(pdb_codes, train_x, embedding, labels, numerical_values)

    if categorical:
        if scheme == 'light_subclass':
            cdict = {'IGKV1': 0, 'IGKV2': 1, 'IGKV3': 2, 'IGKV4': 3, 'IGKV5': 4, 'IGKV6': 5, 'IGKV7': 6, 'IGKV8': 7, 'IGKV9': 8, 'IGKV10': 9, 'IGKV14': 10, 'IGLV1': 11, 'IGLV2': 12, 'IGLV6': 13, 'Other': 14,}
            scheme = 'Light chain V gene family'
        elif scheme == 'heavy_subclass':
            cdict = {'IGHV1': 0, 'IGHV2': 1, 'IGHV3': 2, 'IGHV4': 3, 'IGHV5': 4, 'IGHV6': 5, 'IGHV7': 6, 'Other': 7,}
            scheme = 'Heavy chain V gene family'
        elif scheme == 'heavy_species' or scheme == 'light_species':
            cdict = {'homo sapiens': 0, 'mus musculus': 1, 'Other': 2}
            scheme = 'Antibody species'
        elif scheme == 'light_ctype':
            cdict = {'Kappa': 0, 'Lambda': 1, 'unknown': 2, 'NA': 3, 'Other': 4,}
            scheme = 'Type of light chain'
        elif scheme == 'antigen_type':
            cdict = {'protein': 0, 'peptide': 1, 'Hapten': 2, 'protein | protein': 3, 'carbohydrate': 4, # 'nanobodies': 5,
                     'Other': 5}
            scheme = 'Type of antigen'
        else:
            cdict = external_cdict
        for i in range(len(labels)):
            if labels[i] in cdict:
                colours.append(cdict[labels[i]])
            else:
                colours.append(cdict['Other'])
                labels[i] = 'Other'
    else:
        cdict = None
        deleted_items = 0
        for i, item in enumerate(numerical_values):
            if isinstance(item, (int, float, np.int64, np.float32)):
                colours.append(item)
            elif item.replace('.', '').isnumeric():
                colours.append(float(item))
            else:
                embedding = np.delete(embedding, i-deleted_items, axis=0)
                pdb_codes = np.delete(pdb_codes, i-deleted_items, axis=0)
                deleted_items += 1

    plot_umap(embedding=embedding, colours=colours, scheme=scheme, pdb_codes=pdb_codes, categorical=categorical, include_ellipses=include_ellipses, cdict=cdict, interactive=interactive)

    return colours, pdb_codes, embedding
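# Hypothetical usage sketch (not part of the original module); requires
# ``sabdab_summary_all.tsv`` under ``preprocessed_data.data_path``:
#
#     colours, pdb_codes, embedding = compute_umap(preprocessed_data, model,
#                                                  scheme='antigen_type',
#                                                  include_ellipses=True,
#                                                  exclude_nanobodies=True)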
def plot_umap(embedding, colours, scheme, pdb_codes, categorical=True, include_ellipses=False, cdict=None, interactive=False):
    r"""Plots UMAP maps.

    Parameters
    ----------
    embedding: numpy.ndarray
        The output layer representations after dimensionality reduction.
    colours: list
        The data points labels or values.
    scheme: str
        Category of the labels or values appearing in the UMAP representation.
    pdb_codes: list
        The PDB codes of the antibodies.
    categorical: bool
        ``True`` if ``scheme`` is categorical.
    include_ellipses: bool
        ``True`` to include ellipses comprising 85% of the points of a given class.
    cdict: dictionary
        External dictionary of the UMAP labels.
    interactive: bool
        Set to ``True`` when running a script or ``pytest``.

    """
    fig = plt.figure(figsize=(20,20))
    ax = fig.add_subplot()
    if categorical:
        cmap = plt.get_cmap('tab10')
    else:
        cmap = plt.get_cmap('Purples')
    unique_colours = list(set(colours))
    norm = plt.Normalize(np.min(colours), np.max(colours))
    legend_patches = [patches.Patch(color=cmap(norm(color))) for color in unique_colours]

    im = ax.scatter(embedding[:, 0], embedding[:, 1], s=50, c=colours, cmap=cmap)
    for i in range(len(pdb_codes)):
        if i % 1 == 0:
            ax.annotate(pdb_codes[i], (embedding[i, 0], embedding[i, 1]), size=8)

    if include_ellipses:
        # Inverse of the chi-squared CDF
        conf_level = 0.85
        inv_chi2 = chi2.ppf(conf_level, df=2)
        ellipses = [] # Store ellipse information

        for label in unique_colours:
            label_points = embedding[np.array(colours) == label] # Subset of UMAP points for a specific label
            n_points = len(label_points)

            # Centroid and then sort
            center = np.mean(label_points, axis=0)
            covariance = np.cov(label_points.T)
            dist = np.sum(np.square(label_points - center), axis=1)
            sorted_indices = np.argsort(dist)

            # Calculate the number of points to include within the ellipse
            n_inside = int(np.ceil(n_points * conf_level))
            inside_points = label_points[sorted_indices[:n_inside]]

            # Recalculate the mean and covariance using only the inside points
            center = np.mean(inside_points, axis=0)
            covariance = np.cov(inside_points.T)

            # Calculate the eigenvalues and eigenvectors of the cov matrix (again)
            eigenvalues, eigenvectors = np.linalg.eig(covariance)
            angle = np.degrees(np.arctan2(*eigenvectors[:, 0][::-1]))
            scale_factor = np.sqrt(inv_chi2)
            radius = np.sqrt(eigenvalues) * scale_factor

            ellipse = patches.Ellipse(xy=center, width=2 * radius[0], height=2 * radius[1], angle=angle, fill=False, linewidth=3, alpha=0.7, color=cmap(norm(label)))
            ellipses.append(ellipse)

        for i, ellipse in enumerate(ellipses):
            if list(cdict.keys())[i] not in ['unknown', 'Other']:
                ax.add_patch(ellipse)

    if categorical:
        legend1 = ax.legend(legend_patches, cdict.keys(), loc='best')
    else:
        legend1 = ax.legend(legend_patches[:10], set(colours), loc='best')
    ax.add_artist(legend1)
    ax.set_title(scheme, size=18)
    ax.set_xlabel('UMAP 1', size=16)
    ax.set_ylabel('UMAP 2', size=16)
    plt.show(block=False)
    if interactive:
        plt.pause(3)
        plt.close('all')
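# Hypothetical usage sketch (not part of the original module); re-plots an
# embedding returned by ``compute_umap`` with a custom label dictionary
# (example labels only):
#
#     cdict = {'protein': 0, 'peptide': 1, 'Other': 2}
#     plot_umap(embedding, colours, scheme='Type of antigen', pdb_codes=pdb_codes,
#               include_ellipses=True, cdict=cdict, interactive=True)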
def plot_region_importance(importance_factor, importance_factor_ob, antigen_type, mode='region', interactive=False):
    r"""Plots ranking of important regions.

    Parameters
    ----------
    importance_factor: list
        Measure of importance (0-100) for each antibody region.
    importance_factor_ob: list
        Measure of importance (0-100) for each antibody region attributable to off-block correlations. This can be inter-region or inter-chain depending on the selected ``mode``.
    antigen_type: int
        Plot corresponding to antigens of a given type. These can be proteins (0), haptens (1), peptides (2) or carbohydrates (3).
    mode: str
        ``region`` to explicitly show which correlations are inter/intra-region (likewise for ``chain``).
    interactive: bool
        Set to ``True`` when running a script or ``pytest``.

    """
    labels = ['FR-H1', 'CDR-H1', 'FR-H2', 'CDR-H2', 'FR-H3', 'CDR-H3', 'FR-H4', 'FR-L1', 'CDR-L1', 'FR-L2', 'CDR-L2', 'FR-L3', 'CDR-L3', 'FR-L4']
    mapping_dict = {0: 0, 1: 2, 2: 1, 3: 5}
    sorted_indices = np.argsort(importance_factor)[::-1] # Reverse order
    cmap = plt.get_cmap('tab10')

    # Create bars for each class
    fig, ax = plt.subplots()
    y_pos = np.arange(len(labels))
    bar1 = ax.barh(y_pos, np.array(importance_factor_ob)[sorted_indices], align='center', color=cmap(mapping_dict[antigen_type]), label=f'Inter-{mode}')
    bar2 = ax.barh(y_pos, np.array([importance_factor[i]-importance_factor_ob[i] for i in range(len(labels))])[sorted_indices], align='center', alpha=0.6, left=np.array(importance_factor_ob)[sorted_indices], color=cmap(mapping_dict[antigen_type]), label=f'Intra-{mode}')

    ax.set_xlabel('Importance (%)')
    ax.set_yticks(y_pos)
    ax.set_yticklabels([labels[np.argsort(importance_factor)[::-1][i]] for i in range(len(labels))])
    plt.tick_params(axis='y', which='both', left=False, right=False)
    plt.tick_params(axis='x', which='major', bottom=True, right=True, size=3.5)

    for i, label in enumerate(ax.get_yticklabels()):
        if np.argsort(importance_factor)[::-1][i] < 7:
            color = 'green'
        else:
            color = '#333333'
        label.set_color(color)

    plt.tight_layout()
    ax.legend()
    plt.show()
    if interactive:
        plt.pause(3)
        plt.close('all')
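# Hypothetical usage sketch (not part of the original module); both lists need
# one value per Fv region (14 in total), here filled with dummy numbers:
#
#     importance = list(np.linspace(100, 10, 14))     # dummy total importance
#     importance_ob = [0.5 * x for x in importance]   # dummy off-block share
#     plot_region_importance(importance, importance_ob, antigen_type=0, mode='region')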
def add_region_based_on_range(list_residues):
    r"""Given a list of residues in Chothia numbering, this function adds the corresponding regions in brackets for each of them."""
    output_list_residues = []
    new_coord = np.array([range(0, 26), range(26, 38), range(38, 57), range(57, 67), range(67, 116), range(116, 142), range(142, 153), range(153, 176), range(176, 195), range(195, 210), range(210, 225), range(225, 265), range(265, 279), range(279, 292)], dtype=object)
    regions = ['FR-H1', 'CDR-H1', 'FR-H2', 'CDR-H2', 'FR-H3', 'CDR-H3', 'FR-H4', 'FR-L1', 'CDR-L1', 'FR-L2', 'CDR-L2', 'FR-L3', 'CDR-L3', 'FR-L4']

    for index, element in enumerate(list_residues):
        for i, r in enumerate(new_coord):
            if index in r:
                output_list_residues.append(element+' (' + regions[i] + ')')

    return output_list_residues
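# Hypothetical usage sketch (not part of the original module); the region is
# assigned by position in the concatenated heavy+light residue list, so the
# first 26 entries map to FR-H1, the next 12 to CDR-H1, and so on:
#
#     add_region_based_on_range(['1', '2'])   # -> ['1 (FR-H1)', '2 (FR-H1)']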
def plot_residue_importance(preprocessed_data, importance_factor, antigen_type, interactive=False):
    r"""Plots ranking of important residues.

    Parameters
    ----------
    preprocessed_data: antipasti.preprocessing.preprocessing.Preprocessing
        The ``Preprocessing`` class.
    importance_factor: list
        Measure of importance (0-100) for each antibody residue.
    antigen_type: int
        Plot corresponding to antigens of a given type. These can be proteins (0), haptens (1), peptides (2) or carbohydrates (3).
    interactive: bool
        Set to ``True`` when running a script or ``pytest``.

    """
    res_labels = add_region_based_on_range(preprocessed_data.max_res_list_h+preprocessed_data.max_res_list_l)
    mapping_dict = {0: 0, 1: 2, 2: 1, 3: 5}
    cmap = plt.get_cmap('tab10')

    fig, ax = plt.subplots()
    y_pos = np.arange(len(res_labels[:30]))
    bar1 = ax.barh(y_pos, sorted(importance_factor, reverse=True)[:30], align='center', alpha=0.9, color=cmap(mapping_dict[antigen_type])) # Show top 30

    ax.set_xlabel('Importance (%)')
    ax.set_yticks(y_pos)
    ax.set_yticklabels([res_labels[np.argsort(importance_factor)[::-1][i]][:30] for i in range(len(res_labels[:30]))], fontsize=9.5)
    plt.tick_params(axis='y', which='both', left=False, right=False)
    plt.tick_params(axis='x', which='major', bottom=True, right=True, size=3.5)

    for i, label in enumerate(ax.get_yticklabels()):
        if np.argsort(importance_factor)[::-1][i] < len(preprocessed_data.max_res_list_h):
            color = 'green'
        else:
            color = '#333333'
        label.set_color(color)

    plt.tight_layout()
    plt.show()
    if interactive:
        plt.pause(3)
        plt.close('all')
def get_colours_ag_type(preprocessed_data):
    r"""Returns a different colour according to the antigen type.

    Parameters
    ----------
    preprocessed_data: antipasti.preprocessing.preprocessing.Preprocessing
        The ``Preprocessing`` class.

    """
    cluster_according_to = 'antigen_type'
    db = pd.read_csv(preprocessed_data.data_path+'sabdab_summary_all.tsv', sep='\t')
    clusters = []
    for i in range(len(preprocessed_data.labels)):
        clusters.append(str(db[db['pdb'] == preprocessed_data.labels[i]].iloc[0][cluster_according_to]))

    cdict = {'protein': 0, 'Hapten': 1, 'peptide': 2, 'carbohydrate': 3, 'nucleic-acid': 4, 'protein | protein': 5, 'Other': 6}
    colours = []
    for i in range(len(clusters)):
        if clusters[i] in cdict:
            colours.append(cdict[clusters[i]])
        else:
            colours.append(cdict['Other'])
            clusters[i] = 'Other'

    return colours
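# Hypothetical usage sketch (not part of the original module); maps each entry
# of ``preprocessed_data.labels`` to an integer antigen-type code (protein=0,
# Hapten=1, peptide=2, carbohydrate=3, nucleic-acid=4, 'protein | protein'=5,
# Other=6):
#
#     colours = get_colours_ag_type(preprocessed_data)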
def compute_region_importance(preprocessed_data, model, type_of_antigen, nanobodies, mode='region', interactive=False):
    r"""Computes the importance factors (0-100) of all the Fv antibody regions. Returns the importance for each region.

    Parameters
    ----------
    preprocessed_data: antipasti.preprocessing.preprocessing.Preprocessing
        The ``Preprocessing`` class.
    model: antipasti.model.model.ANTIPASTI
        The model class, i.e., ``ANTIPASTI``.
    type_of_antigen: int
        Choose between: proteins (0), haptens (1), peptides (2) or carbohydrates (3).
    nanobodies: list
        PDB codes of nanobodies in the dataset.
    mode: str
        ``region`` to explicitly calculate which correlations are inter/intra-region (likewise for ``chain``).
    interactive: bool
        Set to ``True`` when running a script or ``pytest``.

    """
    colours = get_colours_ag_type(preprocessed_data)
    each_img_enl = get_output_representations(preprocessed_data, model)
    train_x = preprocessed_data.train_x
    input_shape = preprocessed_data.test_x.shape[-1]
    labels = preprocessed_data.labels
    train_y = preprocessed_data.train_y
    colours = [0 if c == 5 else c for c in colours]

    all_mse_without_region = []
    all_mse_without_region_intra = []
    all_mse_without_region_ob = []
    new_coord = np.array([range(0, 26), range(26, 38), range(38, 57), range(57, 67), range(67, 116), range(116, 142), range(142, 153), range(153, 176), range(176, 195), range(195, 210), range(210, 225), range(225, 265), range(265, 279), range(279, 292)], dtype=object)

    for j in range(len(new_coord)+1):
        train_y_ = np.array([train_y[i] for i in range(each_img_enl.shape[0]) if colours[i] == type_of_antigen and labels[i] not in nanobodies])
        if j != len(new_coord):
            sums_without_region = np.array([each_img_enl[i].reshape((input_shape, input_shape)).sum()-(each_img_enl[i].reshape((input_shape, input_shape))[new_coord[j][0]:new_coord[j][-1] + 1, :]).sum() for i in range(each_img_enl.shape[0]) if colours[i] == type_of_antigen and labels[i] not in nanobodies])
            if mode == 'region':
                sums_without_region_divided = np.array([each_img_enl[i].reshape((input_shape, input_shape)).sum()-np.array([(each_img_enl[i].reshape((input_shape, input_shape))[new_coord[j][0]:new_coord[j][-1] + 1, new_coord[0][0]:new_coord[j][0]]).sum()+(each_img_enl[i].reshape((input_shape, input_shape))[new_coord[j][0]:new_coord[j][-1] + 1, new_coord[j][-1] + 1:new_coord[-1][-1] + 1]).sum(), (each_img_enl[i].reshape((input_shape, input_shape))[new_coord[j][0]:new_coord[j][-1] + 1, new_coord[j][0]:new_coord[j][-1] + 1]).sum()]) for i in range(each_img_enl.shape[0]) if colours[i] == type_of_antigen and labels[i] not in nanobodies])
                all_mse_without_region_intra.append(np.mean((sums_without_region_divided[:,1] - train_y_)**2))
                all_mse_without_region_ob.append(np.mean((sums_without_region_divided[:,0] - train_y_)**2))
            else:
                sums_without_region_divided = np.array([each_img_enl[i].reshape((input_shape, input_shape)).sum()-np.array([(each_img_enl[i].reshape((input_shape, input_shape))[new_coord[j][0]:new_coord[j][-1] + 1, :len(preprocessed_data.max_res_list_h)]).sum(), (each_img_enl[i].reshape((input_shape, input_shape))[new_coord[j][0]:new_coord[j][-1] + 1, len(preprocessed_data.max_res_list_h):]).sum()]) for i in range(each_img_enl.shape[0]) if colours[i] == type_of_antigen and labels[i] not in nanobodies])
                index = 0 if j < 7 else 1
                all_mse_without_region_intra.append(np.mean((sums_without_region_divided[:, index] - train_y_)**2))
                all_mse_without_region_ob.append(np.mean((sums_without_region_divided[:, 1 - index] - train_y_)**2))
        else:
            sums_without_region = np.array([each_img_enl[i].reshape((input_shape, input_shape)).sum() for i in range(each_img_enl.shape[0]) if colours[i] == type_of_antigen and labels[i] not in nanobodies])
        all_mse_without_region.append(np.mean((sums_without_region - train_y_)**2))

    total_mse = all_mse_without_region[-1]
    all_mse_without_region.pop(-1)
    # Convert to arrays so the element-wise arithmetic below is well defined
    all_mse_without_region = np.array(all_mse_without_region)
    all_mse_without_region_intra = np.array(all_mse_without_region_intra)
    all_mse_without_region_ob = np.array(all_mse_without_region_ob)

    region_mean_lengths = np.array([24, 7.1, 19, 6, 41, 11.3, 10, 21.7, 12.7, 14.9, 7, 32.9, 9.2, 9.1])
    idx_best_normalised_mean_length = np.argmax(abs(all_mse_without_region-total_mse)/region_mean_lengths)
    tot = 100*region_mean_lengths[idx_best_normalised_mean_length] * abs(all_mse_without_region-total_mse) / abs(all_mse_without_region[idx_best_normalised_mean_length]-total_mse) / region_mean_lengths
    ob = tot - tot * abs(all_mse_without_region_intra-total_mse) / (abs(all_mse_without_region_intra-total_mse)+abs(all_mse_without_region_ob-total_mse))
    plot_region_importance(tot, ob, type_of_antigen, mode, interactive=interactive)

    return tot, ob
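# Hypothetical usage sketch (not part of the original module); ``nanobodies`` is
# a list of PDB codes to exclude, and 0 selects protein antigens:
#
#     tot, ob = compute_region_importance(preprocessed_data, model,
#                                         type_of_antigen=0, nanobodies=nanobodies,
#                                         mode='chain', interactive=True)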
def compute_residue_importance(preprocessed_data, model, type_of_antigen, nanobodies, interactive=False):
    r"""Computes the importance factors (0-100) of all the amino acids of the antibody variable region.

    Parameters
    ----------
    preprocessed_data: antipasti.preprocessing.preprocessing.Preprocessing
        The ``Preprocessing`` class.
    model: antipasti.model.model.ANTIPASTI
        The model class, i.e., ``ANTIPASTI``.
    type_of_antigen: int
        Choose between: proteins (0), haptens (1), peptides (2) or carbohydrates (3).
    nanobodies: list
        PDB codes of nanobodies in the dataset.
    interactive: bool
        Set to ``True`` when running a script or ``pytest``.

    """
    colours = get_colours_ag_type(preprocessed_data)
    each_img_enl = get_output_representations(preprocessed_data, model)
    train_x = preprocessed_data.train_x
    input_shape = preprocessed_data.test_x.shape[-1]
    colours = [0 if c == 5 else c for c in colours]
    all_mse_without_region = []

    for j in range(train_x.shape[-1]+1):
        if j != train_x.shape[-1]:
            sums_without_region = np.array([each_img_enl[i].reshape((input_shape, input_shape)).sum()-(each_img_enl[i].reshape((input_shape, input_shape))[j:j+1, :]).sum() for i in range(train_x.shape[0]) if colours[i] == type_of_antigen and preprocessed_data.labels[i] not in nanobodies])
        else:
            sums_without_region = np.array([each_img_enl[i].reshape((input_shape, input_shape)).sum() for i in range(train_x.shape[0]) if colours[i] == type_of_antigen and preprocessed_data.labels[i] not in nanobodies])
        train_y_ = np.array([preprocessed_data.train_y[i] for i in range(train_x.shape[0]) if colours[i] == type_of_antigen and preprocessed_data.labels[i] not in nanobodies])
        mse_without_region = np.mean((sums_without_region - train_y_)**2)
        all_mse_without_region.append(mse_without_region)

    total_mse = all_mse_without_region[-1]
    all_mse_without_region.pop(-1)
    # Convert to an array so the element-wise arithmetic below is well defined
    all_mse_without_region = np.array(all_mse_without_region)
    plot_residue_importance(preprocessed_data, 100*abs(all_mse_without_region-total_mse)/abs(max(all_mse_without_region)-total_mse), type_of_antigen, interactive=interactive)
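# Hypothetical usage sketch (not part of the original module); plots the 30 most
# important residues for hapten binders (type_of_antigen=1):
#
#     compute_residue_importance(preprocessed_data, model,
#                                type_of_antigen=1, nanobodies=nanobodies,
#                                interactive=True)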