Source code for antipasti.preprocessing.preprocessing

# -*- coding: utf-8 -*-

r"""This module contains the pre-processing class. 

:Authors:   Kevin Michalewicz <k.michalewicz22@imperial.ac.uk>

"""

import glob
import os
import subprocess

import numpy as np
import pandas as pd

from config import DATA_DIR, SCRIPTS_DIR, STRUCTURES_DIR
from utils.generic_utils import remove_abc

class Preprocessing(object):
    r"""Generating the residue normal mode correlation maps.

    Parameters
    ----------
    data_path: str
        Path to the data folder.
    scripts_path: str
        Path to the scripts folder.
    structures_path: str
        Path to the PDB files.
    df: str
        Name of the database containing the PDB entries.
    modes: int
        Number of considered normal modes.
    chain_lengths_path: str
        Path to the folder containing arrays with the chain lengths.
    dccm_map_path: str
        Path to the normal mode correlation maps.
    residues_path: str
        Path to the folder containing the list of residues per entry.
    file_type_input: str
        Filename extension of input structures.
    selection: str
        Considered portion of antibody chains.
    pathological: list
        PDB identifiers of antibodies that need to be excluded.
    renew_maps: bool
        Compute all the normal mode correlation maps.
    renew_residues: bool
        Retrieve the lists of residues for each entry.
    cmaps: bool
        If ``True``, ANTIPASTI computes the contact maps of the complexes instead of the Normal Modes.
    cmaps_thr: float
        Threshold distance between alpha (α) carbons used to build the contact maps.
    ag_agnostic: bool
        If ``True``, Normal Mode correlation maps are computed in complete absence of the antigen.
    affinity_entries_only: bool
        This is ``True`` in general, but the ANTIPASTI pipeline can be used for other types of projects and thus consider data without affinity values.
    stage: str
        Choose between ``training`` and ``predicting``.
    test_data_path: str
        Path to the test data folder.
    test_dccm_map_path: str
        Path to the test normal mode correlation maps.
    test_residues_path: str
        Path to the folder containing the list of residues for a test sample.
    test_structure_path: str
        Path to the test PDB file.
    test_pdb_id: str
        Test PDB ID.
    alphafold: bool
        ``True`` if the test structure was folded using ``AlphaFold``.
    h_offset: int
        Number of absent residues between positions 1 and 25 in the heavy chain.
    l_offset: int
        Number of absent residues between positions 1 and 23 in the light chain.
    ag_residues: int
        Number of antigen residues included in the masked correlation maps.
""" def __init__( self, data_path=DATA_DIR, scripts_path=SCRIPTS_DIR, structures_path=STRUCTURES_DIR, df='sabdab_summary_all.tsv', modes=30, chain_lengths_path='chain_lengths/', dccm_map_path='dccm_maps/', residues_path='lists_of_residues/', file_type_input='.pdb', selection='_fv', pathological=None, renew_maps=False, renew_residues=False, cmaps=False, cmaps_thr=8.0, ag_agnostic=False, affinity_entries_only=True, stage='training', test_data_path=None, test_dccm_map_path=None, test_residues_path=None, test_structure_path=None, test_pdb_id='1t66', alphafold=False, h_offset=0, l_offset=0, ag_residues=0, ): self.data_path = data_path self.scripts_path = scripts_path self.structures_path = structures_path self.chain_lengths_path = data_path + chain_lengths_path self.dccm_map_path = data_path + dccm_map_path self.residues_path = data_path + residues_path self.modes = modes self.file_type_input = file_type_input self.selection = selection self.pathological = pathological self.cmaps = cmaps self.cmaps_thr = cmaps_thr self.ag_agnostic = ag_agnostic self.affinity_entries_only = affinity_entries_only self.stage = 'training' self.file_residues_paths = sorted(glob.glob(os.path.join(self.residues_path, '*.npy'))) self.alphafold = alphafold self.h_offset = h_offset self.l_offset = l_offset self.ag_residues = ag_residues self.df_path = data_path + df self.entries, self.affinity, self.df = self.clean_df() self.heavy, self.light, self.selected_entries = self.initialisation(renew_maps, renew_residues) self.max_res_list_h, self.max_res_list_l, self.min_res_list_h, self.min_res_list_l = self.get_max_min_chains() self.train_x, self.train_y, self.labels, self.raw_imgs = self.load_training_images() self.stage = stage if self.stage != 'training': self.test_data_path = test_data_path self.test_dccm_map_path = self.test_data_path + test_dccm_map_path self.test_residues_path = self.test_data_path + test_residues_path self.test_structure_path = self.test_data_path + test_structure_path self.test_pdb_id = test_pdb_id self.test_x = self.load_test_image()

    def clean_df(self):
        r"""Cleans the database containing the PDB entries.

        Returns
        -------
        df_pdbs: list
            PDB entries.
        df_kds: list
            Binding affinities.
        df: pandas.DataFrame
            Cleaned database.

        """
        df = pd.read_csv(self.df_path, sep='\t', header=0)[['pdb', 'antigen_type', 'affinity']]
        df.drop_duplicates(keep='first', subset='pdb', inplace=True)
        df['pdb'] = df['pdb'].str.lower().str.replace('+', '')  # lowercase and remove '+' signs of scientific notation
        df = df[(df.antigen_type.notna()) & (df.antigen_type != 'NA')][['pdb', 'affinity']]
        if self.affinity_entries_only:
            df = df[(df.affinity.notna()) & (df.affinity != 'None')]
        df = df[~df['pdb'].isin(self.pathological)]  # Removing pathological cases

        return list(df['pdb']), list(df['affinity']), df

    def generate_fv_pdb(self, path, keepABC=True, lresidues=False, hupsymchain=None, lupsymchain=None):
        r"""Generates a new PDB file containing the antigen residues and the antibody variable region.

        Parameters
        ----------
        path: str
            Path of a Chothia-numbered PDB file.
        keepABC: bool
            Keeps residues whose name ends with a letter from 'A' to 'Z'.
        lresidues: bool
            The names of each residue are stored in ``self.residues_path``.
        hupsymchain: int
            Upper limit of heavy chain residues due to a change in the numbering convention. Only useful when using ``AlphaFold``.
        lupsymchain: int
            Upper limit of light chain residues due to a change in the numbering convention. Only useful when using ``AlphaFold``.

        """
        amino_acid_dictionary = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K', 'ILE': 'I',
                                 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H',
                                 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 'ALA': 'A', 'VAL': 'V', 'GLU': 'E',
                                 'TYR': 'Y', 'MET': 'M'}
        if self.stage == 'training':
            rpath = self.residues_path
        else:
            rpath = self.test_residues_path
        list_residues = ['START-Ab']

        with open(path, 'r') as f:  # needs to be Chothia-numbered
            content = f.readlines()

        header_lines_important = range(4)
        header_lines = [content[i][0] == 'R' for i in range(len(content))].count(True)
        h_range = range(1, 114)
        l_range = range(1, 108)
        residue_type_range = slice(17, 20)
        start_chain = 21
        chain_range = slice(start_chain, start_chain+1)
        res_range = slice(23, 26)
        res_extra_letter = 26  # sometimes includes a letter 'A', 'B', 'C', ...
        h_chain_key = 'HCHAIN'
        l_chain_key = 'LCHAIN'
        antigen_chain_key = 'AGCHAIN'
        idx_list = list(header_lines_important)
        idx_list_l = []
        idx_list_antigen = []
        antigen_chains = []
        new_path = path[:-4] + self.selection + path[-4:]

        # Getting the names of the heavy and antigen chains
        line = content[header_lines_important[-1]]
        if line.find(h_chain_key) != -1:
            h_pos = line.find(h_chain_key) + len(h_chain_key) + 1
            h_chain = line[h_pos:h_pos+1]
            antigen_pos = line.find(antigen_chain_key) + len(antigen_chain_key) + 1
            antigen_chains.append(line[antigen_pos:antigen_pos+1])
            for i in range(3):
                if line[antigen_pos+2*i+1] in [',', ';']:
                    antigen_chains.append(line[antigen_pos+2*i+2])  # If two (or more) interacting antigen chains present
        else:  # useful when using AlphaFold
            h_chain = 'A'
            l_chain = 'B'
            antigen_chains = ['C', 'D', 'E']
            idx_list = [0]
            h_range = range(1-self.h_offset, hupsymchain-self.h_offset)
            l_range = range(1-self.l_offset, lupsymchain-self.l_offset)
            h_pos = start_chain
            l_pos = start_chain
        if line.find(l_chain_key) != -1:
            l_pos = line.find(l_chain_key) + len(l_chain_key) + 1
            l_chain = line[l_pos:l_pos+1]
        elif self.alphafold is False:
            l_chain = None

        # Checking if H and L chains have the same name
        if l_chain is not None and h_chain.upper() == l_chain.upper():
            pathologic = True
            h_chain = h_chain.upper()
            l_chain = h_chain.lower()
        elif antigen_chains is not None and self.affinity_entries_only is False and (h_chain.upper() in antigen_chains or (l_chain is not None and l_chain.upper() in antigen_chains)):
            pathologic = True
            h_chain = h_chain.lower()
            if l_chain is not None:
                l_chain = l_chain.lower()
        else:
            pathologic = False

        # Checks for matching identifiers
        if pathologic:
            if 'X' not in antigen_chains:
                new_hchain = 'X'
            else:
                new_hchain = 'W'
            if 'Y' not in antigen_chains:
                new_lchain = 'Y'
            else:
                new_lchain = 'Z'
        else:
            new_hchain = h_chain
            new_lchain = l_chain

        # Obtaining lines for the heavy chain variable region first
        for i, line in enumerate(content[header_lines:]):
            if line[chain_range].find(h_chain) != -1 and int(line[res_range]) in h_range:
                if (line[res_extra_letter] == ' ' or keepABC == True) and line.find('HETATM') == -1:
                    idx_list.append(i+header_lines)
                    if lresidues == True:
                        full_res = line[res_range] + line[res_extra_letter]
                        if pathologic:
                            full_res = new_hchain + full_res
                        else:
                            full_res = line[chain_range] + full_res
                        full_res = amino_acid_dictionary[line[residue_type_range]] + full_res
                        if full_res != list_residues[-1]:
                            list_residues.append(full_res)

        # This separation ensures that heavy chain residues are enlisted first
        if l_chain is not None:
            for i, line in enumerate(content[header_lines:]):
                if line[chain_range].find(l_chain) != -1 and int(line[res_range]) in l_range:
                    if (line[res_extra_letter] == ' ' or keepABC == True) and line.find('HETATM') == -1:
                        idx_list_l.append(i+header_lines)
                        if lresidues == True:
                            full_res = line[res_range] + line[res_extra_letter]
                            if pathologic:
                                full_res = new_lchain + full_res
                            else:
                                full_res = line[chain_range] + full_res
                            full_res = amino_acid_dictionary[line[residue_type_range]] + full_res
                            if full_res != list_residues[-1]:
                                list_residues.append(full_res)
        if lresidues == True:
            list_residues.append('END-Ab')

        # Obtaining antigen(s)
        for i, line in enumerate(content[header_lines:]):
            if any(line[chain_range] in agc for agc in antigen_chains) and h_chain not in antigen_chains and l_chain not in antigen_chains:
                idx_list_antigen.append(i+header_lines)
                if lresidues == True:
                    full_res = line[chain_range] + line[res_range] + line[res_extra_letter]
                    if line[residue_type_range] in amino_acid_dictionary:
                        full_res = amino_acid_dictionary[line[residue_type_range]] + full_res
                        if full_res != list_residues[-1]:
                            list_residues.append(full_res)

        # List with name of every residue is saved if selected
        saving_path = rpath + path[-8:-4] + '.npy'
        # if not os.path.exists(saving_path):
        np.save(saving_path, list_residues)

        # Creating new file
        with open(new_path, 'w') as f_new:
            f_new.writelines([content[l] for l in idx_list[:header_lines_important[-1]]])
            if l_chain is not None and self.alphafold is False:
                f_new.writelines([content[l][:h_pos]+new_hchain+content[l][h_pos+1:l_pos]+new_lchain+content[l][l_pos+1:] for l in idx_list[header_lines_important[-1]:header_lines_important[-1]+1]])
            else:
                f_new.writelines([content[l][:h_pos]+new_hchain+content[l][h_pos+1:] for l in idx_list[header_lines_important[-1]:header_lines_important[-1]+1]])
            f_new.writelines([content[l][:start_chain-5]+' '+content[l][start_chain-4:start_chain]+new_hchain+content[l][start_chain+1:] for l in idx_list[header_lines_important[-1]+1:]])
            if l_chain is not None:
                f_new.writelines([content[l][:start_chain-5]+' '+content[l][start_chain-4:start_chain]+new_lchain+content[l][start_chain+1:] for l in idx_list_l])
            if not self.ag_agnostic:
                f_new.writelines([content[l] for l in idx_list_antigen])
            if not self.cmaps:
                f_new.writelines([content[l] for l in range(len(content)) if content[l][0:6] == 'HETATM' and content[l][chain_range] in [h_chain, l_chain] and l not in idx_list+idx_list_l+idx_list_antigen])
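
    # Parsing sketch for ``generate_fv_pdb``: assuming a standard Chothia-numbered
    # ATOM record (the line below is illustrative, not taken from a real entry),
    # the slices defined in the method extract the residue name, chain ID,
    # residue number and insertion-code column:
    #
    #     line = 'ATOM      1  N   ASP H   1      30.532   0.217  19.703  1.00 30.00'
    #     line[residue_type_range]   # 'ASP' -> mapped to 'D' via amino_acid_dictionary
    #     line[chain_range]          # 'H'   -> chain identifier
    #     line[res_range]            # '  1' -> int(...) gives residue number 1
    #     line[res_extra_letter]     # ' '   -> optional insertion letter ('A', 'B', ...)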

    def generate_maps(self):
        r"""Generates the Normal Mode correlation maps.

        """
        for i, entry in enumerate(self.entries):
            file_name = entry + self.selection
            path = self.structures_path + file_name + self.file_type_input
            new_path = self.dccm_map_path + entry
            self.generate_fv_pdb(self.structures_path+entry+self.file_type_input, lresidues=True)

            if not self.cmaps:  # and len(np.load(self.residues_path + path[-11:-7] + '.npy')) > 500:
                subprocess.call(['/usr/local/bin/RScript', str(self.scripts_path)+'pdb_to_dccm.r', str(path), str(new_path), str(self.modes)], stdout=open(os.devnull, 'wb'))
            # elif not self.cmaps:
            #     print(path[-11:-7])
            #     subprocess.call(['/usr/local/bin/RScript', str(self.scripts_path)+'pdb_to_dccm_aa.r', str(path), str(new_path), str(self.modes)], stdout=open(os.devnull, 'wb'))
            else:
                subprocess.call(['python', str(self.scripts_path)+'generate_contact_maps.py', str(path), str(new_path), str(self.cmaps_thr)], stdout=open(os.devnull, 'wb'))
            if os.path.exists(path):
                os.remove(path)
            if i % 25 == 0:
                print('Map ' + str(i+1) + ' out of ' + str(len(self.entries)) + ' processed.')

    def get_lists_of_lengths(self, selected_entries):
        r"""Retrieves lists with the lengths of the heavy and light chains.

        Parameters
        ----------
        selected_entries: list
            PDB valid entries.

        Returns
        -------
        heavy: list
            Lengths of the heavy chains. In the context of the prediction stage, this list has one element.
        light: list
            Lengths of the light chains. In the context of the prediction stage, this list has one element.
        selected_entries: list
            PDB valid entries. In the context of the prediction stage, this list has one element.

        """
        heavy = []
        light = []
        if self.stage == 'training':
            rpath = self.residues_path
        else:
            rpath = self.test_residues_path

        for entry in selected_entries:
            list_of_residues = list(np.load(rpath+entry+'.npy'))[1:]
            chain_pos = 0
            if 'END-Ab' in list_of_residues:
                list_of_residues = list_of_residues[:list_of_residues.index('END-Ab')]
                chain_pos = 1
            else:
                list_of_residues = list_of_residues[:-1]
            h_chain = list_of_residues[0][chain_pos]
            l_chain = list_of_residues[-1][chain_pos]
            heavy.append(len([idx for idx in list_of_residues if idx[chain_pos] == h_chain]))
            if h_chain != l_chain:
                light.append(len([idx for idx in list_of_residues if idx[chain_pos] == l_chain]))
            else:
                light.append(0)

        return heavy, light, selected_entries

    def get_max_min_chains(self):
        r"""Returns the longest and shortest possible chains.

        """
        max_res_list_h = []
        max_res_list_l = []
        for f in self.file_residues_paths:
            shift = 0
            if 'END-Ab' in np.load(f):
                shift = 1
            idx = self.selected_entries.index(f[-8:-4])
            current_list_h = np.load(f)[1:self.heavy[idx]+1]
            current_list_l = np.load(f)[self.heavy[idx]+1:self.heavy[idx]+self.light[idx]+1]
            current_list_h = [x[1+shift:] for x in current_list_h]
            current_list_l = [x[1+shift:] for x in current_list_l]
            max_res_list_h += list(set(current_list_h).difference(max_res_list_h))
            max_res_list_l += list(set(current_list_l).difference(max_res_list_l))

        max_res_list_h = sorted(max_res_list_h, key=remove_abc)
        min_res_list_h = list(dict.fromkeys([x for x in max_res_list_h]))
        max_res_list_h = [x.strip() for x in max_res_list_h]
        max_res_list_l = sorted(max_res_list_l, key=remove_abc)
        min_res_list_l = list(dict.fromkeys([x for x in max_res_list_l]))
        max_res_list_l = [x.strip() for x in max_res_list_l]

        for f in self.file_residues_paths:
            shift = 0
            if 'END-Ab' in np.load(f):
                shift = 1
            idx = self.selected_entries.index(f[-8:-4])
            current_list_h = np.load(f)[1:self.heavy[idx]+1]
            current_list_l = np.load(f)[self.heavy[idx]+1:self.heavy[idx]+self.light[idx]+1]
            current_list_h = [x[1+shift:] for x in current_list_h]
            current_list_l = [x[1+shift:] for x in current_list_l]
            min_res_list_h = sorted(list(set(current_list_h).intersection(min_res_list_h)))
            min_res_list_l = sorted(list(set(current_list_l).intersection(min_res_list_l)))

        min_res_list_h = [x.strip() for x in min_res_list_h]
        min_res_list_l = [x.strip() for x in min_res_list_l]

        return max_res_list_h, max_res_list_l, min_res_list_h, min_res_list_l

    def initialisation(self, renew_maps, renew_residues):
        r"""Computes the normal mode correlation maps and retrieves lists with the lengths of the heavy and light chains.

        Parameters
        ----------
        renew_maps: bool
            Compute all the normal mode correlation maps.
        renew_residues: bool
            Retrieve the lists of residues for each entry.

        Returns
        -------
        heavy: list
            Lengths of the heavy chains.
        light: list
            Lengths of the light chains.
        selected_entries: list
            PDB valid entries.

        """
        if renew_maps:
            self.generate_maps()

        dccm_paths = sorted(glob.glob(os.path.join(self.dccm_map_path, '*.npy')))
        selected_entries = [dccm_paths[i][-8:-4] for i in range(len(dccm_paths))]

        if renew_residues:
            heavy, light, selected_entries = self.get_lists_of_lengths(selected_entries)
            np.save(self.chain_lengths_path+'heavy_lengths.npy', heavy)
            np.save(self.chain_lengths_path+'light_lengths.npy', light)
            np.save(self.chain_lengths_path+'selected_entries.npy', selected_entries)
        else:
            heavy = np.load(self.chain_lengths_path+'heavy_lengths.npy').astype(int)
            light = np.load(self.chain_lengths_path+'light_lengths.npy').astype(int)
            assert list(np.load(self.chain_lengths_path+'selected_entries.npy')) == selected_entries

        for entry in selected_entries:
            residues = list(np.load(self.residues_path+entry+'.npy'))
            if 'END-Ab' in residues:
                assert len(residues[:residues.index('END-Ab')])-1 == heavy[selected_entries.index(entry)] + light[selected_entries.index(entry)]

        return heavy, light, selected_entries

    def generate_masked_image(self, img, idx, test_h=None, test_l=None):
        r"""Generates a masked normal mode correlation map.

        Parameters
        ----------
        img: numpy.ndarray
            Original array containing no blank pixels.
        idx: int
            Input index.
        test_h: int
            Length of the heavy chain of an antibody in the test set.
        test_l: int
            Length of the light chain of an antibody in the test set.

        Returns
        -------
        masked: numpy.ndarray
            Masked normal mode correlation map.
        mask: numpy.ndarray
            Mask itself.

        """
        if self.stage == 'training':
            f = self.file_residues_paths[idx]
        elif self.alphafold is False:
            f = sorted(glob.glob(os.path.join(self.test_residues_path, '*'+self.test_pdb_id+'.npy')))[0]
        else:
            f = sorted(glob.glob(os.path.join(self.test_residues_path, '*'+self.test_pdb_id[:-3]+'.npy')))[0]  # removing '_af' suffix
        antigen_max_pixels = self.ag_residues
        f_res = np.load(f)

        # We force unique elements
        self.max_res_list_h = list(dict.fromkeys(self.max_res_list_h))
        self.max_res_list_l = list(dict.fromkeys(self.max_res_list_l))
        max_res_h = len(self.max_res_list_h)
        max_res_l = len(self.max_res_list_l)
        max_res = max_res_h + max_res_l
        masked = np.zeros((max_res+antigen_max_pixels, max_res+antigen_max_pixels))
        mask = np.zeros((max_res+antigen_max_pixels, max_res+antigen_max_pixels))

        if self.stage != 'training':
            h = test_h
            l = test_l
        else:
            current_idx = self.selected_entries.index(f[-8:-4])
            h = self.heavy[current_idx]
            l = self.light[current_idx]

        current_list_h = f_res[1:h+1]
        shift = len(f_res[1][:f_res[1].find(' ')])
        current_list_h = [x[shift:].strip() for x in current_list_h]  # First letter is the type of amino acid and second letter is the chain ID
        current_list_l = f_res[h+1:h+l+1]
        current_list_l = [x[shift:].strip() for x in current_list_l]

        idx_list = [i for i in range(max_res_h) if self.max_res_list_h[i] in current_list_h]
        idx_list += [i+max_res_h for i in range(max_res_l) if self.max_res_list_l[i] in current_list_l]
        idx_list += [i+max_res_h+max_res_l for i in range(min(antigen_max_pixels, img.shape[-1]-(h+l)))]

        for k, i in enumerate(idx_list):
            for l, j in enumerate(idx_list):
                masked[i, j] = img[k, l]
                mask[i, j] = 1

        return masked, mask

    def load_training_images(self):
        r"""Returns the input/output pairs of the model and their corresponding labels.

        """
        imgs = []
        raw_imgs = []
        kds = []
        labels = []
        file_paths = sorted(glob.glob(os.path.join(self.dccm_map_path, '*.npy')))

        for f in file_paths:
            pdb_id = f[-8:-4]
            if pdb_id in self.selected_entries and pdb_id not in self.pathological:
                raw_sample = np.load(f)
                idx = self.entries.index(pdb_id)
                idx_new = self.selected_entries.index(pdb_id)
                labels.append(pdb_id)
                raw_imgs.append(raw_sample)
                imgs.append(self.generate_masked_image(raw_sample, idx_new)[0])
                if self.affinity_entries_only:
                    kds.append(np.log10(np.float32(self.affinity[idx])))

        assert labels == [item for item in self.selected_entries if item not in self.pathological]
        for pdb in self.selected_entries:
            if pdb not in self.pathological and self.affinity_entries_only:
                assert (np.float16(10**kds[[item for item in self.selected_entries if item not in self.pathological].index(pdb)]) == np.float16(self.df[self.df['pdb']==pdb]['affinity'])).all()

        return np.array(imgs), np.array(kds), labels, raw_imgs

    def load_test_image(self):
        r"""Returns a test normal mode correlation map which is masked according to the existing residues in the training set.

        """
        pdb_id = self.test_pdb_id
        if self.alphafold is True:
            h, l, _ = self.get_lists_of_lengths(selected_entries=str(pdb_id[:-3]).split())
            h = h[0]
            l = l[0]
            hupsymchain = 1 + h
            lupsymchain = 1 + l
            lresidues = False
        else:
            hupsymchain = None
            lupsymchain = None
            lresidues = True

        # Generating the normal mode correlation map
        file_name = pdb_id + self.selection
        path = self.test_structure_path + file_name + self.file_type_input
        new_path = self.test_dccm_map_path + pdb_id
        self.generate_fv_pdb(self.test_structure_path+pdb_id+self.file_type_input, lresidues=lresidues, hupsymchain=hupsymchain, lupsymchain=lupsymchain)
        subprocess.call(['/usr/local/bin/RScript '+str(self.scripts_path)+'pdb_to_dccm.r '+str(path)+' '+str(new_path)+' '+str(self.modes)], shell=True, stdout=open(os.devnull, 'wb'))
        if os.path.exists(path):
            os.remove(path)

        # Getting lengths, residues and masking
        file_path = sorted(glob.glob(os.path.join(self.test_dccm_map_path, '*'+pdb_id+'.npy')))
        for f in file_path:
            raw_sample = np.load(f)
        if self.alphafold is False:
            h, l, _ = self.get_lists_of_lengths(selected_entries=str(pdb_id).split())
            h = h[0]
            l = l[0]

        return self.generate_masked_image(raw_sample, 0, test_h=int(h), test_l=int(l))[0]
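
# Usage sketch (predicting stage): a minimal example, assuming a test folder
# layout mirroring the training one; all paths below are placeholders joined
# onto ``test_data_path`` exactly as in ``__init__``.
#
#     preprocessed_data = Preprocessing(
#         stage='predicting',
#         test_data_path='data/test/',
#         test_dccm_map_path='dccm_maps/',
#         test_residues_path='lists_of_residues/',
#         test_structure_path='structures/',
#         test_pdb_id='1t66',
#     )
#     test_map = preprocessed_data.test_x   # masked test correlation map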