Nothing
#' Function to calculate 3D-structure propoperties such as the average charge and hydrophobicity, pKa shift, free energy, RMSD of PDB files and add them to an AntibodyForests-object
#' @description Function to calculate protein 3D-structure properties of antibodies (or antibody-antigen complexes) and integrate them into an AntibodyForests-object.
#' @param VDJ a dataframe with V(D)J information such as the output of Platypus::VDJ_build(). Must contain columns sample_id, clonotype_id, barcode.
#' @param pdb.dir a directory containing PDB files.
#' @param file.df a dataframe of pdb filenames (column file_name) to be used and sequence IDs (column sequence) corresponding to the the barcodes column of the VDJ dataframe.
#' @param properties a vector of properties to be calculated. Default is c("charge", "hydrophobicity").
#' - charge: The net electrical charge at pH 7.0
#' - hydrophobicity: The hypdrophobicity of each amino acid, devided by the sequence length.
#' - RMSD_germline: the root mean square deviation to the germline structure (needs the germline pdb)
#' - 3di_germline: the edit distance between the 3di sequence of each sequences and the germline sequence (needs foldseek output).
#' - pKa_shift: the acid dissociation constant shift upon binding of the antibody to the antigen (needs Propka output)
#' - free_energy: the free energy of binding of the antibody to the antigen at a certain pH (needs Propka output)
#' - pLDDT: the pLDDT score of the model
#' @param free_energy_pH the pH to be used to calculate the free energy of binding. Default is 7.
#' @param sequence.region a character vector of the sequence region to be used to calculate properties. Default is "full.sequence".
#' - full.sequence: the full sequence(s) in the PDB file
#' - sub.sequence: part of the full sequence, for example the CDR3 region in the PDB file. This sub sequence must be a column in the VDJ dataframe.
#' - binding.residues: the binding residues in the PDB file
#' @param sub.sequence.column a character vector of the column name in the VDJ dataframe containing the sub sequence to be used to calculate properties. Default is NULL.
#' @param chain a character vector of the chain to be used to calculate properties. Default is both heavy and light chain
#' Assuming chain "A" is heavy chain, chain "B" is light chain, and possible chain "C" is the antigen.
#' - HC+LC: both heavy and light chain
#' - HC: heavy chain, assuming chain A is the heavy chain.
#' - LC: light chain, assuming chain B is the light chain.
#' - AG: antigen, assuming chain C is the antigen.
#' - whole.complex: the whole complex of antibody-antigen (all available chains in the pdb file).
#' @param propka.dir a directory containing Propka output files. The propka filenames should be similar to the PDB filenames.
#' @param foldseek.dir a directory containing dataframes with the Foldseek 3di sequence per chain for each sequence. Filenames should be similar to the PDB filenames and it needs to have column "chain" containing the 'A', 'B', and/or 'C' chain. Default is NULL.
#' @param germline.pdb PDB filename of the germline. Default is NULL.
#' @return the input VDJ dataframe with the calculated 3D-structure properties.
#' @export
#' @examples
#' \dontrun{
#' vdj_structure_antibody <- VDJ_3d_properties(VDJ = AntibodyForests::small_vdj,
#' pdb.dir = "~/path/PDBS_superimposed/",
#' file.df = files,
#' properties = c("charge", "3di_germline", "hydrophobicity"),
#' chain = "HC+LC",
#' sequence.region = "full.sequence",
#' propka.dir = "~/path/Propka_output/",
#' germline.pdb = "~/path/PDBS_superimposed/germline_5_model_0.pdb",
#' foldseek.dir = "~/path/3di_sequences/")
#' }
VDJ_3d_properties <- function(VDJ,
pdb.dir,
file.df,
properties,
sequence.region,
chain,
propka.dir,
free_energy_pH,
sub.sequence.column,
germline.pdb,
foldseek.dir){
#Check for missing or false input
if(missing(VDJ)){stop("VDJ dataframe is missing")}
if(!any(c("sample_id", "clonotype_id", "barcode") %in% colnames(VDJ))){stop("VDJ dataframe must contain columns sample_id, clonotype_id, barcode")}
if(missing(pdb.dir)){stop("pdb.dir is missing")}
if(!dir.exists(pdb.dir)){stop("pdb.dir does not exist")}
if(missing(file.df)){stop("file.df directory is missing")}
if(!any(c("file_name", "sequence") %in% colnames(file.df))){stop("file.df dataframe must contain columns file_name, sequence")}
if(!(all(file.df$file_name %in% list.files(pdb.dir)))){stop("file.df contains files that do not exist in pdb.dir")}
if(missing(sequence.region)){sequence.region = "full.sequence"}
if(!sequence.region %in% c("full.sequence", "sub.sequence", "binding.residues")){stop("sequence.region must be one of 'full.sequence', 'sub.sequence', 'binding.residues'")}
if(missing(chain)){chain = "HC+LC"}
if(!chain %in% c("HC+LC", "HC", "LC", "AG", "whole.complex")){stop("chain must be one of 'HC+LC', 'HC', 'LC', 'AG', 'whole.complex'")}
if(missing(properties)){properties = c("charge", "hydrophobicity")}
if(!all(properties %in% c("charge", "hydrophobicity", "pKa_shift", "free_energy", "RMSD_germline", "pLDDT", "3di_germline"))){stop("properties must be one or more of 'charge', 'hydrophobicity', 'RMDS_germline', '3di_germline', 'pLDDT', 'pKa_shift', 'free_energy'")}
if(missing(propka.dir)){propka.dir = NULL}
if(is.null(propka.dir) & any(c("pKa_shift", "free_energy") %in% properties)){stop("propka.dir is missing")}
if(!(is.null(propka.dir))){if(!dir.exists(propka.dir)){stop("propka.dir does not exist")}}
if(missing(free_energy_pH)){free_energy_pH = 7}
if(!is.numeric(free_energy_pH)){stop("free_energy_pH must be numeric")}
if(missing(sub.sequence.column)){sub.sequence.column = NULL}
if(!is.null(sub.sequence.column)){if(!sub.sequence.column %in% colnames(VDJ)){stop("sub.sequence.column does not exist in VDJ dataframe")}}
if(sequence.region == "sub.sequence" & is.null(sub.sequence.column)){stop("sub.sequence.column is missing")}
if(missing(germline.pdb)){germline.pdb = NULL}
if(!is.null(germline.pdb)){if(!file.exists(germline.pdb)){stop("germline.pdb does not exist")}}
if("RMDS_germline" %in% properties & is.null(germline.pdb)){stop("germline.pdb is missing")}
if(missing(foldseek.dir)){foldseek.dir = NULL}
if(!is.null(foldseek.dir)){if(!dir.exists(foldseek.dir)){stop("foldseek.dir does not exist")}}
if("3di_germline" %in% properties & is.null(foldseek.dir)){stop("foldseek.dir is missing")}
#Set global variables for CRAN
resno <- NULL
resid <- NULL
#Based on Lucas' VDJ_structure_analysis function
calculate_charge <- function(pdb, chains, resnos_df){
df_charge <- data.frame()
pdb$atom$charge <- NULL
#Calculate the charge of each residue per chain
for(CHAIN in chains){
df_chain <- pdb$atom |> dplyr::filter(chain == CHAIN) |> dplyr::distinct(resno, .keep_all = TRUE) |> dplyr::select(resno, chain)
df_chain$charge <- pdb$atom |> dplyr::filter(chain == CHAIN) |> dplyr::distinct(resno, .keep_all = TRUE) |> dplyr::pull(resid) |> stringr::str_to_title() |> seqinr::a() |> Peptides::charge()
#Select residues
resnos <- resnos_df[resnos_df$chain == CHAIN,"resno"]
df_chain <- df_chain[df_chain$resno %in% resnos,]
df_charge <- rbind(df_charge, df_chain)
}
pdb$atom <- dplyr::left_join(pdb$atom, df_charge, by = c("resno","chain"))
#Calculate the average charge
average_charge = mean(stats::na.omit(pdb$atom$charge))
return(average_charge)
}
#Based on Lucas' VDJ_structure_analysis function
calculate_hydrophobicity <- function(pdb, chains, resnos_df){
df_hydph <- data.frame()
#Calculate the hydrophobicity of each residue per chain
for(CHAIN in chains){
df_chain <- pdb$atom |> dplyr::filter(chain == CHAIN) |> dplyr::distinct(resno, .keep_all = T) |> dplyr::select(resno,chain)
df_chain$hydrophobicity <- pdb$atom |> dplyr::filter(chain == CHAIN) |> dplyr::distinct(resno, .keep_all = T) |> dplyr::pull(resid) |> stringr::str_to_title() |> seqinr::a() |> Peptides::hydrophobicity()
#Select residues
resnos <- resnos_df[resnos_df$chain == CHAIN,"resno"]
df_chain <- df_chain[df_chain$resno %in% resnos,]
df_hydph <- rbind(df_hydph,df_chain)
}
pdb$atom <- dplyr::left_join(pdb$atom, df_hydph, by = c("resno","chain"))
#Calculate the average hydrophobicity
average_hydrophobicity = mean(stats::na.omit(pdb$atom$hydrophobicity))
return(average_hydrophobicity)
}
#Adapted from Lucas' Structure_utils.py
#Function to get the binding site residues in a PDB file
#* pdb object
#* chains_binder: The chain or a list of chains of the binders
#* chains_ligand: The chain or a list of chains of the ligands
#* cutoff: The cutoff in distance (A) between atoms to be part of the binding site.
get_bs_res <- function(pdb, chains_binder, chains_ligand, cutoff = 5) {
# Extract atoms for binders and ligands
atoms_binder <- pdb$atom[pdb$atom$chain %in% chains_binder, ]
atoms_ligand <- pdb$atom[pdb$atom$chain %in% chains_ligand, ]
# Extract coordinates
coord_binder <- atoms_binder[, c("x", "y", "z")]
coord_ligand <- atoms_ligand[, c("x", "y", "z")]
# Calculate the distance matrix
dist_mat <- bio3d::dist.xyz(coord_binder, coord_ligand)
# Apply cutoff to distance matrix
dist_mat_bin <- dist_mat <= cutoff
# Get binding site residues
bs_res_binder <- unique(atoms_binder[rowSums(dist_mat_bin) >= 1, c("chain", "resno")])
bs_res_ligand <- unique(atoms_ligand[colSums(dist_mat_bin) >= 1, c("chain", "resno")])
return(rbind(bs_res_binder, bs_res_ligand))
}
#Get the resnos from a subsequences in the PDB file
get_subseq_res <- function(pdb, sub_seq, CHAIN){
#Get the residues in the PDB file
pdb$atom |> dplyr::filter(chain %in% CHAIN) |> dplyr::distinct(resno, .keep_all = TRUE) |> dplyr::pull(resid, resno) -> resids
resids_df <- as.data.frame(resids)
resids_df$resno <- rownames(resids_df)
resids_df$resids <- sapply(resids_df$resids, function(x){x <- stringr::str_to_title(x);return(seqinr::a(x))})
#Align the subsequence to the PDB sequence
pwa <- pwalign::pairwiseAlignment(sub_seq, paste(resids_df$resids, collapse = ""))
aligned_subseq <- as.character(pwalign::alignedPattern(pwa))
#Get the matching indices
ind <- which(strsplit(aligned_subseq, "")[[1]] != "-")
#Get the corresponding resnos
resnos <- resids_df$resno[ind]
resnos <- data.frame(chain = CHAIN, resno = as.numeric(resnos))
return(resnos)
}
#From Steropodon
calculate_pKa_shift <- function(propka_out, filename, resnos){
#Create propka dataframe
start <- grep('SUMMARY OF THIS PREDICTION', propka_out)
end <- grep('Free energy of', propka_out)
pka <- propka_out[(start+1):(end-3)]
pka <- paste(pka, collapse = '\n')
tc <- textConnection(pka)
df <- utils::read.table(tc, as.is=T, fill=T, blank.lines.skip=F)
colnames(df) <- c("aa", "resno", "chain", "pKa", "model-pKa")
df <- df[2:nrow(df),]
df$resno <- as.numeric(as.character(df$resno))
#Filter on chain and residues
df <- stats::na.omit(dplyr::left_join(resnos, df, by = c("resno" = "resno", "chain" = "chain")))
#Calculate the pKa shift
df[,c("pKa", "model.pKa")] <- lapply(df[,c("pKa", "model-pKa")], as.numeric)
df$pKa_shift <- df$pKa - df$model.pKa
average_pKa_shift <- mean(df$pKa_shift)
return(average_pKa_shift)
}
calculate_free_energy <- function(propka_out, free_energy_pH){
start <- grep('Free energy of', propka_out)
end <- grep('The pH of optimum stability', propka_out)
en <- propka_out[(start+1):(end-2)]
en <- paste(en, collapse = '\n')
tc <- textConnection(en)
df <- utils::read.table(tc, as.is=T, fill=T, blank.lines.skip=F)
close(tc)
free_energy <- df[df$V1 == free_energy_pH, "V2"]
return(free_energy)
}
#Calculate the RMSD to the germline
calculate_RMSD_germline <- function(pdb, germline.pdb, chains, resnos_df){
#Germline pdb and CA indices
pdb_germline <- bio3d::read.pdb(germline.pdb)
ca.inds.germline <- c()
ca.inds <- c()
for(CHAIN in chains){
resnos <- resnos_df[resnos_df$chain == CHAIN,"resno"]
ca.inds.germline <- bio3d::combine.select(ca.inds.germline,
bio3d::atom.select(pdb_germline, "calpha", chain = chains, resno = resnos),
verbose = F)
ca.inds <- bio3d::combine.select(ca.inds, bio3d::atom.select(pdb, "calpha", chain = chains, resno = resnos), verbose = F)
}
#Calculate RMSD from germline
rmsd <- bio3d::rmsd(pdb, pdb_germline, fit = T, a.inds = ca.inds, b.inds = ca.inds.germline)
return(rmsd)
}
#Calculate the edit distance between the 3di sequences of a node and the germline
calculate_3di_distance <- function(pdb, foldseek.dir, file.name, germline.df, chains, resnos_df, sequence.region, germline.pdb.file){
#Read the sequence 3Di dataframe
seqs_3di <- utils::read.table(paste0(foldseek.dir,substr(file.name, 1, nchar(file.name)-4),
".csv"), header = T, sep = ",")
#Store the lv distance for each chain
distance_chain_vector <- c()
for (CHAIN in chains){
#Get the 3Di subsequence
resnos <- resnos_df[resnos_df$chain == CHAIN,"resno"]
sub_seqs_3di <- paste0(strsplit(seqs_3di[seqs_3di$chain == CHAIN,2], "")[[1]][resnos], collapse = "")
#Get the germline subsequence
if(sequence.region == "full.sequence" | sequence.region == "sub.sequence"){
sub_gl_3di <- germline.df[germline.df$chain == CHAIN,2]
}
if(sequence.region == "binding.residues"){
germline.pdb <- bio3d::read.pdb(germline.pdb.file)
gl_resnos <- get_bs_res(germline.pdb, chains_binder = c("A", "B"), chains_ligand = "C")
gl_resnos <- gl_resnos[resnos_df$chain == CHAIN,"resno"]
sub_gl_3di <- paste0(strsplit(germline.df[germline.df$chain == CHAIN,2], "")[[1]][gl_resnos], collapse = "")
}
distance_chain <- stringdist::stringdist(sub_seqs_3di, sub_gl_3di, method = 'lv')
distance_chain_vector <- c(distance_chain_vector, distance_chain)
}
distance_3di <- sum(distance_chain_vector)
return(distance_3di)
}
#Add the pLDDT score to the VDJ dataframe, this is stored in the b-factor field of the pdb file (unlike a b-factor, higher pLDDT is better)
calculate_plddt <- function(pdb, chains, resnos){
#select residues
df <- dplyr::left_join(resnos, pdb$atom, by = c("resno" = "resno", "chain" = "chain"))
#Calculate average plddt
average_plddt <- mean(df$b)
return(average_plddt)
}
#Get list of files if file.df is null
if(is.null(file.df)){file.df = list.files(pdb.dir)}
#Add the empty properties columns to the VDJ dataframe
if("charge" %in% properties){VDJ$average_charge <- NA}
if("hydrophobicity" %in% properties){VDJ$average_hydrophobicity <- NA}
if("RMSD_germline" %in% properties){VDJ$RMSD_germline <- NA}
if("pKa_shift" %in% properties){VDJ$average_pKa_shift <- NA}
if("free_energy" %in% properties){VDJ$free_energy <- NA}
if("pLDDT" %in% properties){VDJ$average_pLDDT <- NA}
#Set the chains
if(chain == "HC+LC"){chains = c("A", "B")}
if(chain == "HC"){chains = "A"}
if(chain == "LC"){chains = "B"}
if(chain == "AG"){chains = "C"}
if(chain == "whole.complex"){chains = c("A", "B", "C")}
#Calculate properties for each structure
for(row in seq(1:nrow(file.df))){
file = file.df[row,]
if(file$sequence %in% VDJ$barcode){
#Read in the pdb file
pdb <- bio3d::read.pdb(paste0(pdb.dir, file$file_name))
#Get the residues for calculations of average properties
if(sequence.region == "full.sequence"){
resnos <- unique(pdb$atom[pdb$atom$chain %in% chains, c('chain', "resno")])
}
if(sequence.region == "binding.residues"){
resnos <- get_bs_res(pdb, chains_binder = c("A", "B"), chains_ligand = "C")
}
if(sequence.region == "sub.sequence"){
#get the subsequence
sub_seq <- VDJ[VDJ$barcode == file$sequence,sub.sequence.column]
resnos <- get_subseq_res(pdb, sub_seq, chains)
}
if("charge" %in% properties){
#Calculate average charge
average_charge <- calculate_charge(pdb, chains, resnos)
#Add to the VDJ dataframe
VDJ[VDJ$barcode == file$sequence,]$average_charge <- average_charge
}
if("hydrophobicity" %in% properties){
#Calculate average hydrophobicity
average_hydrophobicity <- calculate_hydrophobicity(pdb, chains, resnos)
#Add to the VDJ dataframe
VDJ[VDJ$barcode == file$sequence,]$average_hydrophobicity <- average_hydrophobicity
}
if("RMSD_germline" %in% properties){
#Calculate the RMSD to the germline
RMSD_germline <- calculate_RMSD_germline(pdb, germline.pdb, chains, resnos)
#Add to the VDJ dataframe
VDJ[VDJ$barcode == file$sequence,]$RMSD_germline <- RMSD_germline
}
if("3di_germline" %in% properties){
#Get the germline 3di sequence
germline_3di <- tryCatch({
germline_3di <- utils::read.csv(paste0(foldseek.dir, "/", substr(file.df[file.df$sequence == "germline", "file_name"], 1,
nchar(file.df[file.df$sequence == "germline", "file_name"])-4),".csv"),
header = T, sep = ",")
}, error = function(e){
warning(paste0("Sequence ", file$sequence, " not found in foldseek output"))
germline_3di = NULL
})
if(!is.null(germline_3di)){
#Calculate the 3di distance to the germline
foldseek_germline <- calculate_3di_distance(pdb, foldseek.dir, germline.df = germline_3di, file.name = file$file_name,
chains, resnos, sequence.region, germline.pdb)
#Add to the VDJ dataframe
VDJ[VDJ$barcode == file$sequence,"3di_germline"] <- foldseek_germline
}
}
if("pKa_shift" %in% properties | "free_energy" %in% properties){
#Read in the propka output
propka_out <- readLines(paste0(propka.dir, "/", substr(file$file_name, 1, nchar(file$file_name)-4),".pka"))
if("pKa_shift" %in% properties){
#Calculate the pKa shift
average_pKa_shift <- calculate_pKa_shift(propka_out, file$file_name, resnos)
#Add to the VDJ dataframe
VDJ[VDJ$barcode == file$sequence,]$average_pKa_shift <- average_pKa_shift
}
if("free_energy" %in% properties){
#Calculate the free energy
free_energy <- calculate_free_energy(propka_out, free_energy_pH)
#Add to the VDJ dataframe
VDJ[VDJ$barcode == file$sequence,]$free_energy <- free_energy
}
}
if("pLDDT" %in% properties){
#Calculate the pLDDT
average_pLDDT <- calculate_plddt(pdb, chains, resnos)
#Add the pLDDT score to the VDJ dataframe
VDJ[VDJ$barcode == file$sequence,]$average_pLDDT <- average_pLDDT
}
} else {
warning(paste0("Sequence ", file$sequence, " not found in VDJ dataframe"))
next
}
}
return(VDJ)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.