#' Calculate statistical and physicochemical features for peptides
#'
#' This function is used to calculate several distinct families of features for
#' a vector of peptides.
#'
#' This function is partially based on functions [Peptides::aaComp()]
#' and [Peptides::aaDescriptors()] (check documentation for details).
#'
#' Some of the features are calculated based on an aminoacid propensity scale,
#' which was originally obtained from *E.L. Ulrich et al., "BioMagResBank".
#' Nucleic Acids Research 36, D402-D408 (2008) DOI: 10.1093/nar/gkm957*. The
#' data was downloaded from
#' [http://www.bmrb.wisc.edu/ref_info/aadata.dat](http://www.bmrb.wisc.edu/ref_info/aadata.dat)
#' on July 1, 2020.
#'
#' @param df a *data.table* of class *windowed_epit_dt* or *windowed_prot_dt*,
#' generated by [make_window_df()]
#' @param max.N maximum length of N-peptide frequency features to be calculated.
#' @param ncpus number of cores to use.
#' @param save_folder path to folder for saving the output.
#'
#' @return Data table containing the calculated features appended as new columns.
#'
#' @author Felipe Campelo (\email{f.campelo@@aston.ac.uk});
#' Jodie Ashford (\email{ashfojsm@@aston.ac.uk})
#'
#' @export
#'
calc_features <- function(df,
max.N = 2,
ncpus = 1,
save_folder = NULL){
# ========================================================================== #
# Sanity checks and initial definitions
df_classes <- c("windowed_epit_dt", "windowed_prot_dt")
assertthat::assert_that(is.data.frame(df),
any(class(df) %in% df_classes),
assertthat::is.count(max.N),
assertthat::is.count(ncpus))
type <- "prot"
if("windowed_epit_dt" %in% class(df)){
type <- "epit"
}
# Check save folder and create file names
if(!is.null(save_folder)) {
if(!dir.exists(save_folder)) dir.create(save_folder)
ymd <- gsub("-", "", Sys.Date())
df_file <- paste0(normalizePath(save_folder), "/", ymd, "_df_", type,
"_feats.rds")
errfile <- paste0(normalizePath(save_folder), "/", ymd, "_df_", type,
"_removed.rds")
}
# aa_codes <- get_aa_codes()
# invalid <- which(!sapply(df$Info_window_seq,
# function(x){
# all(strsplit(x, split = "")[[1]] %in% aa_codes)
# }))
#
# if(length(invalid) > 0){
# errlist <- df[invalid, ]
# df <- df[-invalid, ]
# if(!is.null(save_folder)) saveRDS(errlist, errfile)
# }
# ========================================================================== #
# Calculate features
cat("\nCalculating features")
if (ncpus > 1) {
DTthreads_old <- data.table::getDTthreads(verbose = FALSE)
data.table::setDTthreads(threads = 1)
}
tmp <- mypblapply(ncpus = ncpus,
X = df$Info_window_seq,
FUN = feat_calc,
max.N = max.N)
if (ncpus > 1) {
data.table::setDTthreads(threads = DTthreads_old)
}
tmp <- data.table::rbindlist(tmp, use.names = TRUE, fill = TRUE)
df <- cbind(df, tmp)
# Sort data.table (The variable names are initialised below just to
# prevent NOTEs on CRAN. The data.table ordering uses references to variables
# internal to df)
Info_sourceOrg_id <- Info_protein_id <- NULL
Info_epitope_id <- Info_center_pos <- NULL
Info_UID <- NULL
if (type == "epit"){
df <- df[order(Info_sourceOrg_id, Info_protein_id, Info_epitope_id,
Info_center_pos), ]
class(df) <- unique(c(class(df), "windowed_epit_dt"))
} else {
df <- df[order(Info_UID, Info_center_pos), ]
class(df) <- unique(c(class(df), "windowed_prot_dt"))
}
if(!is.null(save_folder)) {
saveRDS(df, df_file)
}
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.