R/calc_features.R

Defines functions calc_features

Documented in calc_features

#' Calculate statistical and physicochemical features for peptides
#'
#' This function is used to calculate several distinct families of features for
#' a vector of peptides.
#'
#' This function is partially based on functions [Peptides::aaComp()]
#' and [Peptides::aaDescriptors()] (check documentation for details).
#'
#' Some of the features are calculated based on an aminoacid propensity scale,
#' which was originally obtained from *E.L. Ulrich et al., "BioMagResBank".
#' Nucleic Acids Research 36, D402-D408 (2008) DOI: 10.1093/nar/gkm957*. The
#' data was downloaded from
#' [http://www.bmrb.wisc.edu/ref_info/aadata.dat](http://www.bmrb.wisc.edu/ref_info/aadata.dat)
#' on July 1, 2020.
#'
#' @param df a *data.table* of class *windowed_epit_dt* or *windowed_prot_dt*,
#'        generated by [make_window_df()]
#' @param max.N maximum length of N-peptide frequency features to be calculated.
#' @param ncpus number of cores to use.
#' @param save_folder path to folder for saving the output.
#'
#' @return Data table containing the calculated features appended as new columns.
#'
#' @author Felipe Campelo (\email{f.campelo@@aston.ac.uk});
#'         Jodie Ashford (\email{ashfojsm@@aston.ac.uk})
#'
#' @export
#'
calc_features <- function(df,
                          max.N = 2,
                          ncpus = 1,
                          save_folder = NULL){
  # ========================================================================== #
  # Sanity checks and initial definitions
  df_classes <- c("windowed_epit_dt", "windowed_prot_dt")
  assertthat::assert_that(is.data.frame(df),
                          any(class(df) %in% df_classes),
                          assertthat::is.count(max.N),
                          assertthat::is.count(ncpus))

  type <- "prot"
  if("windowed_epit_dt" %in% class(df)){
    type <- "epit"
  }

  # Check save folder and create file names
  if(!is.null(save_folder)) {
    if(!dir.exists(save_folder)) dir.create(save_folder)
    ymd <- gsub("-", "", Sys.Date())
    df_file <- paste0(normalizePath(save_folder), "/", ymd, "_df_", type,
                      "_feats.rds")
    errfile <- paste0(normalizePath(save_folder), "/", ymd, "_df_", type,
                      "_removed.rds")
  }

  # aa_codes <- get_aa_codes()
  # invalid <- which(!sapply(df$Info_window_seq,
  #                          function(x){
  #                            all(strsplit(x, split = "")[[1]] %in% aa_codes)
  #                          }))
  #
  # if(length(invalid) > 0){
  #   errlist <- df[invalid, ]
  #   df      <- df[-invalid, ]
  #   if(!is.null(save_folder)) saveRDS(errlist, errfile)
  # }
  # ========================================================================== #

  # Calculate features
  cat("\nCalculating features")
  if (ncpus > 1) {
    DTthreads_old <- data.table::getDTthreads(verbose = FALSE)
    data.table::setDTthreads(threads = 1)
  }
  tmp <- mypblapply(ncpus   = ncpus,
                    X       = df$Info_window_seq,
                    FUN     = feat_calc,
                    max.N   = max.N)
  if (ncpus > 1) {
    data.table::setDTthreads(threads = DTthreads_old)
  }

  tmp <- data.table::rbindlist(tmp, use.names = TRUE, fill = TRUE)

  df <- cbind(df, tmp)

  # Sort data.table (The variable names are initialised below just to
  # prevent NOTEs on CRAN. The data.table ordering uses references to variables
  # internal to df)
  Info_sourceOrg_id <- Info_protein_id <- NULL
  Info_epitope_id   <- Info_center_pos <- NULL
  Info_UID <- NULL
  if (type == "epit"){
    df <- df[order(Info_sourceOrg_id, Info_protein_id, Info_epitope_id,
                   Info_center_pos), ]
    class(df) <- unique(c(class(df), "windowed_epit_dt"))
  } else {
    df <- df[order(Info_UID, Info_center_pos), ]
    class(df) <- unique(c(class(df), "windowed_prot_dt"))
  }

  if(!is.null(save_folder)) {
    saveRDS(df, df_file)
  }

  return(df)
}
fcampelo/epitopes documentation built on April 22, 2023, 12:23 a.m.