epitopes: Processing and Feature Extraction for Epitope Data

Documented in make_OrgSpec_datasets

#' Build organism-specific data sets for training epitope prediction models
#'
#' This function extracts organism or taxon-specific datasets from IEDB data
#' and returns data sets for the development and assessment of epitope
#' prediction models.
#'
#' @section Data splitting and BLAST requirements:
#' If the sum of `split_perc` is less than 100 an extra split is generated
#' with the remaining observations - e.g., `split_perc = c(50, 30)` results in
#' three sets with an approximately 50/30/20% split *of the total observations.*
#' If the sum is greater than 100 the splits are linearly scaled down so that
#' the sum becomes 100. Note that the split percents correspond to the number of
#' observations, not the number of unique IDs.
#'
#' This function will attempt to approximate the desired split levels, but
#' depending on the size of the data set and the desired `split_level` it may
#' not be possible (e.g., if `split_level = "org"` and a single organism
#' corresponds to 90% of the data, one of the splits will necessarily correspond
#' to at least 90% of the data, regardless of the values informed in
#' `split_perc`).
#'
#' If `split_level == "prot` the routine will keep any pairs of proteins
#' having (coverage >= `coverage_threshold` AND
#' identity >=  `identity_threshold`) under the same split. This is useful to
#' prevent accidental data leakage due to quasi-identical proteins with
#' different UIDs.
#' **NOTE**: this will require BLAST+ to be installed in your
#' local machine. For details on how to set up BLAST+ on your machine, check
#' <https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download>.
#' This function was developed for versions `blast 2.10.0` or later.
#'
#' @inheritParams prepare_join_df
#' @inheritParams filter_epitopes
#' @inheritParams split_epitope_data
#' @param taxonomy_list list containing taxonomy information
#'        (generated by [get_taxonomy()])
#' @param orgIDs vector of organism/taxon IDs to retain (see
#'        [filter_epitopes()]). If `NULL` then no organism ID  filtering is
#'        performed.
#' @param hostIDs vector of host IDs to retain  (see [filter_epitopes()]).
#'        If `NULL` then no host filtering is performed.
#' @param removeIDs vector of organism IDs to remove (see [filter_epitopes()]).
#' Useful for, e.g., using a Class-level `orgIDs` and removing some species
#' or genera. If `NULL` then no removal is performed.
#' @param window_size positive integer, size of window to use.
#' @param max.N maximum length of N-peptide frequency features to be calculated.
#' @param ncpus number of cores to use for data windowing and feature
#'              calculation.
#'
#' @return List containing the resulting datasets.
#'
#' @author Felipe Campelo (\email{f.campelo@@aston.ac.uk})
#'
#' @export
#'
make_OrgSpec_datasets <- function(epitopes, proteins, taxonomy_list,
                                  orgIDs          = NULL,
                                  hostIDs         = NULL,
                                  removeIDs       = NULL,
                                  save_folder     = "./",
                                  min_epit        = 8,
                                  max_epit        = 25,
                                  only_exact      = FALSE,
                                  pos.mismatch.rm = "all",
                                  set.positive    = "mode",
                                  window_size     = 2 * min_epit - 1,
                                  max.N           = 2,
                                  split_level     = "prot",
                                  split_perc      = c(75, 25),
                                  split_names     = c("01_training", "02_holdout"),
                                  coverage_threshold = 80,
                                  identity_threshold = 80,
                                  ncpus           = 1){

  # ========================================================================== #
  # Sanity checks and initial definitions
  id_classes <- c("NULL", "numeric", "integer", "character")
  assertthat::assert_that(class(orgIDs)    %in% id_classes,
                          class(hostIDs)   %in% id_classes,
                          class(removeIDs) %in% id_classes,
                          assertthat::is.count(min_epit),
                          assertthat::is.count(max_epit),
                          min_epit <= max_epit,
                          pos.mismatch.rm %in% c("all", "align"),
                          set.positive    %in% c("any", "mode", "all"),
                          is.logical(only_exact) & length(only_exact) == 1,
                          assertthat::is.count(window_size),
                          assertthat::is.count(max.N),
                          assertthat::is.count(ncpus),
                          is.character(save_folder),
                          length(save_folder) == 1,
                          is.character(split_level), length(split_level) == 1,
                          split_level %in% c("org", "prot", "epit"),
                          is.numeric(split_perc),
                          all(sapply(split_perc, assertthat::is.count)),
                          is.null(split_names) | is.character(split_names))

  if(!dir.exists(save_folder)) dir.create(save_folder, recursive = TRUE)
  # ========================================================================== #

  # Join proteins onto epitope dataset
  jdf <- prepare_join_df(epitopes = epitopes, proteins = proteins,
                         min_epit = min_epit, max_epit = max_epit,
                         only_exact = only_exact,
                         pos.mismatch.rm = pos.mismatch.rm,
                         set.positive = set.positive)

  # Filter epitopes by organism / host IDs
  jdf <- filter_epitopes(df        = jdf,
                         orgIDs    = orgIDs,
                         hostIDs   = hostIDs,
                         removeIDs = removeIDs,
                         tax_list  = taxonomy_list)

  # Prepare windowed representation
  wdf <- make_window_df(df = jdf,
                        window_size = window_size,
                        ncpus = ncpus)

  # Calculate features
  wdf <- calc_features(wdf, max.N = max.N, ncpus = ncpus)

  # Select relevant proteins
  prots <- proteins[which(proteins$UID %in% unique(jdf$protein_id)), ]


  if (split_level == "prot") {
    # Check that BLASTp is installed
    a <- tryCatch(system2("blastp", args = "-version",
                          stdout = TRUE, stderr = TRUE),
                  error   = function(c){"Nope!"},
                  warning = function(c){"Nope!"})
    if (a == "Nope!"){
      message("\nBLASTp not installed.
              \rPlease check function documentation for instructions.
              \rSplitting will be done using only protein IDs.")

    } else {
      blast_v <- strsplit(a[1], split = ": ", fixed = TRUE)[[1]][2]
      BLAST_path <- paste0(save_folder, "/BLASTp")
      if(!dir.exists(BLAST_path)) dir.create(BLAST_path)

      # Run BLASTp to determine protein similarities
      fn         <- paste0(BLAST_path, "/prots.fasta")
      seqinr::write.fasta(as.list(prots$TSeq_sequence), names = prots$UID,
                          file.out = fn, as.string = TRUE, nbchar = 100000)

      #if(gsub("\\+", "", blast_v) == "2.10.0"){
      blast_call <- paste0("blastp -query ", fn, " -db ", fn, " -seg no ",
                           "-outfmt '6 qseqid sseqid pident qcovhsp' > ",
                           fn, "-BLAST")
      # } else {
      #   blast_call <- paste0("blastp -query ", fn, " -db ", fn, " -seg no ",
      #                        "-outfmt '6 qseqid sseqid length qlen slen nident pident qcovhsp mismatch gaps qstart qend sstart send evalue score' > ",
      #                        fn, "-BLAST")
      #
      #
      # }
      system(paste0("makeblastdb -in ", fn, " -dbtype prot"))
      system(blast_call)
    }
  }

  # Split data into sets
  splits <- split_epitope_data(wdf,
                               split_level = split_level,
                               split_perc  = split_perc,
                               split_names = split_names,
                               blast_file  = unlist(ifelse(a[1] == "Nope!",
                                                           list(NULL),
                                                           paste0(fn, "-BLAST"))),
                               coverage_threshold = coverage_threshold,
                               identity_threshold = identity_threshold)

  # Save results
  for (i in seq_along(splits)){
    saveRDS(splits[[i]]$wdf,
            file = paste0(save_folder, "/", names(splits)[i], ".rds"))
  }

  return(splits)
}