epitopes: Processing and Feature Extraction for Epitope Data

Documented in filter_epitopes

#' Filter epitopes by taxonomy and host
#'
#' Filters an epitope dataframe by source organism and/or host IDs.
#'
#' @param df *data.table* object of class *LBCE_dt* (generated by [get_LBCE()]),
#'        *joined_epit_dt* (generated by [prepare_join_df()]) or
#'        *windowed_epit_dt* (generated by [make_window_df()])
#' @param orgIDs vector of organism taxon IDs that we want to retain
#'        (using *sourceOrg_id*).
#' @param hostIDs vector of host taxon IDs that we want to retain
#'        (using *host_id*).
#' @param removeIDs vector of organism taxon IDs that we want to remove
#'        (using *sourceOrg_id*).
#' @param tax_load_file optional taxonomy file (RDS file generated either by this
#'        function or by [get_taxonomy()]).
#' @param tax_list optional taxonomy list (to be passed instead of tax_load_file)
#' @param tax_save_folder optional folder saving the taxonomy. Ignored
#'        if **tax_load_file** is not _NULL_.
#'
#' @return Epitope dataframe filtered by the criteria in **orgIDs**, **hostIDs** and
#'         **removeIDs**.
#'
#' @author Felipe Campelo (\email{f.campelo@@aston.ac.uk})
#'
#' @export
#'

filter_epitopes <- function(df,
                            orgIDs    = NULL,
                            hostIDs   = NULL,
                            removeIDs = NULL,
                            tax_load_file = NULL,
                            tax_list    = NULL,
                            tax_save_folder = NULL) {

  # ========================================================================== #
  # Sanity checks and initial definitions
  id_classes <- c("NULL", "numeric", "integer", "character")
  df_classes <- c("LBCE_dt", "joined_epit_dt", "windowed_epit_dt")
  assertthat::assert_that(is.data.frame(df),
                          any(class(df)   %in%  df_classes),
                          class(orgIDs)    %in% id_classes,
                          class(hostIDs)   %in% id_classes,
                          class(removeIDs) %in% id_classes,
                          is.null(tax_load_file)   | is.character(tax_load_file),
                          is.null(tax_list)        | is.list(tax_list),
                          is.null(tax_save_folder) | is.character(tax_save_folder),
                          is.null(tax_save_folder) | length(tax_save_folder) == 1)

  if(!is.null(tax_load_file)) assertthat::assert_that(file.exists(tax_load_file))

  # Standardise relevant variables:
  df_type <- class(df)[grep("_dt", class(df))]
  if(df_type %in% c("LBCE_dt", "joined_epit_dt")){
    ids <- data.frame(org = df$sourceOrg_id, host = df$host_id)
  } else {
    ids <- data.frame(org = df$Info_sourceOrg_id, host = df$Info_host_id)
  }

  if (is.null(tax_load_file) & is.null(tax_list)){
    tax  <- get_taxonomy(unique(ids$org), save_folder = tax_save_folder)
  } else if (!is.null(tax_list)){
    tax <- tax_list
  } else {
    tax <- readRDS(tax_load_file)
  }

  # Extract the relevant organism IDs using the taxonomy data
  fextr <- function(x, tid){
    ul <- c(x$Taxonomy$UID, x$UID)
    if(any(tid %in% ul)) return(x$UID)
  }

  target_org  <- unlist(sapply(tax, FUN = fextr, tid = orgIDs))
  target_host <- unlist(sapply(tax, FUN = fextr, tid = hostIDs))
  target_rm   <- unlist(sapply(tax, FUN = fextr, tid = removeIDs))
  target_org  <- unique(c(target_org, orgIDs))
  target_host <- unique(c(target_host, hostIDs))
  target_rm   <- unique(c(target_rm, removeIDs))

  # Get the filtering indices
  fmatch <- function(pat, trg){
    str <- unlist(strsplit(pat, split = ",", fixed = TRUE))
    any(str %in% trg)
  }
  if (!is.null(orgIDs)){
    idx1 <- which(sapply(ids$org, FUN = fmatch, trg = target_org))
    ids  <- ids[idx1, ]
    df <- df[idx1, ]
  }

  if (!is.null(hostIDs)){
    idx2 <- which(sapply(ids$host, FUN = fmatch, trg = target_host))
    ids  <- ids[idx2, ]
    df <- df[idx2, ]
  }

  if (!is.null(removeIDs)){
    idx3 <- which(sapply(ids$org, FUN = fmatch, trg = target_rm))
    if (length(idx3) > 0){
      ids  <- ids[-idx3, ]
      df <- df[-idx3, ]
    }
  }


  if(df_type %in% c("LBCE_dt", "joined_epit_dt")){
    # Initialize dummy variables to prevent CRAN notes
    sourceOrg_id <- protein_id <- epitope_id <- NULL
    # Sort data.table (the variable names referred below are internal to df)
    df <- df[order(sourceOrg_id, protein_id, epitope_id), ]
  } else {
    # Initialize dummy variables to prevent CRAN notes
    Info_sourceOrg_id <- Info_protein_id <- NULL
    Info_epitope_id   <- Info_center_pos <- NULL
    df <- df[order(Info_sourceOrg_id, Info_protein_id, Info_epitope_id,
                   Info_center_pos), ]
  }

  return(df)

}