R/marker_ranking_boruta.R

Defines functions marker_ranking_boruta

Documented in marker_ranking_boruta

#' @title Marker Ranking by Boruta
#' @description Function to run Boruta on the stimulation - cell population combinations that passed the Fisher's exact test to rank the markers according to their contribution to the response.
#' @param mapped_data      Returned list from the \code{\link{HDStIM}} function.
#' @param path             Path to the folder to save figures generated by this function.
#' @param n_cells          Number of cells to down sample the data. Default is NULL to include all the cells.
#' @param max_runs         Maximum number of runs for the random forest algorithm. Default is 100.
#' @param seed_val         Seed value for Boruta. Default is 123.
#' @param verbose          0, 1, or 2. Default is 0.
#' @return                 A list with a tibble containing attribute statistics calculated by Boruta and ggplot objects. If the path is not NULL, plots are also rendered and saved in the specified folder in PNG format.
#' @import Boruta ggplot2
#' @importFrom tibble as_tibble
#' @importFrom dplyr slice_sample
#' @importFrom grDevices png dev.off rainbow
#' @importFrom graphics plot
#' @importFrom stats as.formula
#' @export
#'
#' @examples
#' \donttest{
#' mapped_data <- HDStIM(chi11$expr_data, chi11$state_markers,
#'                       chi11$cluster_col, chi11$stim_label,
#'                       chi11$unstim_label, seed_val = 123, umap = FALSE, umap_cells = NULL,
#'                       verbose = FALSE)
#'
#' marker_ranking <- marker_ranking_boruta(mapped_data, path = NULL, n_cells = NULL,
#'                                         max_runs = 1000, seed_val = 123,
#'                                         verbose = 0)
#' }
marker_ranking_boruta <- function(mapped_data, path = NULL, n_cells = NULL, max_runs = 100, seed_val = 123, verbose = 0){
  # Check if path exists; if not then create it.
  if(!is.null(path)){
    if(!dir.exists(path)){
      if(verbose > 0){message(paste("Creating %s folder", path))}
      dir.create(path, recursive = TRUE)
    } else {
      if(verbose > 0){message(paste(path, "folder already exists. Output will be over written."))}
    }
  }

  # Bind global variables.
  comb_no <- maxImp <- medianImp <- minImp <- normHits <- state_marker <- decision <- NULL

  state_markers <- mapped_data$state_markers
  dat_boruta <- mapped_data$response_mapping_main
  form_boruta <- as.formula(paste0("k_cluster_id ~ ",paste0(state_markers, collapse = " + ")))

  grouped <- group_by(dat_boruta, comb_no)
  split_data <- group_split(grouped)

  df_stats_out <- data.frame()

  allplots <- list()
  for(i in 1:length(split_data)){
    # dat <- split_data[[1]] # For debugging.
    dat <- split_data[[i]]
    if(!is.null(n_cells)){
      if(n_cells < nrow(dat)){
        set.seed(seed_val)
        dat <- dat %>% dplyr::slice_sample(n = n_cells)
      }
    }

    stim <- as.character(unique(dat$stim_type)[unique(dat$stim_type) %in% mapped_data$stim_label])
    clust <- as.character(unique(dat$cell_population))

    # Run Boruta.
    set.seed(seed_val)
    res_boruta <- Boruta(form_boruta, data = dat, doTrace = verbose, maxRuns = max_runs)
    att_stats <- attStats(res_boruta)

    # Estimate variable importance and generate output data frame.
    df_imp <- tibble::rownames_to_column(att_stats, "state_marker")
    df_imp <- df_imp[order(df_imp$medianImp),]
    df_imp$state_marker <- factor(df_imp$state_marker, levels = df_imp$state_marker)
    df_imp$decision <- as.character(df_imp$decision)
    df_imp <- cbind("stim_type" = stim, "cell_population" = clust, df_imp)
    df_stats_out <- rbind(df_stats_out, df_imp)

    # Plot.
    plot_title <- paste0("Stim.: ", stim,"\nCell Pop.: ", clust)
    my_cols <- c("Confirmed" = "darkgreen", "Tentative" = "darkblue", "Rejected" = "darkred")

    att_plot <- ggplot(df_imp, aes(x = state_marker, y = medianImp, col = decision)) +
      geom_point() +
      geom_errorbar(aes(ymin=minImp, ymax=maxImp), width = 0.2) +
      labs(x = "State Marker", y = "Importance", title = plot_title, col = "Decision") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
      scale_colour_manual(values = my_cols)

    allplots[[i]] <- att_plot

    if(!is.null(path)){
      att_file <- paste0("imp_", stim, "_", clust, ".png")
      ggsave(att_file, plot = att_plot, path = path,
             device = "png", dpi = 300, width = 7, height = 6, units = "in")
    }
  }
  return(list("attribute_stats" = as_tibble(df_stats_out),
         "plots" = allplots))
}
niaid/HDStIM documentation built on Oct. 15, 2023, 4:43 p.m.