GINIR: Genetic inteRaction and EssenTiality neTwork mApper

Documented in GI_screen_perms

#' @title Perform permutation test n genetic interaction screen
#' 
#' @description Randomly samples dependency probabilities from a given mutant and control group screened.
#' 
#' @param control_id string, A vector containing two or more DepMap_id, Default: NULL
#' @param mutant_id string, A vector containing two or more DepMap_id, Default: NULL
#' @param rnai_screen logical, TRUE if performing analysis using RNAi screen data, FALSE (default) performing analysis using CRISPR KO data, Default: FALSE
#' @param n_perm integer, Number of permutations to run, Default: 100
#' @param core_num integer, Number of cores to run analysis, Default: NULL
#' @param output_dir string, Full path to where output file should be saved, Default: NULL
#' @param data_dir string Path to GRETTA_data
#' @param filename string name of file without the '.csv' extension. 
#'
#' @return A data frame containing results from the genetic screen. A copy is also saved to the 
#' directory defined in `output_dir`.
#' 
#' @details Description of output data frame
#' * `perm_test` - permutation iteration
#' * `_median`, `_mean` - Control and mutant group's median, mean, of dependency probabilities. Dependency probabilities range from zero to one, 
#' where one indicates a essential gene (ie. KO of gene was lethal) and zero indicates a non-essential gene 
#' (KO of gene was not lethal)
#' * `Pval` - P-value from Mann Whitney U test between control and mutant groups.
#' * `n_perm` - Number of permutations assigned
#' @md
#' 
#' @examples 
#' gretta_data_dir <- './GRETTA_example/'
#' gretta_output_dir <- './GRETTA_example_output/'
#' 
#' if(!dir.exists(gretta_data_dir)){
#'   download_example_data(".")
#' }
#' 
#' if(!dir.exists(gretta_data_dir)){
#'   download_example_data(".")
#' }
#' 
#' Screen_results <- GI_screen_perms(
#' control_id = c('ACH-001354', 'ACH-000274', 'ACH-001799'), 
#' mutant_id = c('ACH-000911', 'ACH-001957', 'ACH-000075'), 
#' n_perm = 10,
#' core_num = 2, 
#' output_dir = gretta_output_dir,
#' data_dir = gretta_data_dir)
#' 
#' @rdname GI_screen_perms
#' @export 
#' @importFrom parallel detectCores
#' @importFrom doMC registerDoMC
#' @importFrom dplyr mutate filter group_by summarize pull rename select count
#' @importFrom forcats fct_relevel
#' @importFrom foreach `%dopar%` foreach
#' @importFrom rcompanion cliffDelta
#' @importFrom diptest dip.test
#' @importFrom broom tidy
#' @importFrom tidyr pivot_longer pivot_wider
#' @importFrom tidyselect everything
#' @importFrom readr write_csv
#' @importFrom stats median sd IQR wilcox.test p.adjust

GI_screen_perms <- function(control_id = NULL, mutant_id = NULL, 
  rnai_screen = FALSE, n_perm = 100,
  core_num = NULL, output_dir = NULL,
  data_dir = NULL, filename = NULL) {
  
  # Check that essential inputs are given:
  if (is.null(control_id)) {
    stop("No control IDs detected")
  }
  if (is.null(mutant_id)) {
    stop("No mutant IDs detected")
  }
  if (is.null(core_num)) {
    cores_detected <- parallel::detectCores()
    message("No cores specified")
    message("Detected: ", cores_detected, " cores")
    message("Using: ", cores_detected/2, " cores")
    doMC::registerDoMC(cores_detected/2)
  }
  if (is.null(output_dir)) {
    output_dir <- paste0(getwd(), "/GRETTA_", Sys.Date())
    say <- paste0("No output directory specified. Creating: ",
                  output_dir)
    message(say)
    dir.create(output_dir)
  }
  if (!dir.exists(output_dir)) {
    stop("Output directory does not exist. Please provide full path to directory.")
  }
  if (is.null(data_dir)) {
    stop("No directory to data was specified. Please provide path to DepMap data.")
  }
  if (!dir.exists(data_dir)) {
    stop("DepMap data directory does not exists. Please check again and provide the full path to the DepMap data directory.")
  }
  if (!is.null(filename)) {
    output_dir_and_filename <- paste0(output_dir,
                                      "/", filename, ".csv")
  } else {
    output_dir_and_filename <- paste0(output_dir,
                                      "/GRETTA_GI_screen_perms.csv")
  }
  
  # Set cores:
  if (!is.null(core_num)) {
    doMC::registerDoMC(core_num)
  }
  
  # Check to see enough samples were given:
  if (length(control_id) < 2) {
    stop("Not enough controls! Provide at least two.")
  }
  if (length(mutant_id) < 2) {
    stop("Not enough mutants! Provide at least two.")
  }
  
  # Load necessary data
  if(rnai_screen){ # RNAi screen
    dep <- dep_annot <- rnai_long <- rnai_annot <- NULL
    load(paste0(data_dir, "/rnai_long.rda"), envir = environment())
    load(paste0(data_dir, "/rnai_annot.rda"), envir = environment())
    dep <- rnai_long 
    dep_annot <- rnai_annot

  } else { # CRISPR KO screen
    dep <- dep_annot <- NULL  # see: https://support.bioconductor.org/p/24756/
    load(paste0(data_dir, "/dep.rda"), envir = environment())
    load(paste0(data_dir, "/dep_annot.rda"), envir = environment())
  }
  
  # Check to see if enough samples were given
  # after filtering:
  Control_group_avail <- control_id[control_id %in% dep$DepMap_ID]
  Mutant_groups_avail <- mutant_id[mutant_id %in% dep$DepMap_ID]
  if (length(Control_group_avail) < 2) {
    say <- paste0(
      "Not enough controls were screened! Only the following control samples were screen: ",
      paste0(Control_group_avail, collapse = ", "))
    stop(say)
  }
  if (length(Mutant_groups_avail) < 2) {
    say <- paste0(
      "Not enough mutants were screened! Only the following mutant samples were screen: ",
      paste0(Mutant_groups_avail, collapse = ", "))
    stop(say)
  }
  
  # Filter dep probs to only those that are used:
  if(rnai_screen){
    select_dep <- dep %>% 
      dplyr::rename(DepProb = "values") %>%
      dplyr::mutate(
        CellType = case_when(
          DepMap_ID %in% Mutant_groups_avail ~ "Mutant", 
          DepMap_ID %in% Control_group_avail ~ "Control", 
          TRUE ~ "Others")) %>%
      dplyr::filter(.data$CellType != "Others", !is.na(.data$DepProb)) %>%
      dplyr::mutate(CellType = forcats::fct_relevel(.data$CellType, "Control", "Mutant"))

  } else {
    select_dep <- dep %>%
      tidyr::pivot_longer(
        cols = tidyselect::matches("\\d"),
        names_to = "GeneNameID", 
        values_to = "DepProb") %>%
      dplyr::mutate(
        CellType = case_when(
          DepMap_ID %in% Mutant_groups_avail ~ "Mutant", 
          DepMap_ID %in% Control_group_avail ~ "Control", 
          TRUE ~ "Others")) %>%
      dplyr::filter(.data$CellType != "Others", !is.na(.data$DepProb)) %>%
      dplyr::mutate(CellType = forcats::fct_relevel(.data$CellType, "Control", "Mutant"))
  }
  
  # Need to define function. A fix for a
  # strange bug:
  `%dopar%` <- foreach::`%dopar%`
  
  # Begin loop
  All_res <- each <- NULL
  All_res <- foreach::foreach(each = seq_len(n_perm), .combine = bind_rows) %dopar% {
    
    # Give feedback
    if (each == 1) {
      message("Processing ", each, " of ", n_perm)
    } else if (each == n_perm) {
      message("Processing ", each, " of ", n_perm)
    } else if (each%%1000 == 0) {
      message("Processing ", each, " of ", n_perm)
    }
    
    # Create randomly sampled DepProb dataframe
    # need to recount due to potential loss of samples with NA values
    Control_group_avail <- select_dep %>% 
      dplyr::filter(.data$CellType == "Control") %>% 
      dplyr::pull(.data$DepMap_ID) %>% unique()
    Mutant_groups_avail <- select_dep %>%
      dplyr::filter(.data$CellType == "Mutant") %>% 
      dplyr::pull(.data$DepMap_ID) %>% unique()
      
    dummy_geneID <- "A1BG_1"
    df <- tibble::tibble(
      DepMap_ID = unique(select_dep$DepMap_ID),
      GeneNameID = dummy_geneID,
      CellType = c(rep("Control", length(Control_group_avail)), rep("Mutant", length(Mutant_groups_avail))),
      DepProb = sample(select_dep$DepProb, size = length(unique(select_dep$DepMap_ID)), replace = FALSE)
    )
    
    df_post_filter_check <- df %>%
      dplyr::count(.data$CellType)
    
    if (any(df_post_filter_check$n < 2)) {
      populate <- rep(NA, 5)
      
    } else if (all(df$DepProb == 0)) {
      populate <- rep(0, 5)
      
    } else if (all(df$DepProb == 1)) {
      populate <- rep(1, 5)
      
    } else {
      # # MWU doesn't handle na or zero's
      # well so # FOR NOW remove zeros.  df
      # <- df %>% filter(!is.na(DepProb))
      # %>% filter(DepProb != 0)
      
      stats <- df %>%
        dplyr::group_by(.data$CellType) %>%
        dplyr::summarize(
          Median = stats::median(.data$DepProb, na.rm = TRUE), 
          Mean = mean(.data$DepProb, na.rm = TRUE), .groups = "drop")
      
      if ((any(is.na(stats)) != TRUE) & (nrow(stats) == 2)) {
        fit_pval <- stats::wilcox.test(
          DepProb ~ CellType, df, 
          # paired = FALSE, 
          alternative = "two.sided",
          conf.int = TRUE, na.action = "na.omit")$p.value
        
      } else if ((any(is.na(stats)) == TRUE) & (nrow(stats) == 2)) {
        populate <- rep(0, 5)
      }
      
      if ((any(is.na(stats)) != TRUE) & (nrow(stats) == 2)) {
        populate <- as.numeric(c(unlist(stats)[-c(1, 2)], fit_pval))
      } else {
        populate <- rep(0, 5)
      }
    }
    
    tibble(Result = 
      c("Control_median", "Mutant_median", "Control_mean", 
      "Mutant_mean", "Pval")) %>%
      dplyr::mutate(!!sym(dummy_geneID) := populate) %>%
      tidyr::pivot_longer(-.data$Result) %>%
      tidyr::pivot_wider(
        names_from = .data$Result,
        values_from = .data$value) %>%
      dplyr::rename(GeneNameID = .data$name)
  }  # End of for loop
  
  # Add mutant group name
  output <- All_res %>%
    dplyr::mutate(
      perm_test = 1:n_perm,
      n_perm = n_perm) %>%
    dplyr::select(-.data$GeneNameID) %>%
    dplyr::select(.data$perm_test, everything())
  
  # save and return output
  output %>%
    readr::write_csv(file = output_dir_and_filename)
  
  message(
    "Permutations finished. Outputs were also written to: ", 
    output_dir_and_filename)

  return(output)
}