richcleaner: Clean results generated by GSEA

Documented in rich_aggregate rich_find rich_wider

#' find files
#'
#' The function looks recursively inside a folder and find specific files. Used internally.
#'
#' @param path The folder to look into
#' @param reg_expr Regexp to look
#' @return A dataframe
#' @export
rich_find <- function(path = path, reg_expr){

  # "report.*(pos|neg).*tsv"
  # ".*png$"
  files_reports <- data.frame(
    file = fs::dir_ls(path, recurse = TRUE,  regexp = reg_expr)
  )

  if(nrow(files_reports)  == 0){
    stop("No relevant files detected in the folder!")
  }

  # check the files at the same folder depth
  splitted_files <- strsplit(files_reports$file, "/")
  path_depth <- unlist(lapply(splitted_files, length))
  # if all the path_dept are equal to the first, then we have as many TRUE as the vector elements.
  if(!sum(path_depth == path_depth[[1]]) == length(path_depth)){
    stop("Not all the file of that  at the same path depth!!")
  }

  path_depth <-  path_depth[[1]]


  # easier to split by /, escape makes problems in separate
  if (Sys.info()['sysname'] == "Windows"){
    files_reports$file <- gsub("\\\\", "/", files_reports$file)
  }

  files_reports  <- tidyr::separate(files_reports,
                                    col  = file,
                                    into = as.character(1:path_depth[1]),
                                    sep = "/",
                                    remove = FALSE)

  # we are subsetting by position so if the folder tree is different we are screwed
  # we want to select the second to the last (contrast) and one to the last(gs)
  # remove _ from them
  # files_reports[path_depth - 2:1] <- apply(files_reports[path_depth - 2:1], 2, function(x) sub("_", "", x))

  files_reports[, "id"] <-   paste0(files_reports[, path_depth - 2], "~", files_reports[, path_depth - 1])

  files_reports
}

#' rich_wider
#'
#' Given a dataframe generate by \code{\link{rich_aggregate}}, filters it to get a specific `gs`, it calculates
#' negative `log10(p.norm) * sign of NES`, filters out all `FDR.q.val >  fdr_threshold` and
#' returns a wider datagrame with `pathways` as rows, `contrasts` as
#' column and one of the columns as the `value`. When a combination of contrast-pathway is not presence a 0 is substituted
#' to NA when pivoting.
#'
#' @param dat Dataframe as returned by \code{\link{rich_aggregate}}.
#' @param fdr_threshold Minimum `FDR.q.val` to keep.
#' @param gs Gene Set Database.
#' @param value Column from which to take the value for pivit wider. One of c("fdr_q_val", "nes", "nom_p_val", "n_logp_sign").
#'     Defaut is "n_logp_sign".
#' @export
rich_wider <- function(dat,
                       fdr_threshold,
                       gs,
                       value = "n_logp_sign"){

  # mr flick
  defined <- ls()
  passed <- names(as.list(match.call())[-1])
  if (any(!defined %in% passed)) {
    stop(paste("missing values for", paste(setdiff(defined, passed), collapse=", ")))
  }

  if(!is.numeric(fdr_threshold)){
    stop("fdr_threshold is not numeric!")
  }

  if(!value %in% c("fdr_q_val", "nes", "nom_p_val", "n_logp_sign")){
    stop("value needs to be one of  c(`fdr_q_val`, `nes`, `nom_p_val`, `n_logp_sign`)!")
  }

  if(!gs %in% unique(dat$gs)){
    stop("gs argument is not an among the elements of the column gs!!!")
  }

  dat$nom_p_val[dat$nom_p_val == 0 ] <- 10^-10

  dat[ ,"n_logp_sign"] <- -log10(dat$nom_p_val) * sign(dat$nes)

  dat <- dat[dat$gs == gs,]
  dat <- dat[dat$fdr_q_val < fdr_threshold,]

  dat_wider <- as.data.frame(tidyr::pivot_wider(dat,
                                  id_cols = "description",
                                  names_from = "contrast",
                                  values_from = value,
                                  # values_fn = length,
                                  values_fill = 0))

  row.names(dat_wider) <-  dat_wider$description
  dat_wider$description <- NULL
  dat_wider
}


#' aggregate enriched data
#'
#' The function looks recursively inside a folder and extract and bind together enrichment data (`report.*.tsv`)
#' generated by \href{https://www.gsea-msigdb.org/gsea/index.jsp}{gsea}. If not path is specified it opens
#' an interactive `{svDialogs}` modal dialog box  to chose the folder.
#'
#'
#' @param path The folder to look into, default is `svDialogs::dlg_dir()$res`
#' @return A dataframe of enriched results.
#' @export
rich_aggregate <- function(
                  path = svDialogs::dlg_dir()$res){

  if(path == "svDialogs::dlg_dir()$res") eval(path)

  files_reports <- rich_find(path = path, reg_expr = "report.*(pos|neg).*tsv")

  message("Importing files ...")
  rich_ls <- lapply(files_reports$file, data.table::fread, showProgress = TRUE)

  names(rich_ls) <- files_reports$id

  # remove empty dataframe
  nrows_rich_ls <- pbapply::pblapply(rich_ls, nrow)
  to_keep <- nrows_rich_ls != 0
  rich_df <- dplyr::bind_rows(rich_ls[to_keep], .id = "id")

  # clean names
  names(rich_df) <- gsub("\\.|\\s|-", "_", tolower(names(rich_df)))
  names(rich_df)[names(rich_df) == "name"] <- "description"
  rich_df <- tidyr::separate(rich_df, col = "id", into = c("contrast", "gs"), sep = "~")

  # remove unwanted columns
  to_remove <- append(grep("gs.+", names(rich_df), value = TRUE, perl = TRUE), "v12")
  rich_df[, to_remove] <- NULL

  rich_df
}