scCustFx: single-cell custom fxs Package

Documented in runSDAv1_2D SDA.AddCompStats SDApostprocessing SDAresChecks

#' Run SDA V1.1 in 2D i.e., matrix form
#'
#' Executes the SDA algorithm on preprocessed and normalized single-cell gene expression data,
#' allowing for dimensionality reduction and analysis. This function is designed to work with
#' the specified version of SDA, taking in several parameters to customize the analysis process,
#' including the number of components, iteration settings, and parallel processing options.
#'
#' @param path.sda A character string specifying the path to the SDA software executable.
#' @param resultsDir A character string specifying the directory where the SDA results will be saved.
#' @param rawDataFile A character string specifying the path to the preprocessed and normalized gene
#'   expression data file to be analyzed.
#' @param numComps Integer, the number of components to be used in the analysis, default is 30.
#' @param max_iter Integer, the maximum number of iterations for the SDA algorithm, default is 100.
#' @param save_freq Integer, frequency of saving interim results, default is 20 iterations.
#' @param set_seed Integer, the seed for random number generation to ensure reproducibility, default is 1234.
#' @param n_cells Integer, the number of cells in the dataset to be analyzed.
#' @param nThreads Integer, the number of threads for parallel processing, default is 6.
#'   If greater than 1, eigen decomposition will be done in parallel.
#' @param num_blocks Integer, the number of blocks to divide the dataset into for parallel processing, default is 6.
#'
#' @return Invisible NULL. The function is called for its side effects, including the execution
#'   of the SDA analysis and saving of results to the specified directory.
#' @export
runSDAv1_2D <- function(path.sda, resultsDir, rawDataFile, numComps = 30, max_iter = 100, save_freq = 20, 
                        set_seed = 1234, n_cells, nThreads = 6, num_blocks = 6){
  
  
  SDAtools::run_SDA(sda_location = path.sda, out = resultsDir, 
                    data = rawDataFile, num_comps = numComps, max_iter = max_iter, 
                    save_freq = save_freq, set_seed = set_seed, N = n_cells, 
                    eigen_parallel = (nThreads > 1), ignore_missing = FALSE, 
                    num_blocks = num_blocks, num_openmp_threads = nThreads)
  
  
  
}

#' Post-process Single-cell Data Analysis (SDA) Results
#'
#' This function loads the results from an SDA analysis, enriches them with additional metadata such
#' as cell barcodes and feature (gene) names, computes component statistics, and optionally performs
#' Gene Ontology (GO) enrichment analysis on the components. It is designed to work with results
#' generated by the SDAtools and provides a comprehensive overview of the analysis for further interpretation.
#'
#' @param resultsDir A character string specifying the directory where the SDA analysis results are stored.
#' @param outputFolder A character string specifying the path to the output folder where additional
#'   processed results and analyses will be saved.
#' @param normedDGE A matrix or data frame containing the normalized gene expression data used in the
#'   SDA analysis. Row names should correspond to features (genes), and column names should correspond to cell barcodes.
#' @param storeGoEnrichment A logical value indicating whether to perform and store Gene Ontology (GO)
#'   enrichment analysis on the analysis components. Default is `TRUE`.
#'
#' @return A list containing the enriched SDA results, including cell barcodes, feature names, component
#'   statistics, and optionally GO enrichment analysis results.
#' @export
SDApostprocessing <- function(resultsDir, 
                              outputFolder, 
                              normedDGE, 
                              orgDb = "org.Hs.eg.db", 
                              # mouse org.Mm.eg.db
                              # rhesus macaque org.Mmu.eg.db
                              storeGoEnrichment = T){
  
  results <- SDAtools::load_results(results_folder = resultsDir, 
                                    data_path = outputFolder)
  
  results$CellBarcodes <- colnames(normedDGE)
  results$Features <- rownames(normedDGE)
  results <-  SDA.AddCompStats(results)
  
  if (storeGoEnrichment) {
    results$goEnrichment <- SDA.GO_Enrichment(results, components = 1:nrow(results$loadings[[1]]), orgDb=orgDb)
  }
  
  
  return(results)
}


#' @title SDA.AddCompStats
#'
#' @description SDA.AddCompStats
#' @param SDAres An SDA results list/object 
#' @return An SDA results list/object with Comp statistics
#' @import data.table
#' @export
SDA.AddCompStats <- function(SDAres, sdThrsh = 0.04, maxscoreThrsh=20, 
                             maxloadThrsh = 1, redoCalc=T){
  if (redoCalc)   SDAres$component_statistics <- NULL
  if (is.null(SDAres$component_statistics) ) {
    
    SDAres$component_statistics <- as.data.frame(data.table(
      Component = 1:SDAres$n$components, 
      Component_name = dimnames(SDAres$scores)[[2]], 
      max_score = apply(abs(SDAres$scores),  2, max),
      max_loading = apply(abs(SDAres$loadings[[1]]), 1, max),
      mean_score = apply(abs(SDAres$scores),  2, mean),
      mean_loading = apply(abs(SDAres$loadings[[1]]), 1, mean),
      sd_score = apply(abs(SDAres$scores),  2, sd),
      sd_loading = apply(abs(SDAres$loadings[[1]]), 1, sd),
      ssqrd_score = apply(SDAres$scores,  2, function(x) sum(x^2)),
      ssqrd_loading = apply(SDAres$loadings[[1]], 1, function(x) sum(x^2))
    )[order(-Component)])
    
  }
  
  SDAres$component_statistics$Component_namev2    <- SDAres$component_statistics$Component_name
  SDAres$component_statistics$Component_name_plot <- SDAres$component_statistics$Component_name
  
  SDAres$component_statistics[which(SDAres$component_statistics$sd_loading<sdThrsh),]$Component_namev2         <- rep("", length(which(SDAres$component_statistics$sd_loading<sdThrsh)))
  
  SDAres$component_statistics[which(SDAres$component_statistics$max_score<maxscoreThrsh & SDAres$component_statistics$max_loading<maxloadThrsh),]$Component_name_plot <- rep("", length(which(SDAres$component_statistics$max_score<maxscoreThrsh & SDAres$component_statistics$max_loading<maxloadThrsh)))
  
  SDAres$component_statistics <- data.table(SDAres$component_statistics)
  return(SDAres)
  
}



#' Perform Gene Ontology Enrichment Analysis on SDA Components
#'
#' This function conducts Gene Ontology (GO) enrichment analysis on specified components
#' from Single-cell Data Analysis (SDA) results. It supports analysis in positive, negative,
#' or both directions and allows specifying the number of top genes to consider. The function
#' utilizes `clusterProfiler` for the enrichment analysis and is capable of handling different
#' organism databases.
#'
#' @param sdaResults A list containing SDA results, expected to include a 'loadings' element
#'   with components and gene loadings.
#' @param components A numeric vector indicating which components from the SDA results to analyze.
#' @param orgDb A character string specifying the Bioconductor organism annotation package
#'   to use for the enrichment analysis. Default is "org.Hs.eg.db" for human. Other examples
#'   include "org.Mm.eg.db" for mouse and "org.Mmu.eg.db" for rhesus macaque.
#' @param geneNumber An integer specifying the number of top genes to consider for enrichment
#'   analysis in each component and direction. Default is 100.
#' @param direction A character string indicating the direction of gene loading to consider
#'   for the analysis. Can be "Pos" for positive, "Neg" for negative, or "Both" for both directions.
#'   Default is "Both".
#'
#' @return A list where each element corresponds to a component and direction analyzed, containing
#'   the results of the GO enrichment analysis. Results include enrichment scores, gene ratios,
#'   background ratios, and odds ratios among others.
#'   
#' @export
SDA.GO_Enrichment <- function (sdaResults, components, 
                               orgDb = "org.Hs.eg.db", 
                               # mouse org.Mm.eg.db
                               # rhesus macaque org.Mmu.eg.db
                               geneNumber = 100, 
          direction = "Both") 
{
  ret <- list()
  if (direction == "Both") {
    directions <- c("Pos", "Neg")
  }
  else {
    directions <- direction
  }
  for (comp in components) {
    for (direction in directions) {
      print(paste0("Loading component ", comp, ", direction: ", 
                   direction))
      if (direction == "Neg") {
        top_genes <- names(sort(sdaResults$loadings[[1]][comp, 
        ]))[1:geneNumber]
      }
      else if (direction == "Pos") {
        top_genes <- names(sort(sdaResults$loadings[[1]][comp, 
        ], decreasing = T))[1:geneNumber]
      }
      gene_universe <- names(sdaResults$loadings[[1]][comp, 
      ])
      ego <- clusterProfiler::enrichGO(gene = top_genes, 
                                       universe = gene_universe, OrgDb = orgDb, keyType = "SYMBOL", 
                                       ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 1, 
                                       qvalueCutoff = 1)
      frac_to_numeric <- function(x) sapply(x, function(x) eval(parse(text = x)))
      ego@result$Enrichment <- frac_to_numeric(ego@result$GeneRatio)/frac_to_numeric(ego@result$BgRatio)
      ego@result$GeneOdds <- unlist(lapply(strsplit(ego@result$GeneRatio, 
                                                    "/", fixed = TRUE), function(x) {
                                                      x <- as.numeric(x)
                                                      x[1]/(x[2] - x[1])
                                                    }))
      ego@result$BgOdds <- unlist(lapply(strsplit(ego@result$BgRatio, 
                                                  "/", fixed = TRUE), function(x) {
                                                    x <- as.numeric(x)
                                                    x[1]/(x[2] - x[1])
                                                  }))
      ret[[paste0(comp, "-", direction)]] <- ego@result
    }
  }
  return(ret)
}

#' Diagnostic Checks on SDA Results
#'
#' Executes a series of diagnostic checks on Single-cell Data Analysis (SDA) results to
#' assess convergence, distribution of loadings and scores, and Posterior Inclusion Probability (PIP)
#' distributions. The function uses `SDAtools` for generating diagnostic outputs and handles
#' errors gracefully for each diagnostic step, ensuring that all possible checks are attempted.
#'
#' @details This function is designed to be run with SDA results available in the global environment
#'   or within its scope. It attempts to run convergence checks, loading and score distributions, and
#'   PIP distributions. If an error occurs during any of the checks, it catches the error, prints
#'   an error message specific to the failed check, and continues with the next checks.
#'
#' @return Invisible NULL. The function is called for its side effects, which include printing
#'   diagnostic information and error messages directly to the console.
#'
#' @export
SDAresChecks <- function(results){
  tryCatch({
    print(SDAtools::check_convergence(results))
    print(SDAtools::loading_distribution(results))
    print(SDAtools::scores_distribution(results))
  }, error = function(e) {
    print(paste0("Error generating SDA first plots"))
    print(conditionMessage(e))
  })
  tryCatch({
    print(SDAtools::PIP_distribution(results))
  }, error = function(e) {
    print(paste0("Error generating SDA PIP_distribution"))
    print(conditionMessage(e))
  })
  tryCatch({
    print(SDAtools::PIP_threshold_distribution(results))
  }, error = function(e) {
    print(paste0("Error generating SDA PIP_threshold_distribution"))
    print(conditionMessage(e))
  })
}