R/deTest.R

Defines functions fx_calcMarker fx_calcNeighb fx_calcDist_cellEmb fx_calcDist_geneExpr fx_calcDist_scoreDE fx_calcDist_numDE fx_calcSilhouette fx_calcDEcombn_slow fx_calcDEcombn_BP fx_calcDEcombn fx_calcEScombn fx_calcDEvsRest_slow fx_calcDEvsRest_BP fx_calcDEvsRest fx_calcESvsRest_BP fx_calcESvsRest fx_calcCGS_BP fx_calcCGS CalcSCV CalcAllSCV

Documented in CalcAllSCV CalcSCV fx_calcCGS fx_calcCGS_BP fx_calcDEcombn fx_calcDEcombn_BP fx_calcDEcombn_slow fx_calcDEvsRest fx_calcDEvsRest_BP fx_calcDEvsRest_slow fx_calcDist_cellEmb fx_calcDist_geneExpr fx_calcDist_numDE fx_calcDist_scoreDE fx_calcEScombn fx_calcESvsRest fx_calcESvsRest_BP fx_calcMarker fx_calcNeighb fx_calcSilhouette

#' @include sCVdataClass.R dataAccess.R
NULL


# CalcAllSCV ----

#' Prepare all cluster solutions for visualization with scClustViz
#'
#' An all-in-one function to prepare your data for viewing in the interactive
#' Shiny app. See example for the basic usage of scClustViz.
#'
#' This is a wrapper function for running \code{\link{CalcSCV}} over each
#' cluster resolution in the input, and outputs a list of \code{\link{sCVdata}}
#' objects that should be saved along with the input data. The resulting file is
#' ready to be read by \code{\link{runShiny}} for viewing. For each cluster
#' solution provided, this function calculates summary statistics per gene per
#' cluster, differential gene expression, and cluster separation metrics. This
#' may take a while to run, depending on the number of cluster solutions tested.
#' Use the \code{testAll} argument to prevent testing of overfitted cluster
#' solutions. To help track its progress, this function uses progress bars from
#' \code{pbapply}. To disable these, set
#' \code{\link[pbapply]{pboptions}(type="none")}. To re-enable, set
#' \code{\link[pbapply]{pboptions}(type="timer")}.
#'
#' @param inD The input dataset. An object of class \code{\link[Seurat]{seurat}}
#'   or \code{\link[SingleCellExperiment]{SingleCellExperiment}}. Other data
#'   classes are not currently supported.
#'   \href{https://github.com/BaderLab/scClustViz/issues}{Please submit requests
#'   for other data objects here!}
#' @param clusterDF A data frame of cluster assignments for all cells in the
#'   dataset. Variables (columns) are cluster solutions with different
#'   parameters, and rows should correspond to cells of the input gene
#'   expression matrix.
#' @param assayType Default = "" (for Seurat v1/2). A length-one character
#'   vector representing the assay object in which the expression data is stored
#'   in the input object. This is not required for Seurat v1 or v2 objects. For 
#'   Seurat v3 objects, this is often "RNA". For SingleCellExperiment objects, 
#'   this is often "logcounts". See \code{\link{getExpr}} for details.
#' @param assaySlot An optional length-one character vector representing
#'   the slot of the Seurat v3 \code{\link[Seurat]{Assay}} object to use. Not 
#'   used for other single-cell data objects. The default is to use the 
#'   normalized data in the "data" slot, but you can also use the 
#'   \code{\link[Seurat]{SCTransform}}-corrected counts by setting 
#'   \code{assayType = "SCT"} and \code{assaySlot = "counts"}. This is 
#'   recommended, as it will speed up differential expression 
#'   calculations. See \code{\link{getExpr}} for details.
#' @param DRforClust Default = "pca".A length-one character vector representing
#'   the dimensionality reduction method used as the input for clustering. This
#'   is commonly PCA, and should correspond to the slot name of the cell
#'   embedding in your input data - either the \code{type} argument in
#'   \code{\link[SingleCellExperiment]{reducedDim}(x,type)} or the
#'   \code{reduction.type} argument in
#'   \code{\link[Seurat]{GetDimReduction}(object,reduction.type)} (v2) or
#'   \code{reduction} in \code{\link[Seurat]{Embeddings}(object,reduction)}.
#' @param exponent Default = 2. A length-one numeric vector representing the
#'   base of the log-normalized gene expression data to be processed. Generally
#'   gene expression data is transformed into log2 space when normalizing (set
#'   this to 2), though \code{Seurat} uses the natural log (set this to exp(1)). 
#'   If you are using data that has not been log-transformed (for example, 
#'   corrected counts from SCTransform), set this to NA.
#' @param pseudocount Default = 1. A length-one numeric vector representing the
#'   pseudocount added to all log-normalized values in your input data. Most
#'   methods use a pseudocount of 1 to eliminate log(0) errors. If you are using 
#'   data that has not been log-transformed (for example, corrected counts from 
#'   SCTransform), set this to NA. 
#' @param DRthresh Default = 0.1. A length-one numeric vector between 0 and 1
#'   representing the detection rate threshold for inclusion of a gene in the
#'   differential expression testing. A gene will be included if it is detected
#'   in at least this proportion of cells in at least one of the clusters being
#'   compared.
#' @param testAll Default = TRUE. Logical value indicating whether to test all
#'   cluster solutions (\code{TRUE}) or stop testing once a cluster solution has
#'   been found where there is no differentially expressed genes found between
#'   at least one pair of nearest neighbouring clusters (\code{FALSE}). If set
#'   to FALSE, this function will test cluster solutions in ascending order of
#'   number of clusters found. \emph{If set to (\code{FALSE}), only tested
#'   cluster solutions will appear in the scClustViz shiny app.}
#' @param FDRthresh Default = 0.05. A length-one numeric vector representing the
#'   targeted false discovery rate used to determine the number of
#'   differentially expressed genes between nearest neighbouring clusters,
#'   assuming \code{testAll} is set FALSE. If \code{testAll} is TRUE, this
#'   argument is unused.
#' @param calcSil Default = TRUE. A logical vector of length 1. If TRUE,
#'   silhouette widths (a cluster cohesion/separation metric) will be calculated
#'   for all cells. This calculation is performed using the function
#'   \code{\link{CalcSilhouette}}, which is a wrapper to
#'   \code{\link[cluster]{silhouette}} with distance calculated using the same
#'   reduced dimensional cell embedding as was used for clustering, as indicated
#'   in the \code{DRforClust} argument. If the package \code{cluster} is not
#'   installed, this calculation is skipped.
#' @param calcDEvsRest Default = TRUE. A logical vector of length 1. If TRUE,
#'   differential expression tests will be performed comparing each cluster to
#'   the remaining cells in the data using a Wilcoxon rank-sum test and
#'   reporting false discovery rates. This calculation is performed using the
#'   function \code{\link{CalcDEvsRest}}. If set to FALSE, it is suggested that
#'   you perform DE testing on the same set of comparisons using a statistical
#'   method of your choice. This can be passed into your \code{sCVdata} objects
#'   in the list returned by \code{CalcAllSCV} using the function
#'   \code{\link{CalcDEvsRest}}. See function documentation for details.
#' @param calcDEcombn Default = TRUE.  A logical vector of length 1. If TRUE,
#'   differential expression tests will be performed comparing all pairwise
#'   combinations of clusters using a Wilcoxon rank-sum test and reporting false
#'   discovery rates. This calculation is performed using the function
#'   \code{\link{calcDEcombn}}. If set to FALSE, it is suggested that you
#'   perform DE testing on the same set of comparisons using a statistical
#'   method of your choice. This can be passed into your \code{sCVdata} objects
#'   in the list returned by \code{CalcAllSCV} using the function
#'   \code{\link{calcDEcombn}}. See function documentation for details.
#'
#' @return The function returns a list containing \code{\link{sCVdata}} objects
#'   for each cluster resolution (sample) in the \code{clusterDF} data frame.
#'   The output object and the \code{inD} object should be saved as an
#'   \code{.RData} file. That file is the input for \code{\link{runShiny}}, the
#'   scClustViz Shiny interaction visualization app. See example. For details of
#'   calculations performed / stored by this function, see
#'   \code{\link{sCVdata}}.
#'
#' @examples
#' \dontrun{
#' your_cluster_columns <- grepl("res[.0-9]+$",
#'                               names(getMD(your_scRNAseq_data_object)))
#' # ^ Finds the cluster columns of the metadata in a Seurat object.
#'
#' your_cluster_results <- getMD(your_scRNAseq_data_object)[your_cluster_columns]
#'
#' sCVdata_list <- CalcAllSCV(inD=your_scRNAseq_data_object,
#'                            clusterDF=your_cluster_results,
#'                            assayType="RNA",
#'                            DRforClust="pca",
#'                            exponent=exp(1),
#'                            pseudocount=1,
#'                            DRthresh=0.1,
#'                            testAll=F,
#'                            FDRthresh=0.05,
#'                            calcSil=T,
#'                            calcDEvsRest=T,
#'                            calcDEcombn=T)
#'
#' save(your_scRNAseq_data_object,sCVdata_list,
#'      file="for_scClustViz.RData")
#'
#' runShiny(filePath="for_scClustViz.RData")
#' # ^ see ?runShiny for detailed argument list
#' }
#'
#' @seealso \code{\link{sCVdata}} for information on the output data class.
#'   \code{\link{CalcSCV}} to generate an \code{sCVdata} object for a single
#'   cluster solution. \code{\link{runShiny}} starts the interactive Shiny GUI
#'   to view the results of this testing.
#'
#' @export

CalcAllSCV <- function(inD,
                       clusterDF,
                       assayType="",
                       assaySlot="",
                       DRforClust="pca",
                       exponent=2,
                       pseudocount=1,
                       DRthresh=0.1,
                       testAll=TRUE,
                       FDRthresh=0.05,
                       calcSil=T,
                       calcDEvsRest=T,
                       calcDEcombn=T) {
  if (!is(inD)[1] %in% findMethodSignatures(getExpr)) {
    stop(paste(
      paste0("Input data object must be one of: ",
             paste(findMethodSignatures(getExpr),collapse=", "),
             "."),
      paste("Other input objects are not supported at this time,",
            "but please let me know what object class"),
      paste("you'd like supported at",
            "https://github.com/BaderLab/scClustViz/issues, thanks!"),
      sep="\n  "))
  }
  if (testAll == FALSE & calcDEcombn == FALSE) {
    stop(paste("Argument 'calcDEcombn' must be TRUE if 'testAll' = FALSE.",
               "The decision to stop testing depends on the results of caclDEcombn.",
               sep="\n"))
  }
  if (is.null(colnames(getExpr(inD,assayType,assaySlot))) | 
      is.null(rownames(getExpr(inD,assayType,assaySlot)))) {
    stop("Gene expression matrix returned by 'getExpr(inD,assayType,assaySlot)' is missing col/rownames.")
  }
  if (is.data.frame(clusterDF)) {
    if (nrow(clusterDF) != ncol(getExpr(inD,assayType,assaySlot))) {
      stop(paste("clusterDF must be a data frame where each variable (column) is the cluster",
                 "assignments for all cells (rows) from each cluster solution tested.",
                 sep="\n  "))
    }
  } else if (length(clusterDF) == ncol(getExpr(inD,assayType,assaySlot))) {
    clusterDF <- as.data.frame(clusterDF)
    colnames(clusterDF) <- "Clust"
  } else {
    stop(paste("clusterDF must be a data frame where each variable (column) is the cluster",
               "assignments for all cells (rows) from each cluster solution tested.",
               sep="\n  "))
  }
  if (!identical(rownames(clusterDF),colnames(getExpr(inD,assayType,assaySlot)))) {
    rownames(clusterDF) <- colnames(getExpr(inD,assayType,assaySlot))
  }
  
  # If testAll == F, cluster solutions are sorted in ascending order of number
  # of clusters found.
  if (!testAll) {
    message(paste("  Testing cluster solutions in ascending order of number of clusters found.",
                  "Testing will stop after finding a solution with 0 differentially expressed",
                  "genes between nearest neighbouring clusters, and the resulting list of",
                  "sCVdata objects will be in ascending order of number of clusters found.",
                  sep="\n  "))
    sortedClusts <- order(sapply(clusterDF,function(X) length(unique(X))))
    clusterDF <- clusterDF[sortedClusts]
  }
  
  if (assaySlot == "counts" & !(is.na(exponent) | is.na(pseudocount))) {
    stop(paste("If you are using count data (not log-transformed),",
               "you must set 'exponent' and/or 'pseudocount' to NA.",
               sep="\n  "))
  }
  
  # This loop iterates through every cluster solution, and does DE testing
  # between clusters to generate the DE metrics for assessing your clusters.
  # This takes some time. If testAll == FALSE, it will stop once no
  # significantly differentially-expressed genes are detected between nearest
  # neighbouring clusters.
  outList <- list()
  for (X in names(clusterDF)) {
    message(paste(" ",
                  "--------------------------------------",
                  "--------------------------------------",
                  paste("Processing cluster solution:",X),
                  "--------------------------------------",sep="\n"))
    temp <- clusterDF[[X]]
    names(temp) <- rownames(clusterDF)
    outList[[X]] <- CalcSCV(inD=inD,
                            cl=temp,
                            assayType=assayType,
                            assaySlot=assaySlot,
                            DRforClust=DRforClust,
                            exponent=exponent,
                            pseudocount=pseudocount,
                            DRthresh=DRthresh,
                            calcSil=calcSil,
                            calcDEvsRest=calcDEvsRest,
                            calcDEcombn=calcDEcombn)
    if (!testAll) {
      if (min(sapply(DEneighb(outList[[X]],FDRthresh),nrow)) < 1) { break }
    }
  }
  
  return(outList)
}


# CalcSCV ----

#' Create sCVdata object with calculation results for a cluster
#'
#' Creates and populates a new \code{\link{sCVdata}} object for a single
#' clustering result on the input data. This is the building block of the input
#' to \code{\link{runShiny}}, the scClustViz interactive visualization of
#' scRNAseq data.
#'
#' By default, \code{CalcSCV} populates all slots of the object, calculating
#' cluster separation metrics and differential gene expression results. At a
#' minimum, it calculates the basic gene statistics required by scClustViz for
#' visualization. This can take some time. To help track its progress, this
#' function uses progress bars from \code{pbapply}. To disable these, set
#' \code{\link[pbapply]{pboptions}(type="none")}. To re-enable, set
#' \code{\link[pbapply]{pboptions}(type="timer")}. To view the results using
#' \code{\link{runShiny}}, the resulting \code{\link{sCVdata}} object(s) must be
#' stored as a named list of cluster solutions and saved to an \code{.RData}
#' file along with the input data object. See example for details.
#'
#' @param inD The input dataset. An object of class \code{\link[Seurat]{seurat}}
#'   or \code{\link[SingleCellExperiment]{SingleCellExperiment}}. Other data
#'   classes are not currently supported.
#'   \href{https://github.com/BaderLab/scClustViz/issues}{Please submit requests
#'   for other data objects here!}
#' @param cl a factor where each value is the cluster assignment for a cell
#'   (column) in the input gene expression matrix.
#' @param assayType Default = "" (for Seurat v1/2). A length-one character
#'   vector representing the assay object in which the expression data is stored
#'   in the input object. This is not required for Seurat v1 or v2 objects. For 
#'   Seurat v3 objects, this is often "RNA". For SingleCellExperiment objects, 
#'   this is often "logcounts". See \code{\link{getExpr}} for details.
#' @param assaySlot An optional length-one character vector representing
#'   the slot of the Seurat v3 \code{\link[Seurat]{Assay}} object to use. Not 
#'   used for other single-cell data objects. The default is to use the 
#'   normalized data in the "data" slot, but you can also use the 
#'   \code{\link[Seurat]{SCTransform}}-corrected counts by setting 
#'   \code{assayType = "SCT"} and \code{assaySlot = "counts"}. This is 
#'   recommended, as it will speed up differential expression 
#'   calculations. See \code{\link{getExpr}} for details.
#' @param DRforClust Default = "pca".A length-one character vector representing
#'   the dimensionality reduction method used as the input for clustering. This
#'   is commonly PCA, and should correspond to the slot name of the cell
#'   embedding in your input data - either the \code{type} argument in
#'   \code{\link[SingleCellExperiment]{reducedDim}(x,type)} or the
#'   \code{reduction.type} argument in
#'   \code{\link[Seurat]{GetDimReduction}(object,reduction.type)} (v2) or
#'   \code{reduction} in \code{\link[Seurat]{Embeddings}(object,reduction)}.
#' @param exponent Default = 2. A length-one numeric vector representing the
#'   base of the log-normalized gene expression data to be processed. Generally
#'   gene expression data is transformed into log2 space when normalizing (set
#'   this to 2), though \code{Seurat} uses the natural log (set this to exp(1)). 
#'   If you are using data that has not been log-transformed (for example, 
#'   corrected counts from SCTransform), set this to NA.
#' @param pseudocount Default = 1. A length-one numeric vector representing the
#'   pseudocount added to all log-normalized values in your input data. Most
#'   methods use a pseudocount of 1 to eliminate log(0) errors. If you are using 
#'   data that has not been log-transformed (for example, corrected counts from 
#'   SCTransform), set this to NA. 
#' @param DRthresh Default = 0.1. A length-one numeric vector between 0 and 1
#'   representing the detection rate threshold for inclusion of a gene in the
#'   differential expression testing. A gene will be included if it is detected
#'   in at least this proportion of cells in at least one of the clusters being
#'   compared.
#' @param calcSil Default = TRUE. A logical vector of length 1. If TRUE,
#'   silhouette widths (a cluster cohesion/separation metric) will be calculated
#'   for all cells. This calculation is performed using the function
#'   \code{\link{CalcSilhouette}}, which is a wrapper to
#'   \code{\link[cluster]{silhouette}} with distance calculated using the same
#'   reduced dimensional cell embedding as was used for clustering, as indicated
#'   in the \code{DRforClust} argument. If the package \code{cluster} is not
#'   installed, this calculation is skipped.
#' @param calcDEvsRest Default = TRUE. A logical vector of length 1. If TRUE,
#'   differential expression tests will be performed comparing each cluster to
#'   the remaining cells in the data using a Wilcoxon rank-sum test and
#'   reporting false discovery rates. This calculation is performed using the
#'   function \code{\link{CalcDEvsRest}}. If set to FALSE, it is suggested that
#'   you perform DE testing on the same set of comparisons using a statistical
#'   method of your choice. This can be passed into your \code{sCVdata} objects
#'   in the list returned by \code{CalcAllSCV} using the function
#'   \code{\link{CalcDEvsRest}}. See function documentation for details.
#' @param calcDEcombn Default = TRUE.  A logical vector of length 1. If TRUE,
#'   differential expression tests will be performed comparing all pairwise
#'   combinations of clusters using a Wilcoxon rank-sum test and reporting false
#'   discovery rates. This calculation is performed using the function
#'   \code{\link{calcDEcombn}}. If set to FALSE, it is suggested that you
#'   perform DE testing on the same set of comparisons using a statistical
#'   method of your choice. This can be passed into your \code{sCVdata} objects
#'   in the list returned by \code{CalcAllSCV} using the function
#'   \code{\link{calcDEcombn}}. See function documentation for details.
#'
#' @return The function returns an \code{\link{sCVdata}} object with all slots
#'   populated by default, and at least the \code{Clusters},
#'   \code{ClustGeneStats}, and \code{params} slots populated. The resulting
#'   object can be added to a list as part of the input to
#'   \code{\link{runShiny}} for visualization of the cluster results.
#'
#' @examples
#' \dontrun{
#' ## This example shows integration of scClustViz with Seurat clustering ##
#'
#' DE_bw_clust <- TRUE
#' seurat_resolution <- 0
#' sCVdata_list <- list()
#'
#' while(DE_bw_clust) {
#'   seurat_resolution <- seurat_resolution + 0.2
#'   # ^ iteratively incrementing resolution parameter
#'
#'   your_seurat_obj <- Seurat::FindClusters(your_seurat_obj,
#'                                           resolution=seurat_resolution)
#'
#'   curr_sCVdata <- CalcSCV(inD=your_seurat_obj,
#'                           cl=Indents(your_seurat_obj),
#'                           assayType="RNA",
#'                           DRforClust="pca",
#'                           exponent=exp(1),
#'                           pseudocount=1,
#'                           DRthresh=0.1,
#'                           calcSil=T,
#'                           calcDEvsRest=T,
#'                           calcDEcombn=T)
#'
#'   DE_bw_NN <- sapply(DEneighb(curr_sCVdata,0.05),length)
#'   # ^ counts # of DE genes between neighbouring clusters at 5% FDR
#'
#'   if (min(DE_bw_NN) < 1) { DE_bw_clust <- FALSE }
#'   # ^ If no DE genes between nearest neighbours, don't loop again.
#'
#'   sCVdata_list[[paste0("res.",seurat_resolution)]] <- curr_sCVdata
#' }
#'
#' save(your_seurat_obj,sCVdata_list,
#'      file="for_scClustViz.RData")
#'
#' runShiny(filePath="for_scClustViz.RData")
#' # ^ see ?runShiny for detailed argument list
#' }
#'
#' @seealso \code{\link{sCVdata}} for information on the output data class.
#'   \code{\link{CalcAllSCV}} to generate a list of \code{sCVdata} objects for
#'   all cluster solutions in the input data. \code{\link{runShiny}} starts the
#'   interactive Shiny GUI to view the results of this testing.
#'
#' @export

CalcSCV <- function(inD,
                    cl,
                    assayType="",
                    assaySlot="",
                    DRforClust="pca",
                    exponent=2,
                    pseudocount=1,
                    DRthresh=0.1,
                    calcSil=T,
                    calcDEvsRest=T,
                    calcDEcombn=T) {
  if (!is(inD)[1] %in% findMethodSignatures(getExpr)) {
    stop(paste(
      paste0("Input data object must be one of: ",
             paste(findMethodSignatures(getExpr),collapse=", "),
             "."),
      paste("Other input objects are not supported at this time,",
            "but please let me know what object class"),
      paste("you'd like supported at",
            "https://github.com/BaderLab/scClustViz/issues, thanks!"),
      sep="\n  "))
  }
  if (is.null(colnames(getExpr(inD,assayType,assaySlot))) | 
      is.null(rownames(getExpr(inD,assayType,assaySlot)))) {
    stop("Gene expression matrix returned by 'getExpr(inD,assayType,assaySlot)' is missing col/rownames.")
  }
  if (length(cl) != ncol(getExpr(inD,assayType,assaySlot))) {
    stop(paste("cl must be a factor where each value is the cluster assignment",
               "for a cell (column) in the input gene expression matrix.",
               sep="\n  "))
  }
  if (is.character(cl) | is.numeric(cl)) {
    cl <- as.factor(cl)
  }
  if (!all(names(cl) == colnames(getExpr(inD,assayType,assaySlot))) | is.null(names(cl))) {
    names(cl) <- colnames(getExpr(inD,assayType,assaySlot))
  }
  if (any(grepl("-",levels(cl)))) {
    stop("Cluster names (levels in 'cl') cannot contain '-'.")
  }
  
  out <- sCVdata(Clusters=cl,
                 params=sCVparams(assayType=c(assayType,assaySlot),
                                  DRforClust=DRforClust,
                                  exponent=ifelse(is.na(exponent),NA_real_,exponent),
                                  pseudocount=ifelse(is.na(pseudocount),NA_real_,pseudocount),
                                  DRthresh=DRthresh))
  if (calcSil) { #Doing this first in case DRforClust is set incorrectly.
    if (require(cluster)) {
      Silhouette(out) <- CalcSilhouette(out,inD)
    } else {
      warning(paste("  Silhouette could not be calculated because package 'cluster'",
                    "is not installed. Try 'install.packages(cluster)', then run",
                    "'CalcSilhouette()' for the sCVdata object at this resolution.",sep="\n  "))
    }
  }
  
  #this is not optional, since everything depends on it.
  ClustGeneStats(out) <- CalcCGS(out,inD) 
  
  if (calcDEvsRest) {
    DEvsRest(out) <- CalcDEvsRest(out,inD)
  }
  
  if (calcDEcombn) {
    DEcombn(out) <- CalcDEcombn(out,inD)
  }
  return(out)
}


# CalcCGS ----

#' Internal fx for cluster-wise gene statistics
#'
#' Internal function. See \code{\link{CalcCGS}}.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param exponent Default = 2. A length-one numeric vector representing the
#'   base of the log-normalized gene expression data to be processed. Generally
#'   gene expression data is transformed into log2 space when normalizing (set
#'   this to 2), though \code{Seurat} uses the natural log (set this to exp(1)). 
#'   If you are using data that has not been log-transformed (for example, 
#'   corrected counts from SCTransform), set this to NA.
#' @param pseudocount Default = 1. A length-one numeric vector representing the
#'   pseudocount added to all log-normalized values in your input data. Most
#'   methods use a pseudocount of 1 to eliminate log(0) errors. If you are using 
#'   data that has not been log-transformed (for example, corrected counts from 
#'   SCTransform), set this to NA. 
#' 
#' @return The function returns a list of dataframes. Each list element contains
#'   a named list of clusters at that resolution. Each of those list elements
#'   contains a dataframe of three variables, where each sample is a gene.
#'   \code{DR} is the proportion of cells in the cluster in which that gene was
#'   detected. \code{MDGE} is mean normalized gene expression for that gene in
#'   only the cells in which it was detected (see \code{\link{meanLogX}} for
#'   mean calculation). \code{MGE} is the mean normalized gene expression for
#'   that gene in all cells of the cluster (see \code{\link{meanLogX}} for mean
#'   calculation).

fx_calcCGS <- function(nge,cl,exponent,pseudocount) {
  temp_DR <- sapply(levels(cl),function(X) 
    data.frame(DR=RowNNZ(nge[,cl %in% X]) / sum(cl %in% X)),simplify=F)
  message("-- Calculating cluster-wise gene statistics --")
  temp_Ms <- pbapply::pbsapply(
    sapply(levels(cl),function(i) nge[,cl %in% i,drop=F],simplify=F),
    function(X) data.frame(
      MDGE=switch(any(is.na(c(exponent,pseudocount))) + 1,
                  apply(X,1,function(Y) 
                    meanLogX(Y[Y > 0],
                             ncell=ncol(nge),
                             ex=exponent,
                             pc=pseudocount)
                  ),
                  log2(apply(X,1,function(Y) mean(Y[Y > 0])) + 1/ncol(nge))
      ),
      MGE=switch(any(is.na(c(exponent,pseudocount))) + 1,
                 apply(X,1,function(Y) 
                   meanLogX(Y,
                            ncell=ncol(nge),
                            ex=exponent,
                            pc=pseudocount)
                 ),
                 log2(Matrix::rowMeans(X) + 1/ncol(nge))
      )
    ),simplify=F)
  for (l in names(temp_Ms)) { temp_Ms[[l]][is.na(temp_Ms[[l]])] <- 0 }
  return(sapply(names(temp_Ms),function(X) cbind(temp_DR[[X]],temp_Ms[[X]]),simplify=F))
}

# fx_calcCGS <- function(nge,cl,exponent,pseudocount) {
#   message("-- Calculating gene detection rate per cluster --")
#   DR <- pbapply::pbsapply(levels(cl),function(X)
#     RowNNZ(nge[,cl == X]) / sum(cl == X),simplify=F)
# 
#   message("-- Calculating mean detected gene expression per cluster --")
#   if (any(is.na(c(exponent,pseudocount)))) {
#     MDGE <- pbapply::pbsapply(sapply(levels(cl),function(i) nge[,cl %in% i,drop=F],simplify=F),
#                               function(X)
#                                 log2(apply(X,1,function(Y) mean(Y[Y > 0])) + 1/ncol(nge)),
#                               simplify=F)
#   } else {
#     MDGE <- pbapply::pbsapply(sapply(levels(cl),function(i) nge[,cl %in% i,drop=F],simplify=F),
#                               function(X) apply(X,1,function(Y) {
#                                 temp <- meanLogX(Y[Y > 0],
#                                                  ncell=ncol(nge),
#                                                  ex=exponent,
#                                                  pc=pseudocount)
#                                 if (is.na(temp)) { temp <- 0 }
#                                 return(temp)
#                               }),simplify=F)
#   }
# 
#   message("-- Calculating mean gene expression per cluster --")
#   if (any(is.na(c(exponent,pseudocount)))) {
#     MGE <- pbapply::pbsapply(sapply(levels(cl),function(i) nge[,cl %in% i,drop=F],simplify=F),
#                              function(X) log2(Matrix::rowMeans(X) + 1/ncol(nge)),simplify=F)
#   } else {
#     MGE <- pbapply::pbsapply(sapply(levels(cl),function(i) nge[,cl %in% i,drop=F],simplify=F),
#                              function(X) apply(X,1,function(Y)
#                                meanLogX(Y,
#                                         ncell=ncol(nge),
#                                         ex=exponent,
#                                         pc=pseudocount)),simplify=F)
#   }
# 
#   return(sapply(levels(cl),function(X)
#     data.frame(DR=DR[[X]],MDGE=MDGE[[X]],MGE=MGE[[X]]),simplify=F))
# }

#' Internal fx for cluster-wise gene statistics using BiocParallel
#'
#' Internal function. See \code{\link{CalcCGS}}.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param exponent A length-one numeric vector representing the
#'   base of the log-normalized gene expression data to be processed. Generally
#'   gene expression data is transformed into log2 space when normalizing (set
#'   this to 2), though \code{Seurat} uses the natural log (set this to exp(1)). 
#'   If you are using data that has not been log-transformed (for example, 
#'   corrected counts from SCTransform), set this to NA.
#' @param pseudocount A length-one numeric vector representing the
#'   pseudocount added to all log-normalized values in your input data. Most
#'   methods use a pseudocount of 1 to eliminate log(0) errors. If you are using 
#'   data that has not been log-transformed (for example, corrected counts from 
#'   SCTransform), set this to NA. 
#' 
#' @return The function returns a list of dataframes. Each list element contains
#'   a named list of clusters at that resolution. Each of those list elements
#'   contains a dataframe of three variables, where each sample is a gene.
#'   \code{DR} is the proportion of cells in the cluster in which that gene was
#'   detected. \code{MDGE} is mean normalized gene expression for that gene in
#'   only the cells in which it was detected (see \code{\link{meanLogX}} for
#'   mean calculation). \code{MGE} is the mean normalized gene expression for
#'   that gene in all cells of the cluster (see \code{\link{meanLogX}} for mean
#'   calculation).

fx_calcCGS_BP <- function(nge,cl,exponent,pseudocount) {
  message("-- Calculating gene detection rate per cluster --")
  DR <- BiocParallel::bplapply(sapply(levels(cl),function(i) nge[,cl %in% i],simplify=F),
                               function(X) apply(X,1,function(Y) sum(Y>0)/length(Y)))
  names(DR) <- levels(cl)

  message("-- Calculating mean detected gene expression per cluster --")
  MDGE <- BiocParallel::bplapply(sapply(levels(cl),function(i) nge[,cl %in% i],simplify=F),
                                 function(X) apply(X,1,function(Y) {
                                   temp <- meanLogX(Y[Y>0],
                                                    ncell=ncol(nge),
                                                    ex=exponent,
                                                    pc=pseudocount)
                                   if (is.na(temp)) { temp <- 0 }
                                   return(temp)
                                 }))
  names(MDGE) <- levels(cl)

  message("-- Calculating mean gene expression per cluster --")
  MGE <- BiocParallel::bplapply(sapply(levels(cl),function(i) nge[,cl %in% i],simplify=F),
                                function(X) apply(X,1,function(Y)
                                  meanLogX(Y,
                                           ncell=ncol(nge),
                                           ex=exponent,
                                           pc=pseudocount)))
  names(MGE) <- levels(cl)

  return(sapply(levels(cl),function(X) 
    data.frame(DR=DR[[X]],MDGE=MDGE[[X]],MGE=MGE[[X]]),simplify=F))
}


#' Calculate cluster-wise gene statistics for sCVdata
#'
#' Calculates gene summary statistics per cluster for the clusters in an sCVdata
#' object, using the gene expression matrix from the input data object. This is
#' called by \code{\link{CalcSCV}} and you shouldn't need to call it on its own.
#'
#' To help track its progress, this function uses progress bars from
#' \code{pbapply}. To disable these, set
#' \code{\link[pbapply]{pboptions}(type="none")}. To re-enable, set
#' \code{\link[pbapply]{pboptions}(type="timer")}.
#'
#' @param sCVd An sCVdata object.
#' @param inD The input dataset. An object of class \code{\link[Seurat]{seurat}}
#'   or \code{\link[SingleCellExperiment]{SingleCellExperiment}}. Other data
#'   classes are not currently supported.
#'   \href{https://github.com/BaderLab/scClustViz/issues}{Please submit requests
#'   for other data objects here!}
#'
#' @return The function returns a list of dataframes. Each list element contains
#'   a named list of clusters at that resolution. Each of those list elements
#'   contains a dataframe where each sample is a gene, containing the following
#'   variables: \code{DR} is the proportion of cells in the cluster in which
#'   that gene was detected. \code{MDGE} is mean normalized gene expression for
#'   that gene in only the cells in which it was detected (see
#'   \code{\link{meanLogX}} for mean calculation). \code{MGE} is the mean
#'   normalized gene expression for that gene in all cells of the cluster (see
#'   \code{\link{meanLogX}} for mean calculation).
#'
#' @seealso \code{\link{CalcSCV}} for wrapper function to calculate all
#'   statistics for an sCVdata object, and \code{\link{fx_calcCGS}} for the
#'   internal function performing the calculations.
#'
#' @examples
#' \dontrun{
#' ClustGeneStats(your_sCV_obj) <- CalcCGS(sCVd=your_sCV_obj,
#'                                         inD=your_scRNAseq_data_object)
#' }
#' 
#' @name CalcCGS
#'
#' @export
#' 

setGeneric("CalcCGS",function(sCVd,inD) standardGeneric("CalcCGS"))


#' @describeIn CalcCGS Calculate cluster-wise gene stats for sCVdata
#' @export

setMethod("CalcCGS",signature("sCVdata"),
          function(sCVd,inD) {
            # if (UseBiocParallel) {
            #   fx_calcCGS_BP(nge=getExpr(inD,Param(sCVd,"assayType")),
            #                 cl=Clusters(sCVd),
            #                 exponent=Param(sCVd,"exponent"),
            #                 pseudocount=Param(sCVd,"pseudocount"))
            # } else {
            fx_calcCGS(nge=getExpr(inD,
                                   Param(sCVd,"assayType")[1],
                                   Param(sCVd,"assayType")[2]),
                       cl=Clusters(sCVd),
                       exponent=Param(sCVd,"exponent"),
                       pseudocount=Param(sCVd,"pseudocount"))
            # }
          })


# CalcDEvsRest ----

#' Internal fx to calculate logGER for DEvsRest calculation
#'
#' Internal function. See \code{\link{CalcDEvsRest}}.
#'
#' Calculates the log-ratios of gene expression for all genes in each one-vs-all
#' comparison of a cluster vs the rest of the data. This is used to determine
#' the genes used in DEvsRest calculations.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param CGS The output from \code{\link{CalcCGS}}.
#' @param exponent A length-one numeric vector representing the
#'   base of the log-normalized gene expression data to be processed. Generally
#'   gene expression data is transformed into log2 space when normalizing (set
#'   this to 2), though \code{Seurat} uses the natural log (set this to exp(1)). 
#'   If you are using data that has not been log-transformed (for example, 
#'   corrected counts from SCTransform), set this to NA.
#' @param pseudocount A length-one numeric vector representing the
#'   pseudocount added to all log-normalized values in your input data. Most
#'   methods use a pseudocount of 1 to eliminate log(0) errors. If you are using 
#'   data that has not been log-transformed (for example, corrected counts from 
#'   SCTransform), set this to NA. 
#' @param DRthresh The threshold for minimum detection rate of a gene in the
#'   cluster for the gene to be considered in the following Wilcoxon rank-sum
#'   test.
#'
#' @return The function returns a list where each list element is the log-ratios
#'   of gene expression when comparing each gene in a cluster to the rest of the
#'   cells as a whole in a one vs all comparison. These logGER tables are
#'   filtered to only include those gene that pass logGER threshold, and thus
#'   the names for each list entry correspond to the genes to test in
#'   \code{\link{fx_calcDEvsRest}}.
#'   

fx_calcESvsRest <- function(nge,cl,CGS,exponent,pseudocount,DRthresh) {
  message("-- Calculating differential expression cluster vs rest effect size --")
  return(pbapply::pbsapply(levels(cl),function(i) {
    data.frame(
      logGER=CGS[[i]][CGS[[i]]$DR >= DRthresh,"MGE"] - 
        switch(any(is.na(c(exponent,pseudocount))) + 1,
               apply(
                 nge[CGS[[i]]$DR >= DRthresh,(!cl %in% i | is.na(cl))],
                 MARGIN=1,
                 FUN=meanLogX,
                 ncell=ncol(nge),
                 ex=exponent,
                 pc=pseudocount
               ),
               log2(Matrix::rowMeans(
                 nge[CGS[[i]]$DR >= DRthresh,(!cl %in% i | is.na(cl))]
               ) + 1/ncol(nge))
        ),
      Wstat=NA,
      pVal=NA,
      FDR=NA,
      row.names=rownames(CGS[[i]])[CGS[[i]]$DR >= DRthresh]
    )
  },simplify=F))
}


#' Internal fx to calculate logGER for DEvsRest calculation using BiocParallel
#'
#' Internal function. See \code{\link{CalcDEvsRest}}.
#'
#' Calculates the log-ratios of gene expression for all genes in each one-vs-all
#' comparison of a cluster vs the rest of the data. This is used to determine
#' the genes used in DEvsRest calculations.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param CGS The output from \code{\link{CalcCGS}}.
#' @param exponent A length-one numeric vector representing the
#'   base of the log-normalized gene expression data to be processed. Generally
#'   gene expression data is transformed into log2 space when normalizing (set
#'   this to 2), though \code{Seurat} uses the natural log (set this to exp(1)). 
#'   If you are using data that has not been log-transformed (for example, 
#'   corrected counts from SCTransform), set this to NA.
#' @param pseudocount A length-one numeric vector representing the
#'   pseudocount added to all log-normalized values in your input data. Most
#'   methods use a pseudocount of 1 to eliminate log(0) errors. If you are using 
#'   data that has not been log-transformed (for example, corrected counts from 
#'   SCTransform), set this to NA. 
#' @param DRthresh The threshold for minimum detection rate of a gene in the
#'   cluster for the gene to be considered in the following Wilcoxon rank-sum
#'   test.
#'
#' @return The function returns a list where each list element is the log-ratios
#'   of gene expression when comparing each gene in a cluster to the rest of the
#'   cells as a whole in a one vs all comparison. These logGER tables are
#'   filtered to only include those gene that pass logGER threshold, and thus
#'   the names for each list entry correspond to the genes to test in
#'   \code{\link{fx_calcDEvsRest}}.
#'   

fx_calcESvsRest_BP <- function(nge,cl,CGS,exponent,pseudocount,DRthresh) {
  message("-- Calculating differential expression cluster vs rest effect size --")
  temp <- BiocParallel::bplapply(levels(cl),function(i) {
    temp <- data.frame(
      # overThreshold = CGS[[i]]$DR >= DRthresh, #deprecated
                       logGER=NA,
                       Wstat=NA,
                       pVal=NA,
                       FDR=NA)
    rownames(temp) <- rownames(CGS[[i]])
    temp[temp$overThreshold,"logGER"] <- CGS[[i]][temp$overThreshold,"MGE"] -
      apply(nge[temp$overThreshold,(!cl %in% i | is.na(cl))],1,function(Y)
        meanLogX(Y,ncell=ncol(nge),ex=exponent,pc=pseudocount))
    return(temp)
  })
  names(temp) <- levels(cl)
  return(temp)
}


#' Internal fx to perform one vs all DE testing
#'
#' Internal function. See \code{\link{CalcDEvsRest}}.
#'
#' Calculates Wilcoxon rank-sum tests for all genes in each one-vs-all
#' comparison of a cluster vs the rest of the data. You probably don't need to
#' use this unless you're trying to customize \code{\link{clusterWiseDEtest}}.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param deTes The output from \code{\link{fx_calcESvsRest}}.
#'
#' @return Differential testing results from Wilcoxon rank sum tests comparing a
#'   gene in each cluster to the rest of the cells as a whole in a one vs all
#'   comparison. The results are stored as a named list of dataframes. There is
#'   a list element for each cluster containing a dataframe of three variables,
#'   where each sample is a gene. \code{logGER} is the log gene expression ratio
#'   calculated by subtracting the mean expression of the gene (see
#'   \link{meanLogX} for mean calculation) in all other cells from the mean
#'   expression of the gene in this cluster. \code{Wstat} and \code{pVal} are
#'   the test statistic and the p-value of the Wilcoxon rank sum test.
#'   \code{FDR} is the false discovery rate-corrected p-value of the test.
#'   

fx_calcDEvsRest <- function(nge,cl,deTes) {
  message("-- Testing differential expression cluster vs rest --")
  deT_pVal <- presto::wilcoxauc(X=nge,y=cl)
  for (i in names(deTes)) {
    tempRows <- deT_pVal$feature %in% rownames(deTes[[i]]) & deT_pVal$group == i
    deTes[[i]][deT_pVal[tempRows,"feature"],"Wstat"] <- deT_pVal[tempRows,"statistic"]
    deTes[[i]][deT_pVal[tempRows,"feature"],"pVal"] <- deT_pVal[tempRows,"pval"]
    deTes[[i]][deT_pVal[tempRows,"feature"],"FDR"] <- p.adjust(deT_pVal[tempRows,"pval"],"fdr")
  } 
  return(deTes)
}


#' Internal fx to perform one vs all DE testing using BiocParallel
#'
#' Internal function. See \code{\link{CalcDEvsRest}}.
#'
#' Calculates Wilcoxon rank-sum tests for all genes in each one-vs-all
#' comparison of a cluster vs the rest of the data. You probably don't need to
#' use this unless you're trying to customize \code{\link{clusterWiseDEtest}}.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param deTes The output from \code{\link{fx_calcESvsRest}}.
#'
#' @return Differential testing results from Wilcoxon rank sum tests comparing a
#'   gene in each cluster to the rest of the cells as a whole in a one vs all
#'   comparison. The results are stored as a named list of dataframes. There is
#'   a list element for each cluster containing a dataframe of three variables,
#'   where each sample is a gene. \code{logGER} is the log gene expression ratio
#'   calculated by subtracting the mean expression of the gene (see
#'   \link{meanLogX} for mean calculation) in all other cells from the mean
#'   expression of the gene in this cluster. \code{Wstat} and \code{pVal} are
#'   the test statistic and the p-value of the Wilcoxon rank sum test.
#'   \code{FDR} is the false discovery rate-corrected p-value of the test.
#'   

fx_calcDEvsRest_BP <- function(nge,cl,deTes) {
  message("-- Testing differential expression cluster vs rest --")
  deT_pVal <- BiocParallel::bplapply(levels(cl),function(i)
    apply(nge[rownames(deTes[[i]]),],1,function(X)
      # suppressWarnings(wilcox.test(X[cl %in% i],X[!cl %in% i],alternative="greater")$p.value)
      suppressWarnings(unlist(wilcox.test(X[cl %in% i],X[!cl %in% i])[c("statistic","p.value")]))
    ))
  names(deT_pVal) <- levels(cl)
  for (i in names(deTes)) {
    deTes[[i]][colnames(deT_pVal[[i]]),"Wstat"] <- deT_pVal[[i]]["statistic.W",]
    deTes[[i]][colnames(deT_pVal[[i]]),"pVal"] <- deT_pVal[[i]]["p.value",]
    deTes[[i]][colnames(deT_pVal[[i]]),"FDR"] <- p.adjust(deT_pVal[[i]]["p.value",],"fdr")
  } 
  return(deTes)
}


#' Internal fx to perform one vs all DE testing using base R
#'
#' Internal function. See \code{\link{CalcDEvsRest}}.
#'
#' Calculates Wilcoxon rank-sum tests for all genes in each one-vs-all
#' comparison of a cluster vs the rest of the data. You probably don't need to
#' use this unless you're trying to customize \code{\link{clusterWiseDEtest}}.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param deTes The output from \code{\link{fx_calcESvsRest}}.
#'
#' @return Differential testing results from Wilcoxon rank sum tests comparing a
#'   gene in each cluster to the rest of the cells as a whole in a one vs all
#'   comparison. The results are stored as a named list of dataframes. There is
#'   a list element for each cluster containing a dataframe of three variables,
#'   where each sample is a gene. \code{logGER} is the log gene expression ratio
#'   calculated by subtracting the mean expression of the gene (see
#'   \link{meanLogX} for mean calculation) in all other cells from the mean
#'   expression of the gene in this cluster. \code{Wstat} and \code{pVal} are
#'   the test statistic and the p-value of the Wilcoxon rank sum test.
#'   \code{FDR} is the false discovery rate-corrected p-value of the test.
#'   

fx_calcDEvsRest_slow <- function(nge,cl,deTes) {
  message("-- Testing differential expression cluster vs rest --")
  deT_pVal <- pbapply::pbsapply(levels(cl),function(i)
    apply(nge[rownames(deTes[[i]]),],1,function(X) 
      # ^ slice by rowname is a little slower, but safer
      # suppressWarnings(wilcox.test(X[cl %in% i],X[!cl %in% i],alternative="greater")$p.value)
      suppressWarnings(unlist(wilcox.test(X[cl %in% i],X[!cl %in% i])[c("statistic","p.value")]))
    ),simplify=F)
  for (i in names(deTes)) {
    deTes[[i]][colnames(deT_pVal[[i]]),"Wstat"] <- deT_pVal[[i]]["statistic.W",]
    deTes[[i]][colnames(deT_pVal[[i]]),"pVal"] <- deT_pVal[[i]]["p.value",]
    deTes[[i]][colnames(deT_pVal[[i]]),"FDR"] <- p.adjust(deT_pVal[[i]]["p.value",],"fdr")
  } 
  return(deTes)
}


#' Calculates one vs. all DE tests for sCVdata
#'
#' Performs differential gene expression tests for each cluster in an sCVdata
#' object, comparing the cells in the cluster to the remaining cells in the data
#' using the gene expression matrix of input data object. Alternatively, this
#' function can be skipped, and existing DE test results can be assigned
#' directly to the sCVdata object.
#'
#' This function performs Wilcoxon rank sum tests comparing gene expression
#' between each cluster and all other cells in the input data. Gene expression
#' ratio in log space (\code{logGER}) is reported for all genes in the
#' comparison. Genes are tested if they are detected in the cluster at a higher
#' proportion than \code{Param(sCVd,"DRthresh")}, and both unadjusted p-values
#' and false discovery rates are reported for all genes tested. To help track
#' its progress, this function uses progress bars from \code{pbapply}. To
#' disable these, set \code{\link[pbapply]{pboptions}(type="none")}. To
#' re-enable, set \code{\link[pbapply]{pboptions}(type="timer")}.
#'
#' If using existing DE test results, assign results of one vs. all tests for
#' every cluster in sCVdata to the \code{\link{DEvsRest}} slot of the
#' \code{\link{sCVdata}} object. See example and slot documentation.
#'
#' @param sCVd An sCVdata object.
#' @param inD The input dataset. An object of class \code{\link[Seurat]{seurat}}
#'   or \code{\link[SingleCellExperiment]{SingleCellExperiment}}. Other data
#'   classes are not currently supported.
#'   \href{https://github.com/BaderLab/scClustViz/issues}{Please submit requests
#'   for other data objects here!}
#'
#' @return A named list of data frames, one entry for each level in
#'   \code{Clusters(sCVd)} (with corresponding name). Each entry is data frame
#'   containing gene differential expression stats when comparing the cells of
#'   that cluster to all other cells in the input data. Rows represent genes,
#'   and variables include \code{logGER} (an effect size measure: gene
#'   expression ratio in log space, often referred to as logFC) and \code{FDR}
#'   (significance measure: false discovery rate). Also included are
#'   \code{Wstat} and \code{pVal}, the test statistic and the p-value of the
#'   Wilcoxon rank sum test.
#'
#' @seealso \code{\link{CalcSCV}} for wrapper function to calculate all
#'   statistics for an sCVdata object,  and \code{\link{fx_calcESvsRest}} and
#'   \code{\link{fx_calcDEvsRest}} for the internal functions performing the
#'   calculations. Wilcox test is now powered by \code{\link[presto]{wilcoxauc}}
#'   for super speed.
#'
#' @examples
#' \dontrun{
#' ## Example using CalcDEvsRest ##
#' DEvsRest(your_sCV_obj) <- CalcDEvsRest(sCVd=your_sCV_obj,
#'                                        inD=your_scRNAseq_data_object)
#'
#'
#' ## Example using MAST results from Seurat to replace CalcDEvsRest ##
#' MAST_oneVSall <- FindAllMarkers(your_seurat_obj,
#'                                 logfc.threshold=0,
#'                                 min.pct=0.1,
#'                                 test.use="MAST",
#'                                 latent.vars="nUMI")
#' # ^ FindAllMarkers and CalcDEvsRest do equivalent comparisons 
#' 
#' names(MAST_oneVSall)[names(MAST_oneVSall) == "avg_logFC"] <- "logGER"
#' # ^ Effect size variable must be named 'logGER'
#' names(MAST_oneVSall)[names(MAST_oneVSall) == "p_val_adj"] <- "FDR"
#' # ^ Significance variable must be named 'FDR'
#' 
#' MAST_oneVSall_list <- sapply(levels(MAST_oneVSall$cluster),
#'                              function(X) {
#'                                temp <- MAST_oneVSall[MAST_oneVSall$cluster == X,]
#'                                rownames(temp) <- temp$gene
#'                                # ^ Rownames must be gene names.
#'                                return(temp)
#'                              },simplify=F)
#' # ^ Dataframe converted to list of dataframes per cluster
#' 
#' DEvsRest(your_sCV_obj) <- MAST_oneVSall_list
#' # ^ Slot MAST results into sCVdata object
#'
#' }
#'
#' @name CalcDEvsRest
#'
#' @export
#' 

setGeneric("CalcDEvsRest",function(sCVd,inD) 
  standardGeneric("CalcDEvsRest"))


#' @describeIn CalcDEvsRest Calculate one vs. all DE tests for sCVdata
#' @export

setMethod("CalcDEvsRest","sCVdata",
          function(sCVd,inD) {
            if (!is(inD)[1] %in% findMethodSignatures(getExpr)) {
              stop(paste("The input data object must be one of:",
                         paste(findMethodSignatures(getExpr),collapse=", "),
                         sep="\n  "))
            }
            if (length(levels(Clusters(sCVd))) <= 1) {
              stop("scClustViz can't calculate differential expression when there's only one cluster.")
            }
            
            # if (UseBiocParallel) {
            #   deTes <- fx_calcESvsRest_BP(nge=getExpr(inD,Param(sCVd,"assayType")),
            #                               cl=Clusters(sCVd),
            #                               CGS=ClustGeneStats(sCVd),
            #                               exponent=Param(sCVd,"exponent"),
            #                               pseudocount=Param(sCVd,"pseudocount"),
            #                               DRthresh=Param(sCVd,"DRthresh"))
            # } else {
            deTes <- fx_calcESvsRest(nge=getExpr(inD,
                                                 Param(sCVd,"assayType")[1],
                                                 Param(sCVd,"assayType")[2]),
                                     cl=Clusters(sCVd),
                                     CGS=ClustGeneStats(sCVd),
                                     exponent=Param(sCVd,"exponent"),
                                     pseudocount=Param(sCVd,"pseudocount"),
                                     DRthresh=Param(sCVd,"DRthresh"))
            # }
            # if (UseBiocParallel) {
            #   deTes <- fx_calcDEvsRest_BP(nge=getExpr(inD,Param(sCVd,"assayType")),
            #                               cl=Clusters(sCVd),
            #                               deTes=deTes)
            # } else {
            if (require(presto)) {
              deTes <- fx_calcDEvsRest(nge=getExpr(inD,
                                                   Param(sCVd,"assayType")[1],
                                                   Param(sCVd,"assayType")[2]),
                                       cl=Clusters(sCVd),
                                       deTes=deTes)
            } else {
              warning(
                paste("",
                      "--------------------------------------------------------",
                      "This wilcoxon rank-sum test is 1000x faster with presto.",
                      "Install by running:",
                      "devtools::install('immunogenomics/presto')",
                      "--------------------------------------------------------",
                      "",
                      sep="\n")
              )
              deTes <- fx_calcDEvsRest_slow(nge=getExpr(inD,
                                                        Param(sCVd,"assayType")[1],
                                                        Param(sCVd,"assayType")[2]),
                                            cl=Clusters(sCVd),
                                            deTes=deTes)
            }
            # }
            return(deTes)
          })


# CalcDEcombn ----

#' Internal fx to calculate genes used for deVS calculation
#'
#' Internal function. See \code{\link{CalcDEcombn}}.
#'
#' Calculates the log-ratios of gene expression and difference in detection rate
#' for all genes in each of the potential combinations of clusters to compare.
#' This is used to determine the genes used in deVS calculations.
#'
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param CGS The output from \code{\link{fx_calcCGS}}.
#' @param DRthresh The threshold for minimum detection rate of a gene to be
#'   considered in the following Wilcoxon rank-sum test. Gene must pass
#'   threshold in at least one of the pair of clusters.
#'
#' @return The function returns a list where each list element is a dataframe
#'   with effect size statistics (log-ratios of gene expression and difference
#'   in detection rate) when comparing each gene in a cluster to the rest of the
#'   cells as a whole in a one vs all comparison. These dataframes are filtered
#'   to only include those gene that pass the relevant threshold, and thus the
#'   rownames for each list entry correspond to the genes to test in
#'   \code{\link{fx_calcDEcombn}}.
#'

fx_calcEScombn <- function(cl,CGS,DRthresh) {
  combos <- combn(levels(cl),2)
  colnames(combos) <- apply(combos,2,function(X) paste(X,collapse="-"))
  return(apply(combos,2,function(i) {
    l <- CGS[[i[1]]]$DR >= DRthresh | CGS[[i[2]]]$DR >= DRthresh
    data.frame(
      logGER=CGS[[i[1]]]$MGE[l] - CGS[[i[2]]]$MGE[l],
      dDR=CGS[[i[1]]]$DR[l] - CGS[[i[2]]]$DR[l],
      row.names=rownames(CGS[[i[1]]])[l]
    )
  }))
}


#' Internal fx to calculate DE between combinations of clusters
#'
#' Internal function. See \code{\link{CalcDEcombn}}.
#'
#' Calculates Wilcoxon rank-sum tests for all genes in each of the potential
#' combinations of clusters to compare.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param deMes The output from \code{\link{fx_calcEScombn}}.
#'
#' @return Differential testing results from Wilcoxon rank sum tests comparing a
#'   gene in each cluster to that gene in every other cluster in a series of
#'   tests. The results are stored as a nested list of dataframes. Each list
#'   element contains a named list of clusters (cluster A). Each of those lists
#'   contains a named list of all the other clusters (cluster B). Each of those
#'   list elements contains a dataframe of four variables, where each sample is
#'   a gene. \code{dDR} is the difference in detection rate of that gene between
#'   the two clusters (DR[A] - DR[B]). \code{logGER} is the log gene expression
#'   ratio calculated by taking the difference in mean expression of the gene
#'   (see \code{\link{meanLogX}} for mean calculation) between the two clusters
#'   (MGE[A] - MGE[B]). \code{Wstat} and \code{pVal} are the test statistic and
#'   the p-value of the Wilcoxon rank sum test. \code{FDR} is the false
#'   discovery rate-corrected p-value of the test.
#'   

fx_calcDEcombn <- function(nge,cl,deMes) {
  combosL <- strsplit(names(deMes),"-")
  message("-- Testing differential expression between clusters --")
  deM_pVal <- pbapply::pbsapply(combosL,function(G) {
    temp <- presto::wilcoxauc(X=nge,y=cl,groups_use=G)
    temp <- temp[temp$group == G[1],]
    return(temp)
  },simplify=F)
  names(deM_pVal) <- names(deMes)
  for (i in names(deMes)) {
    tempRows <- deM_pVal[[i]]$feature %in% rownames(deMes[[i]])
    deMes[[i]][deM_pVal[[i]][tempRows,"feature"],"Wstat"] <- deM_pVal[[i]][tempRows,"statistic"]
    deMes[[i]][deM_pVal[[i]][tempRows,"feature"],"pVal"] <- deM_pVal[[i]][tempRows,"pval"]
    deMes[[i]][deM_pVal[[i]][tempRows,"feature"],"FDR"] <- p.adjust(deM_pVal[[i]][tempRows,"pval"],"fdr")
  } 
  return(deMes)
}


#' Internal fx to calculate DE between combinations of clusters using BiocParallel
#'
#' Internal function. See \code{\link{CalcDEcombn}}.
#'
#' Calculates Wilcoxon rank-sum tests for all genes in each of the potential
#' combinations of clusters to compare.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param deMes The output from \code{\link{fx_calcEScombn}}.
#'
#' @return Differential testing results from Wilcoxon rank sum tests comparing a
#'   gene in each cluster to that gene in every other cluster in a series of
#'   tests. The results are stored as a nested list of dataframes. Each list
#'   element contains a named list of clusters (cluster A). Each of those lists
#'   contains a named list of all the other clusters (cluster B). Each of those
#'   list elements contains a dataframe of four variables, where each sample is
#'   a gene. \code{dDR} is the difference in detection rate of that gene between
#'   the two clusters (DR[A] - DR[B]). \code{logGER} is the log gene expression
#'   ratio calculated by taking the difference in mean expression of the gene
#'   (see \code{\link{meanLogX}} for mean calculation) between the two clusters
#'   (MGE[A] - MGE[B]). \code{Wstat} and \code{pVal} are the test statistic and
#'   the p-value of the Wilcoxon rank sum test. \code{FDR} is the false
#'   discovery rate-corrected p-value of the test.
#'   

fx_calcDEcombn_BP <- function(nge,cl,deMes) {
  combosL <- strsplit(names(deMes),"-")
  message("-- Testing differential expression between clusters --")
  deM_pVal <- BiocParallel::bplapply(seq_along(combosL),function(i)
    apply(nge[rownames(deMes[[i]]),],1,function(X)
      suppressWarnings(unlist(
        wilcox.test(X[cl == combosL[[i]][1]],
                    X[cl == combosL[[i]][2]])[c("statistic","p.value")]
      ))
    )
  )
  for (i in seq_along(deMes)) {
    deMes[[i]][colnames(deM_pVal[[i]]),"Wstat"] <- deM_pVal[[i]]["statistic.W",]
    deMes[[i]][colnames(deM_pVal[[i]]),"pVal"] <- deM_pVal[[i]]["p.value",]
    deMes[[i]][colnames(deM_pVal[[i]]),"FDR"] <- p.adjust(deM_pVal[[i]]["p.value",],"fdr")
  } 
  return(deMes)
}


#' Internal fx to calculate DE between combinations of clusters with base R
#'
#' Internal function. See \code{\link{CalcDEcombn}}.
#'
#' Calculates Wilcoxon rank-sum tests for all genes in each of the potential
#' combinations of clusters to compare.
#'
#' @param nge The log-normalized gene expression matrix.
#' @param cl The factor with cluster assignments per cell (column of nge).
#' @param deMes The output from \code{\link{fx_calcEScombn}}.
#'
#' @return Differential testing results from Wilcoxon rank sum tests comparing a
#'   gene in each cluster to that gene in every other cluster in a series of
#'   tests. The results are stored as a nested list of dataframes. Each list
#'   element contains a named list of clusters (cluster A). Each of those lists
#'   contains a named list of all the other clusters (cluster B). Each of those
#'   list elements contains a dataframe of four variables, where each sample is
#'   a gene. \code{dDR} is the difference in detection rate of that gene between
#'   the two clusters (DR[A] - DR[B]). \code{logGER} is the log gene expression
#'   ratio calculated by taking the difference in mean expression of the gene
#'   (see \code{\link{meanLogX}} for mean calculation) between the two clusters
#'   (MGE[A] - MGE[B]). \code{Wstat} and \code{pVal} are the test statistic and
#'   the p-value of the Wilcoxon rank sum test. \code{FDR} is the false
#'   discovery rate-corrected p-value of the test.
#'   

fx_calcDEcombn_slow <- function(nge,cl,deMes) {
  combosL <- strsplit(names(deMes),"-")
  message("-- Testing differential expression between clusters --")
  deM_pVal <- pbapply::pbsapply(seq_along(combosL),function(i)
    apply(nge[rownames(deMes[[i]]),],1,function(X) 
      suppressWarnings(unlist(
        wilcox.test(X[cl == combosL[[i]][1]],
                    X[cl == combosL[[i]][2]])[c("statistic","p.value")]
      ))),simplify=F)
  for (i in seq_along(deMes)) {
    deMes[[i]][colnames(deM_pVal[[i]]),"Wstat"] <- deM_pVal[[i]]["statistic.W",]
    deMes[[i]][colnames(deM_pVal[[i]]),"pVal"] <- deM_pVal[[i]]["p.value",]
    deMes[[i]][colnames(deM_pVal[[i]]),"FDR"] <- p.adjust(deM_pVal[[i]]["p.value",],"fdr")
  } 
  return(deMes)
}


#' Performs DE testing between pairs of clusters in sCVdata
#'
#' Performs differential gene expression tests between each pairwise combination
#' of cluster in an sCVdata object using the gene expression matrix of input
#' data object. Alternatively, this function can be skipped, and existing DE
#' test results can be assigned directly to the sCVdata object.
#'
#' This function performs Wilcoxon rank sum tests comparing gene expression
#' between the cells of all pairwise combinations of cluster clusters in the
#' input data. Gene expression ratio in log space (\code{logGER}) and
#' differences in detection rates (\code{dDR}) are reported for all genes in the
#' comparison. Genes are tested if they are detected in at least one of the
#' cluster at a higher proportion than \code{Param(sCVd,"DRthresh")}, and both
#' unadjusted p-values and false discovery rates are reported for all genes
#' tested. To help track its progress, this function uses progress bars from
#' \code{pbapply}. To disable these, set
#' \code{\link[pbapply]{pboptions}(type="none")}. To re-enable, set
#' \code{\link[pbapply]{pboptions}(type="timer")}.
#'
#' If using existing DE test results, assign results of differential gene
#' expression tests for all pairwise combinations of clusters in sCVdata to the
#' \code{\link{DEcombn}} slot of the \code{\link{sCVdata}} object. See example
#' and slot documentation.
#'
#' @param sCVd An sCVdata object.
#' @param inD The input dataset. An object of class \code{\link[Seurat]{seurat}}
#'   or \code{\link[SingleCellExperiment]{SingleCellExperiment}}. Other data
#'   classes are not currently supported.
#'   \href{https://github.com/BaderLab/scClustViz/issues}{Please submit requests
#'   for other data objects here!}
#'
#' @return A named list of data frames, one entry for each pairwise combination
#'   of levels in \code{Clusters(sCVd)} (with corresponding name where levels
#'   are separated by '-'). Each entry is data frame containing gene
#'   differential expression stats when comparing the cells of that cluster to
#'   all other cells in the input data. Rows represent genes, and variables
#'   include \code{logGER} (an effect size measure: gene expression ratio in log
#'   space, often referred to as logFC), \code{dDR} (an effect size measure:
#'   difference in detection rate), and \code{FDR} (significance measure: false
#'   discovery rate). Also included are \code{Wstat} and \code{pVal}, the test
#'   statistic and the p-value of the Wilcoxon rank sum test.
#'
#' @seealso \code{\link{CalcSCV}} for wrapper function to calculate all
#'   statistics for an sCVdata object, and \code{\link{fx_calcEScombn}} and
#'   \code{\link{fx_calcDEcombn}} for the internal functions performing the
#'   calculations. Wilcox test is now powered by \code{\link[presto]{wilcoxauc}}
#'   for super speed.
#'
#' @examples
#' \dontrun{
#' ## Example using CalcDEvsRest ##
#' DEcombn(your_sCV_obj) <- CalcDEcombn(sCVd=your_sCV_obj,
#'                                      inD=your_scRNAseq_data_object)
#'
#'
#' ## Example using MAST results from Seurat to replace CalcDEcombn ##
#'
#' MAST_pw <- apply(combn(levels(your_seurat_obj@ident),2),
#'                  MARGIN=2,
#'                  function(X) {
#'                    FindMarkers(your_seurat_obj,
#'                                ident.1=X[1],
#'                                ident.2=X[2],
#'                                logfc.threshold=0,
#'                                min.pct=0.1,
#'                                test.use="MAST",
#'                                latent.vars="nUMI")
#'                  })
# ^ Test DE between every pairwise combination of clusters
#' names(MAST_pw) <- apply(combn(levels(your_seurat_obj@ident),2),2,
#'                         function(X) paste(X,collapse="-"))
#' # ^ Names must be in "X-Y" format
#' 
#' for (i in names(MAST_pw)) {
#'   MAST_pw[[i]]$dDR <- MAST_pw[[i]]$pct.1 - MAST_pw[[i]]$pct.2
#'   # ^ Diff in detect rate (dDR) must be a variable in each dataframe
#'   names(MAST_pw[[i]])[names(MAST_pw[[i]]) == "avg_logFC"] <- "logGER"
#'   # ^ Effect size variable must be named 'logGER'
#'   names(MAST_pw[[i]])[names(MAST_pw[[i]]) == "p_val_adj"] <- "FDR"
#'   # ^ Significance variable must be named 'FDR'
#'   # Note: rownames of each dataframe must be gene names, 
#'   # but FindMarkers should already do this.
#' }
#' DEcombn(your_sCV_obj) <- MAST_pw
#' # ^ Slot MAST results into sCVdata object
#'
#' }
#'
#' @name CalcDEcombn
#'
#' @export
#' 

setGeneric("CalcDEcombn",function(sCVd,inD)
  standardGeneric("CalcDEcombn"))


#' @describeIn CalcDEcombn Calculate DE between cluster pairs
#' @export

setMethod("CalcDEcombn","sCVdata",
          function(sCVd,inD) {
            if (!is(inD)[1] %in% findMethodSignatures(getExpr)) {
              stop(paste("The input data object must be one of:",
                         paste(findMethodSignatures(getExpr),collapse=", "),
                         sep="\n  "))
            }
            if (length(levels(Clusters(sCVd))) <= 1) {
              stop("scClustViz can't calculate differential expression when there's only one cluster.")
            }
            
            deMes <- fx_calcEScombn(cl=Clusters(sCVd),
                                    CGS=ClustGeneStats(sCVd),
                                    DRthresh=Param(sCVd,"DRthresh"))
            # if (UseBiocParallel) {
            #   deMes <- fx_calcDEcombn_BP(nge=getExpr(inD,Param(sCVd,"assayType")),
            #                              cl=Clusters(sCVd),
            #                              deMes=deMes)
            # } else {
            if (require(presto)) {
              
              deMes <- fx_calcDEcombn(nge=getExpr(inD,
                                                  Param(sCVd,"assayType")[1],
                                                  Param(sCVd,"assayType")[2]),
                                      cl=Clusters(sCVd),
                                      deMes=deMes)
            } else {
              warning(
                paste("",
                      "--------------------------------------------------------",
                      "This wilcoxon rank-sum test is 1000x faster with presto.",
                      "Install by running:",
                      "devtools::install('immunogenomics/presto')",
                      "--------------------------------------------------------",
                      "",
                      sep="\n")
              )
              deMes <- fx_calcDEcombn_slow(nge=getExpr(inD,
                                                       Param(sCVd,"assayType")[1],
                                                       Param(sCVd,"assayType")[2]),
                                           cl=Clusters(sCVd),
                                           deMes=deMes)
            }
            # }
            return(deMes)
          })


# CalcSilhouette ----

#' Internal fx to call \code{\link[cluster]{silhouette}} for sCVdata.
#'
#' Internal function. See \code{\link{CalcSilhouette}}.
#' 
#' @param pca The cell embeddings used to calculate clusters.
#' @param cl The cluster assignments.
#' 
#' @return A silhouette object.
#' 

fx_calcSilhouette <- function(pca,cl) {
  return(cluster::silhouette(as.integer(cl),dist(pca)))
}


#' Calculates silhouette widths for all cells in sCVdata
#'
#' Calls \code{\link[cluster]{silhouette}} to calculate silhouette widths (a
#' metric indicating each cell's contribution to cluster cohesion / separation)
#' for all cells in the sCVdata object, using the cell embedding used in
#' clustering (see \code{Param(sCVd,"DRforClust")}).
#'
#' @param sCVd An sCVdata object.
#' @param inD The input dataset. An object of class \code{\link[Seurat]{seurat}}
#'   or \code{\link[SingleCellExperiment]{SingleCellExperiment}}. Other data
#'   classes are not currently supported.
#'   \href{https://github.com/BaderLab/scClustViz/issues}{Please submit requests
#'   for other data objects here!}
#'
#' @return A silhouette object containing the silhouette widths for all cells in
#'   the sCVdata object.
#'
#' @seealso \code{\link{CalcSCV}} for wrapper function to calculate all
#'   statistics for an sCVdata object, \code{\link{fx_calcSilhouette}} for the
#'   internal function this method points to, and
#'   \code{\link[cluster]{silhouette}} for the function doing the calculations.
#'
#' @examples
#' \dontrun{
#' Silhouette(your_sCV_obj) <- CalcSilhouette(sCVd=your_sCV_obj,
#'                                            inD=your_scRNAseq_data_object)
#' }
#'
#' @name CalcSilhouette
#'
#' @export
#' 

setGeneric("CalcSilhouette",function(sCVd,inD) standardGeneric("CalcSilhouette"))


#' @describeIn CalcSilhouette Calculate silhouette widths for sCVdata
#' @export

setMethod("CalcSilhouette",signature("sCVdata"),function(sCVd,inD) {
  if (require(cluster)) {
    if (length(levels(Clusters(sCVd))) <= 1) {
      stop("Silhouette cannot be calculated with a single cluster.")
    }
    fx_calcSilhouette(pca=getEmb(inD,Param(sCVd,"DRforClust")),
                      cl=Clusters(sCVd))
  } else {
    stop(paste("Silhouette could not be calculated because package 'cluster' is missing.",
               "Try 'install.packages(cluster)', then run 'CalcSilhouette()' again.",sep="\n  "))
  }
})


# DEdist ----

#' Internal function for distance between clusters by number of DE genes.
#'
#' Internal function. See \code{\link{DEdist}}.
#'
#' Counts number of DE genes between each pair of clusters from
#' \code{\link{CalcDEcombn}} output.
#'
#' @param deVS List of pairwise DE results from \code{\link{CalcDEcombn}}
#' @param FDRthresh False discovery rate for counting significance.
#'
#' @return A matrix of distances between clusters for each cluster resolution.
#'   Interpretable by \code{\link[stats]{as.dist}} to generate a \code{dist}
#'   object.
#'   

fx_calcDist_numDE <- function(deVS,FDRthresh) {
  deD <- sapply(deVS,function(X) sum(X$FDR <= FDRthresh,na.rm=T))
  names(deD) <- NULL
  return(sapply(unique(unlist(strsplit(names(deVS),"-"))),function(X)
    sapply(unique(unlist(strsplit(names(deVS),"-"))),function(Y) {
      if (X == Y) {
        return(NA)
      } else {
        return(deD[sapply(strsplit(names(deVS),"-"),
                          function(comp) X %in% comp & Y %in% comp)])
      }
    }))) # returns a matrix of distances b/w clusters (for DEneighb)
}


#' Internal function for distance between clusters by DE score.
#'
#' Internal function. See \code{\link{DEdist}}.
#'
#' Sums absolute DE scores between each pair of clusters from
#' \code{\link{CalcDEcombn}} output. DE scores are \code{-log10(FDR)}. Used to
#' determine nearest neighbouring clusters.
#'
#' @param deVS List of pairwise DE results from \code{\link{CalcDEcombn}}
#'
#' @return A matrix of distances between clusters for each cluster resolution.
#'   Interpretable by \code{\link[stats]{as.dist}} to generate a \code{dist}
#'   object.
#'   

fx_calcDist_scoreDE <- function(deVS) {
  d <- vapply(deVS,function(X) {
    temp <- -log10(X$FDR) 
    temp[is.na(temp)] <- 0
    temp[temp == Inf] <- max(temp[temp < Inf]) + 1
    return(sum(temp^2))
    },FUN.VALUE=numeric(1))^0.5
  cb <- strsplit(names(deVS),"-")
  cl <- unique(unlist(cb))
  tempOut <- matrix(nrow=length(cl),ncol=length(cl),dimnames=list(cl,cl))
  for (i in seq_along(d)) {
    tempOut[cb[[i]][1],cb[[i]][2]] <- d[i]
    tempOut[cb[[i]][2],cb[[i]][1]] <- d[i]
  }
  return(tempOut) 
}

#' Internal function for distance between clusters by gene expression statistic.
#'
#' Internal function. See \code{\link{DEdist}}.
#'
#' Calculates euclidean distance in gene expression space between each pair of
#' clusters, using a summary statistic of gene expression for each cluster.
#' Calculated using \code{\link[stats]{dist}}.
#'
#' @param deVS List of pairwise DE results from \code{\link{CalcDEcombn}}
#' @param metric A summary statistic of gene expression per cluster, from
#'   \code{\link{CalcCGS}}
#'
#' @return A matrix of distances between clusters for each cluster resolution.
#'   Interpretable by \code{\link[stats]{as.dist}} to generate a \code{dist}
#'   object.
#'   

fx_calcDist_geneExpr <- function(CGS,metric) {
  if (!metric %in% names(CGS[[1]])) {
    stop(paste("metric must be one of:",paste(names(CGS[[1]]),collapse=", ")))
  }
  tempDist <- as.vector(dist(t(sapply(CGS,function(X) X[[metric]]))))
  tempNames <- apply(combn(names(CGS),2),2,function(X) paste(X,collapse="-"))
  return(sapply(unique(unlist(strsplit(tempNames,"-"))),function(X)
    sapply(unique(unlist(strsplit(tempNames,"-"))),function(Y) {
      if (X == Y) {
        return(NA)
      } else {
        return(tempDist[sapply(strsplit(tempNames,"-"),
                               function(comp) X %in% comp & Y %in% comp)])
      }
    }))) # returns a matrix of distances b/w clusters (for DEneighb)
}

#' Internal function for distance between clusters from cell embedding
#'
#' Internal function. See \code{\link{DEdist}}.
#'
#' Calculates euclidean distance between centroids of each cluster in the cell
#' embedding used for clustering. Calculated using \code{\link[stats]{dist}}.
#'
#' @param deVS List of pairwise DE results from \code{\link{CalcDEcombn}}
#' @param cellEmb A cell embedding with cells in rows and dimensions in columns.
#'
#' @return A matrix of distances between clusters for each cluster resolution.
#'   Interpretable by \code{\link[stats]{as.dist}} to generate a \code{dist}
#'   object.
#'   

fx_calcDist_cellEmb <- function(cl,cellEmb) {
  temp <- as.matrix(dist(apply(cellEmb,2,function(X) tapply(X,cl,mean))))
  temp[temp == 0] <- NA
  return(temp)
}


#' Calculate inter-cluster distances
#'
#' A variety of methods for calculating inter-cluster distances for sCVdata
#' objects, most of which return matrices which can be converted to
#' \code{\link[stats]{dist}} objects.
#'
#' @param sCVd An sCVdata object.
#' @param y One of the following, depending on the desired distance metric:
#'   \describe{
#'     \item{By number of differentially expressed genes:}{A numeric vector of 
#'       length one representing the threshold of false discovery rate 
#'       controlled for when counting significantly differentially expressed 
#'       genes.}
#'     \item{By differential expression significance score:}{Nothing. Only pass
#'       an argument to \code{sCVd}.}
#'     \item{By gene expression summary statistic:}{A character vector of length 
#'       one referring to a column name in \code{ClustGeneStats(sCVd)}: 
#'       \code{MGE}, \code{DR}, or \code{MDGE}.}
#'     \item{By distance between centroids in a cell embedding:}{A matrix 
#'       representing the cell embedding, where cells are rows and dimensions
#'       are columns. See \code{\link{getEmb}} to easily extract this from your
#'       input data object.}
#'   }
#'   
#' @return A matrix of distances between clusters for each cluster resolution.
#'   Interpretable by \code{\link[stats]{as.dist}} to generate a \code{dist}
#'   object.
#'
#' @seealso \code{\link{DEdistNN}} for a method to find nearest neighbouring
#'   clusters from \code{DEdist} results.
#'
#' @examples
#' \dontrun{
#' numDE_bw_clusts <- DEdist(sCVd=your_sCV_obj,0.01)
#' DEdistNN(numDE_bw_clusts)
#' }
#'
#' @name DEdist
#'
#' @export
#' 

setGeneric("DEdist",function(sCVd,y) standardGeneric("DEdist"))


#' @describeIn DEdist By number of differentially-expressed genes (see
#'   \code{\link{fx_calcDist_numDE}})
#' @export

setMethod("DEdist",signature("sCVdata","numeric"),
          function(sCVd,y) fx_calcDist_numDE(deVS=DEcombn(sCVd),FDRthresh=y))


#' @describeIn DEdist By differential expression significance score (see
#'   \code{\link{fx_calcDist_scoreDE}})
#' @export

setMethod("DEdist",signature("sCVdata","missing"),
          function(sCVd,y) fx_calcDist_scoreDE(DEcombn(sCVd)))


#' @describeIn DEdist By gene expression summary statistic (see
#'   \code{\link{fx_calcDist_geneExpr}})
#' @export

setMethod("DEdist",signature("sCVdata","character"),
          function(sCVd,y) fx_calcDist_geneExpr(CGS=ClustGeneStats(sCVd),metric=y))


#' @describeIn DEdist By distance between centroids in a cell embedding (see
#'   \code{\link{fx_calcDist_cellEmb}})
#' @export

setMethod("DEdist",signature("sCVdata","matrix"),
          function(sCVd,y) fx_calcDist_cellEmb(Clusters(sCVd),y))


# DEdistNN ----

#' Determine nearest neighbouring clusters
#'
#' Finds nearest neighbouring clusters from output of \code{\link{DEdist}}.
#'
#' @param d Output of \code{\link{DEdist}}
#'
#' @return A named integer vector where each name is a cluster and element is
#'   the position of its nearest neighbouring cluster in \code{Clusters(sCVd)}.
#'
#' @examples
#' \dontrun{
#' numDE_bw_clusts <- DEdist(sCVd=your_sCV_obj,0.01)
#' DEdistNN(numDE_bw_clusts)
#' }
#'
#' @name DEdistNN
#'
#' @export
#' 

setGeneric("DEdistNN",function(x) standardGeneric("DEdistNN"))


#' @describeIn DEdistNN Nearest neighbours from DEdist vector.
#' @export

setMethod("DEdistNN",signature("numeric"),
          function(x) 
            sapply(unique(unlist(strsplit(names(x),"-"))),
                   function(Y) 
                     as.integer(sub(paste0("-?",Y,"-?"),"",
                                    names(which.min(x[grep(Y,names(x))]))))))


#' @describeIn DEdistNN Nearest neighbours from DEdist matrix.
#' @export

setMethod("DEdistNN",signature("matrix"),
          function(x) apply(x,1,function(X) names(which.min(X))))


# DEneighb ----

#' Internal function to find DE genes between nearest neighbouring clusters
#'
#' Internal function. See \code{\link{DEneighb}}.
#'
#' Identifies the significantly differentially expressed genes between nearest
#' neighbouring clusters, using the pairwise differential gene expression
#' testing results.
#'
#' @param deVS List of pairwise DE results from \code{\link{CalcDEcombn}}.
#' @param NN A vector of nearest neighbours from \code{\link{DEdistNN}}.
#' @param FDRthresh False discovery rate for counting significance.
#'
#' @return A named list of data frames, one entry for each cluster. Each data
#'   frame is the results of the gene expression test comparing the cluster to
#'   its nearest neighbour. See \code{\link{CalcDEcombn}} and
#'   \code{\link{DEcombn}} for details.
#'   


fx_calcNeighb <- function(deVS,NN,FDRthresh) {
  nb <- rbind(paste(names(NN),NN,sep="-"),
              paste(NN,names(NN),sep="-"))
  colnames(nb) <- names(NN)
  nbd <- apply(nb,2,function(X) X %in% names(deVS))
  nb <- nb[nbd]
  nbd <- apply(nbd,2,which)
  nbd[nbd == 2] <- -1
  if (missing(FDRthresh)) { FDRthresh <- 1 }
  deN <- sapply(seq_along(nb),function(i) {
    temp <- which(deVS[[nb[i]]]$FDR <= FDRthresh &
                    deVS[[nb[i]]]$logGER * nbd[i] > 0)
    out <- deVS[[nb[i]]][temp,names(deVS[[nb[i]]]) != "overThreshold"] # backwards compatibility
    names(out) <- paste(names(out),paste(names(NN),NN,sep="-")[i],sep="_")
    return(out)
  },simplify=F)
  names(deN) <- names(NN)
  return(deN)
}


#' Find differentially expressed genes between nearest neighbouring clusters
#'
#' Identifies the significantly differentially expressed genes between nearest
#' neighbouring clusters, using the pairwise differential gene expression
#' testing results.
#'
#' @param sCVd An sCVdata object.
#' @param FDRthresh False discovery rate for counting significance.
#'
#' @return A named list of data frames, one entry for each cluster. Each data
#'   frame is the results of the gene expression test comparing the cluster to
#'   its nearest neighbour. See \code{\link{CalcDEcombn}} and
#'   \code{\link{DEcombn}} for details.
#'
#' @name DEneighb
#'
#' @export
#' 

setGeneric("DEneighb",function(sCVd,FDRthresh) standardGeneric("DEneighb"))


#' @describeIn DEneighb Find DE genes between nearest neighbours from sCVdata
#' @export

setMethod("DEneighb","sCVdata",function(sCVd,FDRthresh) fx_calcNeighb(deVS=DEcombn(sCVd),
                                                                      NN=DEdistNN(DEdist(sCVd)),
                                                                      FDRthresh=FDRthresh))


# DEmarker ----

#' Internal function to find marker genes for each cluster
#'
#' Internal function. See \code{\link{DEmarker}}.
#'
#' Identifies the significantly positively differentially expressed genes
#' between a cluster and all other clusters, using the pairwise differential
#' gene expression testing results.
#'
#' @param deVS List of pairwise DE results from \code{\link{CalcDEcombn}}.
#' @param FDRthresh False discovery rate for counting significance.
#'
#' @return A named list of data frames, one entry for each cluster. Each data
#'   frame is the combined results of the set of pairwise gene expression tests
#'   comparing the cluster to each other cluster in the data. See
#'   \code{\link{CalcDEcombn}} and \code{\link{DEcombn}} for details.
#'   

fx_calcMarker <- function(deVS,FDRthresh) {
  combosL <- sapply(unique(unlist(strsplit(names(deVS),"-"))),
                    function(clust) {
                      temp <- sapply(strsplit(names(deVS),"-"),
                                     function(comp) 
                                       which(comp == clust))
                      out <- unlist(temp)
                      out[out == 2] <- -1
                      names(out) <- which(sapply(temp,length) > 0)
                      return(out)
                    },
                    simplify=F)
  if (missing(FDRthresh)) { FDRthresh <- 1 }
  mNames <- sapply(combosL,function(comp) {
    cn <- as.integer(names(comp))
    if ("Wstat" %in% colnames(deVS[[1]])) {
      return(Reduce(intersect,sapply(seq_along(comp),function(i) {
        tempW <- deVS[[cn[i]]]$Wstat - deVS[[cn[i]]]$Wstat[which.max(deVS[[cn[i]]]$FDR)]
        rownames(deVS[[cn[i]]])[which(deVS[[cn[i]]]$FDR <= FDRthresh & tempW * comp[i] > 0)]
      },simplify=F)))
    } else {
      return(Reduce(intersect,sapply(seq_along(comp),function(i) {
        rownames(deVS[[cn[i]]])[which(deVS[[cn[i]]]$FDR <= FDRthresh & 
                                             deVS[[cn[i]]]$logGER * comp[i] > 0)]
      },simplify=F)))
    }
  },simplify=F)
  deM <- sapply(seq_along(mNames),function(i) {
    do.call(cbind,lapply(names(combosL[[i]]),function(l) {
      L <- as.integer(l)
      temp <- deVS[[L]][mNames[[i]],names(deVS[[L]]) != "overThreshold"] # backwards compatibility
      if (combosL[[i]][l] == -1) {
        temp$logGER <- temp$logGER * combosL[[i]][l]
        temp$dDR <- temp$dDR * combosL[[i]][l]
        tempN <- paste(strsplit(names(deVS)[L],split="-")[[1]][c(2,1)],collapse="-")
        names(temp) <- paste(names(temp),tempN,sep="_")
      } else {
        names(temp) <- paste(names(temp),names(deVS)[L],sep="_")
      }
      return(temp)
    }))
  },simplify=F)
  names(deM) <- names(mNames)
  return(deM)
}


#' Find marker genes for each cluster
#'
#' Identifies the significantly positively differentially expressed genes
#' between a cluster and all other clusters, using the pairwise differential
#' gene expression testing results.
#'
#' @param sCVd An sCVdata object.
#' @param FDRthresh False discovery rate for counting significance.
#'
#' @return A named list of data frames, one entry for each cluster. Each data
#'   frame is the combined results of the set of pairwise gene expression tests
#'   comparing the cluster to each other cluster in the data. See
#'   \code{\link{CalcDEcombn}} and \code{\link{DEcombn}} for details.
#'
#' @name DEmarker
#'
#' @export
#' 

setGeneric("DEmarker",function(sCVd,FDRthresh) standardGeneric("DEmarker"))


#' @describeIn DEmarker Find marker genes from sCVdata
#' @export

setMethod("DEmarker","sCVdata",function(sCVd,FDRthresh) fx_calcMarker(deVS=DEcombn(sCVd),
                                                                      FDRthresh=FDRthresh))
BaderLab/scClustViz documentation built on Sept. 10, 2023, 11:51 p.m.