ICAS:

Documented in FindAllMarkers FindMarkers

#' SJ expression markers of identity classes
#'
#' Finds markers (differentially expressed SJs) for identity classes
#' @title FindMarkers
#' @name FindMarkers
#' @param object ICASDataSet object
#' @param ident.1 Identity class to define markers for
#' @param ident.2 A second identity class for comparison. If NULL (default) -
#' use all other cells for comparison.
#' @param genes.use SJs to test. Default is to use all SJs
#' @param delta.threshold Limit testing to SJs which show, on average, at least
#' delta.threshold between the two groups of samples. Default is 0.1
#' Increasing delta.threshold speeds up the function, but can miss weaker signals.
#' @param test.use Denotes which test to use. Available options are:
##' \itemize{
##'  \item{"wilcox"} : Wilcoxon rank sum test
##'  \item{"bimod"} : Likelihood-ratio test for SJ expression,
##'  (McDavid et al., Bioinformatics, 2013)
##'  \item{"roc"} : Standard AUC classifier
##'  \item{"t"} : Student's t-test
##'  \item{"WD"} : waldtest of variance model
##'  \item{"tobit"} : Tobit-test for differential SJ expression (Trapnell et
##'  al., Nature Biotech, 2014) (default)
##'  \item{"poisson"} : Likelihood ratio test assuming an underlying poisson
##'   distribution. Use only for UMI-based datasets
##'  \item{"negbinom"} :  Likelihood ratio test assuming an underlying negative
##'  binomial distribution. Use only for UMI-based datasets
##'  \item{"MAST"} : GLM-framework that treates cellular detection rate as a
##'  covariate (Finak et al, Genome Biology, 2015)
##'  \item{"Hyper"} : DE based on Hypergeometric test
##'  \item{"BB"} : DE based on a generalized linear models using betabinomial distribution
##' }
#' @param min.pct  only test SJs that are detected in a minimum fraction of
#' min.pct cells in either of the two populations. Meant to speed up the function
#' by not testing SJs that are very infrequently expressed. Default is 0.1
#' @param min.diff.pct  only test SJs that show a minimum difference in the
#' fraction of detection between the two groups. Set to -Inf by default
#' @param only.pos Only return positive markers (FALSE by default)
#' @param print.bar Print a progress bar once expression testing begins (uses
#' pbapply to do this)
#' @param max.cells.per.ident Down sample each identity class to a max number.
#' Default is no downsampling. Not activated by default (set to Inf)
#' @param random.seed Random seed for downsampling
#' @param min.cells.gene Minimum number of cells expressing the SJ in at least one
#' of the two groups, currently only used for poisson and negative binomial tests
#' @param maxit (Only for test.use is BB) maximum number of (usually Fisher-scoring) iterations allowed.
#' Decreasing maxit speeds up the function, but can weaken statistical reliability.
#' @param confounder (Only for test.use is BB) The confounder to regress out.
#' @param min.cells.group Minimum number of cells in one of the groups
#' @param NT cores for parallel (Currently only support roc test)
#' @param \dots Additional parameters to pass to specific DE functions
#' @seealso \code{\link{MASTDETest}}, and \code{\link{DESeq2DETest}} for more information on these methods
#' @return Matrix containing a ranked list of putative markers, and associated
#' statistics (p-values, ROC score, etc.)
#' @details p-value adjustment is performed using bonferroni correction based on
#' the total number of SJs in the dataset. Other correction methods are not
#' recommended, as ICAS pre-filters SJs using the arguments above, reducing
#' the number of tests performed. Lastly, as Aaron Lun has pointed out, p-values
#' should be interpreted cautiously, as the SJs used for clustering are the
#' same SJs tested for differential expression.
#' @import pbapply
#' @importFrom lmtest lrtest
#' @importFrom Matrix rowMeans
#' @importFrom matrixStats rowSds
#'
#' @seealso \code{\link{NegBinomDETest}}
#' @seealso Seurat
#' @references Seurat
#'
#' @export
#'

globalVariables(names = 'deltaPSI', package = 'ICAS', add = TRUE)

FindMarkers <- function(
  object,
  ident.1,
  ident.2 = NULL,
  genes.use = NULL,
  delta.threshold = 0.1,
  test.use = "tobit",
  min.pct = 0.1,
  min.diff.pct = -Inf,
  print.bar = TRUE,
  only.pos = FALSE,
  max.cells.per.ident = Inf,
  random.seed = 1,
  min.cells.gene = 3,
  min.cells.group = 3,
  maxit = 10,
  confounder = NULL,
  NT = 1,
  ...
) {
  if(!is(object, "ICASDataSet"))
    stop("The object must be a ICASDataSet data")

  data.use <- psi(object = object)

  genes.use <- SetIfNull(x = genes.use, default = rownames(x = data.use))

  if(!all(genes.use %in% rownames(x = data.use))) {
    stop("genes.use are not in object@psi")
  }

  if(!ident.1 %in% levels(object@colData[[object@design]])) {
    stop("ident.1 are not in levels of design")
  }

  if(!is.null(ident.2)) {
    if(any(!ident.2 %in% levels(object@colData[[object@design]]))) {
      stop("ident.2 are not in levels of design")
    }
  }

  cells.1 <- WhichCells(object = object, ident = ident.1)

  # if NULL for ident.2, use all other cells
  if(is.null(ident.2)) {
    cells.2 <- setdiff(colnames(data.use), cells.1)
  } else {
    cells.2 <- unlist(lapply(ident.2, FUN = function(x) WhichCells(object = object, ident = x)))
  }

  # error checking
  if (length(x = cells.1) == 0) {
    message(paste("Cell group 1 is empty - no cells with identity class", ident.1))
    return(NULL)
  }
  if (length(x = cells.2) == 0) {
    message(paste("Cell group 2 is empty - no cells with identity class", ident.2))
    return(NULL)
  }
  if (length(cells.1) < min.cells.group) {
    stop(paste("Cell group 1 has fewer than", as.character(min.cells.group), "cells in identity class", ident.1))
  }
  if (length(cells.2) < min.cells.group) {
    stop(paste("Cell group 2 has fewer than", as.character(min.cells.group), " cells in identity class", ident.2))
  }

  if(is.null(NT)) {
    NT <- 1
  }

  if(NT < 1) {
    stop("parallel must be a positive interger")
  }

  # gene selection (based on percent expressed)
  thresh.min <- 0
  data.temp1 <- round(
    x = apply(
      X = data.use[genes.use, cells.1, drop = F],
      MARGIN = 1,
      FUN = function(x) {
        return(sum(x > thresh.min, na.rm = TRUE) / length(x = x))
        # return(length(x = x[x>thresh.min]) / length(x = x))
      }
    ),
    digits = 3
  )
  data.temp2 <- round(
    x = apply(
      X = data.use[genes.use, cells.2, drop = F],
      MARGIN = 1,
      FUN = function(x) {
        return(sum(x > thresh.min, na.rm = TRUE) / length(x = x))
        # return(length(x = x[x > thresh.min]) / length(x = x))
      }
    ),
    digits = 3
  )
  data.alpha <- cbind(data.temp1, data.temp2)
  colnames(x = data.alpha) <- c("pct.1","pct.2")
  alpha.min <- apply(X = data.alpha, MARGIN = 1, FUN = min)
  names(x = alpha.min) <- rownames(x = data.alpha)

  genes.use <- names(x = which(x = alpha.min > min.pct))
  if (length(x = genes.use) == 0) {
    stop("No SJs pass min.pct threshold")
  }
  alpha.diff <- apply(X = data.alpha, MARGIN = 1, FUN = max) - alpha.min
  genes.use <- names(
    x = which(x = alpha.min > min.pct & alpha.diff > min.diff.pct)
  )
  if (length(x = genes.use) == 0) {
    stop("No SJs pass min.diff.pct threshold")
  }

  #gene selection (based on average difference)

  data.1 <- Matrix::rowMeans(data.use[genes.use, cells.1, drop = F], na.rm = TRUE)
  data.2 <- Matrix::rowMeans(data.use[genes.use, cells.2, drop = F], na.rm = TRUE)

  sd.1 <- matrixStats::rowSds(data.use[genes.use, cells.1, drop = F], na.rm = TRUE)
  sd.2 <- matrixStats::rowSds(data.use[genes.use, cells.2, drop = F], na.rm = TRUE)

  data.mean <- data.frame(mean.1 = data.1, mean.2 = data.2, row.names = genes.use)
  data.sd <- data.frame(sd.1 = sd.1, sd.2 = sd.2, row.names = genes.use)

  total.diff <- data.1 - data.2

  if (!only.pos) genes.diff <- names(x = which(x = abs(x = total.diff) >= delta.threshold))
  if (only.pos) genes.diff <- names(x = which(x = total.diff >= delta.threshold))

  genes.use <- intersect(x = genes.use, y = genes.diff)
  if (length(x = genes.use) == 0) {
    stop("No SJs pass delta.threshold threshold")
  }

  if (max.cells.per.ident < Inf) {
    set.seed(seed = random.seed)
    if (length(cells.1) > max.cells.per.ident) cells.1 = sample(x = cells.1, size = max.cells.per.ident)
    if (length(cells.2) > max.cells.per.ident) cells.2 = sample(x = cells.2, size = max.cells.per.ident)
  }

  if (test.use == "bimod") {
    to.return <- DiffExpTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      NT = NT
    )
  }

  if (test.use == "roc") {
    to.return <- varImportances(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      NT = NT
    )
  }

  if (test.use == "t") {
    to.return <- DiffTTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      NT = NT
    )
  }

  if (test.use == "tobit") {
    to.return <- TobitTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      NT = NT
    )
  }

  if (test.use == "negbinom") {
    to.return <- NegBinomDETest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      min.cells = min.cells.gene,
      NT = NT
    )
  }

  if (test.use == "poisson") {
    to.return <- PoissonDETest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      min.cells = min.cells.gene,
      NT = NT
    )
  }

  if (test.use == "MAST") {
    to.return <- MASTDETest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      ...
    )
  }

  if (test.use == "wilcox") {
    to.return <- WilcoxDETest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      NT = NT,
      ...
    )
  }

  if (test.use == "WD") {
    to.return <- LRDETest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      NT = NT,
      ...
    )
  }

  if(test.use == "Hyper") {
    to.return <- phyperTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      NT = NT
    )
  }

  if(test.use == "BB") {
    to.return <- bbTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = print.bar,
      maxit = maxit,
      confounder = confounder,
      NT = NT
    )
  }

  #return results
  to.return[, "deltaPSI"] <- total.diff[rownames(x = to.return)]
  to.return <- cbind(to.return,
                     data.alpha[rownames(x = to.return), , drop = FALSE],
                     data.mean[rownames(x = to.return), , drop = FALSE],
                     data.sd[rownames(x = to.return), , drop = FALSE])

  if(test.use != "roc") {
    to.return$p_val_adj = p.adjust(p = to.return$p_val)
  }

  if (test.use == "roc") {
    to.return <- to.return[order(-to.return$Importance, -to.return$deltaPSI), ]
  } else {
    to.return <- to.return[order(to.return$p_val, -to.return$deltaPSI), ]
  }

  if (only.pos) {
    to.return <- subset(x = to.return, subset = deltaPSI > 0)
  }
  return(to.return)
}

globalVariables(
  names = c('myAUC', 'p_val', 'deltaPSI'),
  package = 'ICAS',
  add = TRUE
)

#' @title FindAllMarkers
#' SJ expression markers for all identity classes
#'
#' Finds markers (differentially expressed SJs) for each of the identity classes in a dataset
#'
#' @inheritParams FindMarkers
#' @param print.bar Print a progress bar once expression testing begins (uses pbapply to do this)
#' @param max.cells.per.ident Down sample each identity class to a max number. Default is no downsampling.
#' @param random.seed Random seed for downsampling
#' @param return.thresh Only return markers that have a p-value < return.thresh, or a power > return.thresh (if the test is ROC)
#' @param min.cells.gene Minimum number of cells expressing the SJ in at least one
#' of the two groups, currently only used for poisson and negative binomial tests
#' @param min.cells.group Minimum number of cells in one of the groups
#' @param \dots Additional parameters to pass to specific DE functions
#'
#' @return Matrix containing a ranked list of putative markers, and associated
#' statistics (p-values, ROC score, etc.)
#'
#' @export
#'
FindAllMarkers <- function(
  object,
  genes.use = NULL,
  delta.threshold = 0.1,
  test.use = "tobit",
  min.pct = 0.1,
  min.diff.pct = -Inf,
  print.bar = TRUE,
  only.pos = FALSE,
  max.cells.per.ident = Inf,
  return.thresh = 1e-2,
  random.seed = 1,
  maxit = 10,
  confounder = NULL,
  min.cells.gene = 3,
  min.cells.group = 3,
  NT = 1,
  ...
) {
  data.1 <- psi(object)
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = data.1))
  if ((test.use == "roc") && (return.thresh == 1e-2)) {
    return.thresh = 0.7
  }
  idents.all <- levels(colData(object)[[object@design]])

  genes.de <- list()
  for (i in seq_along(idents.all)) {
    genes.de[[i]] <- tryCatch(
      {
        FindMarkers(
          object = object,
          ident.1 = idents.all[i],
          ident.2 = NULL,
          genes.use = genes.use,
          delta.threshold = delta.threshold,
          test.use = test.use,
          min.pct = min.pct,
          min.diff.pct = min.diff.pct,
          print.bar = print.bar,
          min.cells.gene = min.cells.gene,
          min.cells.group = min.cells.group,
          max.cells.per.ident = max.cells.per.ident,
          maxit = maxit,
          confounder = confounder,
          NT = NT,
          ...
        )
      },
      error = function(cond){
        return(NULL)
      }
    )
  }

  gde.all <- data.frame()
  for (i in seq_along(idents.all)) {
    if (is.null(x = unlist(x = genes.de[i]))) {
      next
    }
    gde <- genes.de[[i]]
    if (nrow(x = gde) > 0) {
      if (test.use == "roc") {
        gde <- gde
      } else {
        gde <- gde[order(gde$p_val, -gde$deltaPSI), ]
        gde <- subset(x = gde, subset = p_val <= return.thresh)
      }
      if (nrow(x = gde) > 0) {
        gde$cluster <- idents.all[i]
        gde$gene <- rownames(x = gde)
      }
      if (nrow(x = gde) > 0) {
        gde.all <- rbind(gde.all, gde)
      }
    }
  }
  if ((only.pos) && nrow(gde.all) > 0) {
    return(subset(x = gde.all, subset = deltaPSI > 0))
  }
  rownames(x = gde.all) <- make.unique(names = as.character(x = gde.all$gene))
  if (nrow(gde.all) == 0) {
    warning("No DE SJs identified.")
  }
  return(gde.all)
}