R/runDEAnalysis.R
In singleCellTK: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

Documented in .formatDEAList runANOVA runDEAnalysis runDESeq2 runLimmaDE runMAST

#' Helper function for differential expression analysis methods that accepts
#' multiple ways of conditional subsetting and returns stable index format.
#' Meanwhile it does all the input checkings.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object. Required.
#' @param useAssay character. A string specifying which assay to use. Required.
#' @param index1 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. Specifies
#' which cells are of interests. Default \code{NULL}.
#' @param index2 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. specifies
#' the control group against those specified by \code{index1}. If
#' \code{NULL} when using index specification, \code{index1} cells will be
#' compared with all other cells. Default \code{NULL}.
#' @param class A vector/factor with \code{ncol(inSCE)} elements, or a character
#' scalar that specifies a column name of \code{colData(inSCE)}. Default
#' \code{NULL}.
#' @param classGroup1 a vector specifying which "levels" given in \code{class}
#' are of interests. Default \code{NULL}.
#' @param classGroup2 a vector specifying which "levels" given in \code{class}
#' is the control group against those specified by \code{classGroup1}. If
#' \code{NULL} when using annotation specification, \code{classGroup1} cells
#' will be compared with all other cells.
#' @param groupName1 A character scalar naming the group of interests. Required.
#' @param groupName2 A character scalar naming the control group. Required.
#' @param analysisName A character scalar naming the DEG analysis. Required
#' @param covariates A character vector of additional covariates used in linear
#' regression methods such as Limma and DESeq2. Default \code{NULL}
#' @param overwrite A logical scalar. Whether to overwrite result if exists.
#' Default \code{FALSE}.
#' @return A list object with part of formatted DE analysis information
#' @author Yichen Wang
.formatDEAList <- function(inSCE, useAssay, index1 = NULL, index2 = NULL,
                           class = NULL, classGroup1 = NULL,
                           classGroup2 = NULL, groupName1, groupName2,
                           analysisName, covariates = NULL, overwrite = FALSE){
    # Input checks
    if(!inherits(inSCE, "SingleCellExperiment")){
        stop('"inSCE" should be a SingleCellExperiment inherited Object.')
    }
    if(!useAssay %in% SummarizedExperiment::assayNames(inSCE)){
        stop(paste('"useAssay" name: ', useAssay, ' not found.'))
    }
    if(is.null(index1) && (is.null(classGroup1) || is.null(class))){
        stop('At least "index1" or "classGroup1" should be specified.')
    } else if(!is.null(index1) && (!is.null(classGroup1) || !is.null(class))){
        stop('Only one of "index" and "class"/"classGroup1" ',
             'should be specified.')
    }
    if(!is.null(covariates) &&
       !all(covariates %in% names(SummarizedExperiment::colData(inSCE)))){
        stop("Not all specified covariates exist.")
    }
    if ("diffExp" %in% names(S4Vectors::metadata(inSCE))){
        if(analysisName %in% names(S4Vectors::metadata(inSCE)$diffExp)){
            if(isTRUE(overwrite)){
                warning("analysisName '", analysisName, "' already exists, ",
                        "will be overwritten.")
            } else {
                stop("analysisName '", analysisName, "' already exists. ",
                     "Set `overwrite` to `TRUE` to overwrite.")
            }
        }
    }
    groupNames <- c(groupName1, groupName2)
    annotation <- c(class, covariates)
    if(!is.null(index1)){
        cells1 <- colnames(inSCE[,index1])
        if(!is.null(index2)){
            cells2 <- colnames(inSCE[,index2])
        } else {
            cells2 <- sort(setdiff(colnames(inSCE), cells1))
        }
    } else {
        if(length(class) == 1 && inherits(class, "character")){
            if(!class %in% names(SummarizedExperiment::colData(inSCE))){
                stop("class: '", class, "' not found.")
            }
            class <- SummarizedExperiment::colData(inSCE)[[class]]
        } else {
            if(!length(class) == ncol(inSCE)){
                stop("Length of given `class` vector should equal to ",
                     "`ncol(inSCE)`; Or specify a column name of ",
                     "`colData(inSCE)`")
            }
        }
        uniqCats <- unique(as.vector(class))
        index1 <- class %in% classGroup1
        if(is.null(classGroup2)){
            index2 <- !class %in% classGroup1
        } else {
            index2 <- class %in% classGroup2
        }
        cells1 <- colnames(inSCE[,index1])
        cells2 <- colnames(inSCE[,index2])
    }
    if(length(cells1) == 0){
        stop("Number of cells selected for group1 equals to zero.")
    }
    if(length(cells2) == 0){
        stop("Number of cells selected for group2 equals to zero.")
    }
    ix1 <- colnames(inSCE) %in% cells1
    ix2 <- colnames(inSCE) %in% cells2
    isec <- intersect(which(ix1), which(ix2))
    if (length(isec) > 0) {
      stop("Cell(s) selected for both conditions: ",
           paste(colnames(inSCE)[isec], collapse = ", "))
    }
    select <- list(ix1 = ix1, ix2 = ix2)
    if(!is.null(covariates)){
        for (c in covariates){
            col <- SummarizedExperiment::colData(inSCE)[(ix1 | ix2),c]
            if(length(unique(as.vector(col))) < 2){
                stop("Less than 2 levels in specified covariate: ", c)
            }
        }
    }
    return(
        list(useAssay = useAssay,
             groupNames = groupNames,
             select = select,
             annotation = annotation)
    )
}

#' Perform differential expression analysis on SCE with specified method
#' Method supported: 'MAST', 'DESeq2', 'Limma', 'ANOVA'
#' @param method A single character for specific method. Choose from
#' \code{"MAST"}, \code{"DESeq2"}, \code{"Limma"}, \code{"ANOVA"}. Default
#' \code{"MAST"}.
#' @param ... Other arguments passed to specific functions. Refer to
#' \code{\link{runMAST}}, \code{\link{runDESeq2}}, \code{\link{runLimmaDE}},
#' \code{\link{runANOVA}}
#' @examples
#' data(scExample, package = "singleCellTK")
#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
#' sce <- runDEAnalysis(inSCE = sce, groupName1 = "Sample1", method = "DESeq2",
#'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40,
#'  analysisName = "DESeq2")
#' @return Input SCE object with \code{metadata(inSCE)} updated with name
#' \code{"diffExp"} as a \code{list} object. Detail refers to the four child
#' functions.
#' @export
runDEAnalysis <- function(method = c('MAST', 'DESeq2', 'Limma', 'ANOVA'), ...){
    method <- match.arg(method)
    funcList <- list(MAST = runMAST,
                     DESeq2 = runDESeq2,
                     Limma = runLimmaDE,
                     ANOVA = runANOVA)
    funcList[[method]](...)
}

#' Perform differential expression analysis on SCE with DESeq2.
#'
#' Condition specification allows two methods:
#' 1. Index level selection. Arguments \code{index1} and \code{index2} will be
#' used.
#' 2. Annotation level selection. Arguments \code{class}, \code{classGroup1} and
#' \code{classGroup2} will be used.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object.
#' @param useAssay character. A string specifying which assay to use for the
#' DESeq2 regression. Default \code{"logcounts"}.
#' @param index1 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. Specifies
#' which cells are of interests. Default \code{NULL}.
#' @param index2 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. specifies
#' the control group against those specified by \code{index1}. If
#' \code{NULL} when using index specification, \code{index1} cells will be
#' compared with all other cells. Default \code{NULL}.
#' @param class A vector/factor with \code{ncol(inSCE)} elements, or a character
#' scalar that specifies a column name of \code{colData(inSCE)}. Default
#' \code{NULL}.
#' @param classGroup1 a vector specifying which "levels" given in \code{class}
#' are of interests. Default \code{NULL}.
#' @param classGroup2 a vector specifying which "levels" given in \code{class}
#' is the control group against those specified by \code{classGroup1}. If
#' \code{NULL} when using annotation specification, \code{classGroup1} cells
#' will be compared with all other cells.
#' @param analysisName A character scalar naming the DEG analysis. Required
#' @param groupName1 A character scalar naming the group of interests. Required.
#' @param groupName2 A character scalar naming the control group. Required.
#' @param covariates A character vector of additional covariates to use when
#' building the model. All covariates must exist in
#' \code{names(colData(inSCE))}. Default \code{NULL}.
#' @param fullReduced Whether to apply LRT (Likelihood ratio test) with a 'full'
#' model. Default \code{TRUE}.
#' @param onlyPos Whether to only output DEG with positive log2_FC value.
#' Default \code{FALSE}.
#' @param log2fcThreshold Only out put DEGs with the absolute values of log2FC
#' greater than this value. Default \code{0.25}
#' @param fdrThreshold Only out put DEGs with FDR value less than this
#' value. Default \code{0.05}
#' @param overwrite A logical scalar. Whether to overwrite result if exists.
#' Default \code{FALSE}.
#' @examples
#' data(scExample, package = "singleCellTK")
#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
#' sce <- runDESeq2(inSCE = sce, groupName1 = "Sample1",
#'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40,
#'  analysisName = "DESeq2")
#'
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{metadata(inSCE)$DESeq2} updated with the results: a list named by
#' \code{analysisName}, with \code{$groupNames} containing the naming of the
#' two conditions, \code{$useAssay} storing the assay name that was used for
#' calculation, \code{$select} storing the cell selection indices (logical) for
#' each condition, \code{$result} storing a \code{\link{data.frame}} of
#' the DEGs summary, and \code{$method} storing \code{"DESeq2"}.
#' @export
runDESeq2 <- function(inSCE, useAssay = 'counts', index1 = NULL,
                      index2 = NULL, class = NULL, classGroup1 = NULL,
                      classGroup2 = NULL, analysisName, groupName1,
                      groupName2, covariates = NULL, fullReduced = TRUE,
                      onlyPos = FALSE, log2fcThreshold = NULL,
                      fdrThreshold = 1, overwrite = FALSE){
    resultList <- .formatDEAList(inSCE, useAssay, index1, index2, class,
                                 classGroup1, classGroup2, groupName1,
                                 groupName2, analysisName, covariates,
                                 overwrite)
    ix1 <- resultList$select$ix1
    ix2 <- resultList$select$ix2
    subsetIdx <- (ix1 | ix2)
    conditions <- rep(NA, ncol(inSCE))
    conditions[ix1] <- 'cond1'
    conditions[ix2] <- 'cond2'
    conditions <- conditions[!is.na(conditions)]
    annotData <- data.frame(condition = conditions,
                            row.names = colnames(inSCE)[subsetIdx])
    cov <- SummarizedExperiment::colData(inSCE)[subsetIdx, covariates,
                                                drop = FALSE]
    annotData <- cbind(annotData, cov)
    mat <- SummarizedExperiment::assay(inSCE[,subsetIdx], useAssay)
    if(!inherits(mat, 'matrix')){
        mat <- as.matrix(mat)
    }
    mat <- featureNameDedup(mat)
    dds <- DESeq2::DESeqDataSetFromMatrix(
        countData = mat, colData = annotData,
        design = stats::as.formula(paste0("~", c('condition', covariates),
                                          collapse = "+"))
    )
    if(isTRUE(fullReduced)){
        dds <- DESeq2::DESeq(dds, test = "LRT", reduced = ~ 1)
    } else {
        dds <- DESeq2::DESeq(
            dds, test = "LRT",
            reduced = stats::as.formula(paste0('~condition', covariates,
                                               collapse = '+'))
        )
    }
    res <- DESeq2::results(dds, pAdjustMethod = 'fdr')
    deg <- data.frame(res)[,c(-1, -3, -4)]
    deg <- cbind(data.frame(Gene = rownames(deg), stringsAsFactors = FALSE),
                 deg)
    rownames(deg) <- NULL
    colnames(deg) <- c('Gene', 'Log2_FC', 'Pvalue', 'FDR')
    deg$Log2_FC <- - deg$Log2_FC # JNMLP
    # Result Filtration
    if(isTRUE(onlyPos)){
        deg <- deg[deg$Log2_FC > 0,]
    }
    if(!is.null(fdrThreshold)){
        deg <- deg[deg$FDR < fdrThreshold,]
    }
    if(!is.null(log2fcThreshold)){
        deg <- deg[abs(deg$Log2_FC) > log2fcThreshold,]
    }
    # Format output
    deg <- deg[order(deg$FDR, na.last = TRUE),]
    if (length(which(rowSums(is.na(deg)) > 2)) > 0) {
      deg <- deg[-which(rowSums(is.na(deg)) > 2),]
    }
    resultList$result <- deg
    resultList$method <- 'DESeq2'
    if ("diffExp" %in% names(S4Vectors::metadata(inSCE))){
        S4Vectors::metadata(inSCE)$diffExp[[analysisName]] <- resultList
    } else {
        S4Vectors::metadata(inSCE)$diffExp <- list()
        S4Vectors::metadata(inSCE)$diffExp[[analysisName]] <- resultList
    }
    return(inSCE)
}


#' Perform differential expression analysis on SCE with Limma.
#'
#' Condition specification allows two methods:
#' 1. Index level selection. Arguments \code{index1} and \code{index2} will be
#' used.
#' 2. Annotation level selection. Arguments \code{class}, \code{classGroup1} and
#' \code{classGroup2} will be used.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object.
#' @param useAssay character. A string specifying which assay to use for the
#' Limma regression. Default \code{"logcounts"}.
#' @param index1 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. Specifies
#' which cells are of interests. Default \code{NULL}.
#' @param index2 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. specifies
#' the control group against those specified by \code{index1}. If
#' \code{NULL} when using index specification, \code{index1} cells will be
#' compared with all other cells. Default \code{NULL}.
#' @param class A vector/factor with \code{ncol(inSCE)} elements, or a character
#' scalar that specifies a column name of \code{colData(inSCE)}. Default
#' \code{NULL}.
#' @param classGroup1 a vector specifying which "levels" given in \code{class}
#' are of interests. Default \code{NULL}.
#' @param classGroup2 a vector specifying which "levels" given in \code{class}
#' is the control group against those specified by \code{classGroup1}. If
#' \code{NULL} when using annotation specification, \code{classGroup1} cells
#' will be compared with all other cells.
#' @param analysisName A character scalar naming the DEG analysis. Required
#' @param groupName1 A character scalar naming the group of interests. Required.
#' @param groupName2 A character scalar naming the control group. Required.
#' @param covariates A character vector of additional covariates to use when
#' building the model. All covariates must exist in
#' \code{names(colData(inSCE))}. Default \code{NULL}.
#' @param onlyPos Whether to only output DEG with positive log2_FC value.
#' Default \code{FALSE}.
#' @param log2fcThreshold Only out put DEGs with the absolute values of log2FC
#' greater than this value. Default \code{0.25}
#' @param fdrThreshold Only out put DEGs with FDR value less than this
#' value. Default \code{0.05}
#' @param overwrite A logical scalar. Whether to overwrite result if exists.
#' Default \code{FALSE}.
#' @examples
#' data(scExample, package = "singleCellTK")
#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
#' library(scater)
#' sce <- logNormCounts(sce)
#' sce <- runLimmaDE(inSCE = sce, groupName1 = "Sample1",
#'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40,
#'  analysisName = "Limma")
#'
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{metadata(inSCE)$diffExp} updated with the results: a list named by
#' \code{analysisName}, with \code{$groupNames} containing the naming of the
#' two conditions, \code{$useAssay} storing the assay name that was used for
#' calculation, \code{$select} storing the cell selection indices (logical) for
#' each condition, \code{$result} storing a \code{\link{data.frame}} of
#' the DEGs summary, and \code{$method} storing \code{"Limma"}.
#' @export
runLimmaDE <- function(inSCE, useAssay = 'logcounts', index1 = NULL,
                       index2 = NULL, class = NULL, classGroup1 = NULL,
                       classGroup2 = NULL, analysisName, groupName1,
                       groupName2, covariates = NULL, onlyPos = FALSE,
                       log2fcThreshold = 0.25, fdrThreshold = 0.05,
                       overwrite = FALSE){
    resultList <- .formatDEAList(inSCE, useAssay, index1, index2, class,
                                 classGroup1, classGroup2, groupName1,
                                 groupName2, analysisName, covariates,
                                 overwrite)
    ix1 <- resultList$select$ix1
    ix2 <- resultList$select$ix2
    subsetIdx <- (ix1 | ix2)
    conditions <- rep(NA, ncol(inSCE))
    conditions[ix1] <- 'cond1'
    conditions[ix2] <- 'cond2'
    conditions <- conditions[!is.na(conditions)]
    annotData <- data.frame(condition = conditions,
                            row.names = colnames(inSCE)[subsetIdx])
    cov <- SummarizedExperiment::colData(inSCE)[subsetIdx, covariates,
                                                drop = FALSE]
    annotData <- cbind(annotData, cov)
    mat <- SummarizedExperiment::assay(inSCE[,subsetIdx], useAssay)
    if(!inherits(mat, 'matrix')){
        mat <- as.matrix(mat)
    }
    mat <- featureNameDedup(mat)
    design <- stats::model.matrix(
        stats::as.formula(paste0("~", paste0(c('condition', covariates),
                                             collapse = "+"))),
        data = annotData)
    fit <- limma::lmFit(mat, design)
    ebayes <- limma::eBayes(fit)
    deg <- limma::topTable(ebayes, adjust = 'fdr',
                           number = nrow(inSCE))
    deg <- deg[,c(-2, -3, -6)]
    deg <- cbind(data.frame(Gene = rownames(deg), stringsAsFactors = FALSE),
                 deg)
    rownames(deg) <- NULL
    colnames(deg) <- c("Gene", "Log2_FC", "Pvalue", "FDR")
    deg$Log2_FC <- - deg$Log2_FC # JNMLP
    # Result Filtration
    if(isTRUE(onlyPos)){
      deg <- deg[deg$Log2_FC > 0,]
    }
    if(!is.null(fdrThreshold)){
      deg <- deg[deg$FDR < fdrThreshold,]
    }
    if(!is.null(log2fcThreshold)){
      deg <- deg[abs(deg$Log2_FC) > log2fcThreshold,]
    }
    # Format output
    deg <- deg[order(deg$FDR, na.last = TRUE),]
    if (length(which(rowSums(is.na(deg)) > 2)) > 0) {
      deg <- deg[-which(rowSums(is.na(deg)) > 2),]
    }
    resultList$result <- deg
    resultList$method <- 'Limma'
    if ("diffExp" %in% names(S4Vectors::metadata(inSCE))){
      S4Vectors::metadata(inSCE)$diffExp[[analysisName]] <- resultList
    } else {
      S4Vectors::metadata(inSCE)$diffExp <- list()
      S4Vectors::metadata(inSCE)$diffExp[[analysisName]] <- resultList
    }
    return(inSCE)
}

#' Perform differential expression analysis on SCE with ANOVA
#'
#' Condition specification allows two methods:
#' 1. Index level selection. Arguments \code{index1} and \code{index2} will be
#' used.
#' 2. Annotation level selection. Arguments \code{class}, \code{classGroup1} and
#' \code{classGroup2} will be used.
#'
#' NOTE that ANOVA method does not produce Log2FC value, but P-value and FDR
#' only.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object.
#' @param useAssay character. A string specifying which assay to use for ANOVA.
#' Default \code{"logcounts"}.
#' @param index1 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. Specifies
#' which cells are of interests. Default \code{NULL}.
#' @param index2 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. specifies
#' the control group against those specified by \code{index1}. If
#' \code{NULL} when using index specification, \code{index1} cells will be
#' compared with all other cells. Default \code{NULL}.
#' @param class A vector/factor with \code{ncol(inSCE)} elements, or a character
#' scalar that specifies a column name of \code{colData(inSCE)}. Default
#' \code{NULL}.
#' @param classGroup1 a vector specifying which "levels" given in \code{class}
#' are of interests. Default \code{NULL}.
#' @param classGroup2 a vector specifying which "levels" given in \code{class}
#' is the control group against those specified by \code{classGroup1}. If
#' \code{NULL} when using annotation specification, \code{classGroup1} cells
#' will be compared with all other cells.
#' @param analysisName A character scalar naming the DEG analysis. Required
#' @param groupName1 A character scalar naming the group of interests. Required.
#' @param groupName2 A character scalar naming the control group. Required.
#' @param covariates A character vector of additional covariates to use when
#' building the model. All covariates must exist in
#' \code{names(colData(inSCE))}. Default \code{NULL}.
#' @param onlyPos Whether to only output DEG with positive log2_FC value.
#' Default \code{FALSE}.
#' @param log2fcThreshold Only out put DEGs with the absolute values of log2FC
#' greater than this value. Default \code{0.25}
#' @param fdrThreshold Only out put DEGs with FDR value less than this
#' value. Default \code{0.05}
#' @param overwrite A logical scalar. Whether to overwrite result if exists.
#' Default \code{FALSE}.
#' @examples
#' data(scExample, package = "singleCellTK")
#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
#' library(scater)
#' sce <- logNormCounts(sce)
#' sce <- runANOVA(inSCE = sce, groupName1 = "Sample1",
#'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40,
#'  analysisName = "ANOVA", fdrThreshold = NULL)
#'
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{metadata(inSCE)$diffExp} updated with the results: a list named by
#' \code{analysisName}, with \code{$groupNames} containing the naming of the
#' two conditions, \code{$useAssay} storing the assay name that was used for
#' calculation, \code{$select} storing the cell selection indices (logical) for
#' each condition, \code{$result} storing a \code{\link{data.frame}} of
#' the DEGs summary, and \code{$method} storing \code{"ANOVA"}.
#' @export
runANOVA <- function(inSCE, useAssay = 'logcounts', index1 = NULL,
                     index2 = NULL, class = NULL, classGroup1 = NULL,
                     classGroup2 = NULL, analysisName, groupName1,
                     groupName2, covariates = NULL, onlyPos = FALSE,
                     log2fcThreshold = 0.25, fdrThreshold = 0.05,
                     overwrite = FALSE){
    resultList <- .formatDEAList(inSCE, useAssay, index1, index2, class,
                                 classGroup1, classGroup2, groupName1,
                                 groupName2, analysisName, covariates,
                                 overwrite)

    ix1 <- resultList$select$ix1
    ix2 <- resultList$select$ix2
    subsetIdx <- (ix1 | ix2)
    conditions <- rep(NA, ncol(inSCE))
    conditions[ix1] <- 'cond1'
    conditions[ix2] <- 'cond2'
    conditions <- conditions[!is.na(conditions)]
    annotData <- data.frame(condition = conditions,
                            row.names = colnames(inSCE)[subsetIdx])
    cov <- SummarizedExperiment::colData(inSCE)[subsetIdx, covariates,
                                                drop = FALSE]
    annotData <- cbind(annotData, cov)
    if (is.null(covariates)) {
      mod <- stats::model.matrix(~condition, annotData)
      mod0 <- stats::model.matrix(~1, annotData)
    } else {
      mod <- stats::model.matrix(
        stats::as.formula(paste0("~", paste0(c("condition", covariates),
                                             collapse = "+"))),
        data = annotData)
      mod0 <- stats::model.matrix(
        stats::as.formula(paste0("~", paste0(covariates, collapse = "+"))),
        data = annotData)
    }
    dat <- as.matrix(SummarizedExperiment::assay(inSCE[,subsetIdx], useAssay))
    if(!inherits(dat, 'matrix')){
        dat <- as.matrix(dat)
    }
    dat <- featureNameDedup(dat)
    n <- dim(dat)[2]
    m <- dim(dat)[1]
    df1 <- dim(mod)[2]
    df0 <- dim(mod0)[2]
    p <- rep(0, m)
    Id <- diag(n)
    resid <- dat %*% (Id - mod %*% solve(t(mod) %*% mod) %*%
                        t(mod))
    resid0 <- dat %*% (Id - mod0 %*% solve(t(mod0) %*% mod0) %*%
                         t(mod0))
    rss1 <- resid ^ 2 %*% rep(1, n)
    rss0 <- resid0 ^ 2 %*% rep(1, n)
    fstats <- ((rss0 - rss1) / (df1 - df0)) / (rss1 / (n - df1))
    p <- 1 - stats::pf(fstats, df1 = (df1 - df0), df2 = (n - df1))
    deg <- data.frame(row.names = rownames(dat), p.value = p,
                      padj = stats::p.adjust(p, method = 'fdr'))
    deg <- cbind(data.frame(Gene = rownames(deg),
                            Log2_FC = NA, stringsAsFactors = FALSE),
                 deg)
    rownames(deg) <- NULL
    colnames(deg) <- c("Gene", "Log2_FC", "Pvalue", "FDR")
    # Result Filtration
    if(isTRUE(onlyPos)){
      warning("ANOVA method does not generate log2FC value. `onlyPos` ",
              "argument ignored")
    }
    if(!is.null(fdrThreshold)){
      deg <- deg[deg$FDR < fdrThreshold,]
    }
    if(!is.null(log2fcThreshold)){
      warning("ANOVA method does not generate log2FC value. `log2fcThreshold` ",
              "argument ignored")
      #deg <- deg[abs(deg$Log2_FC) > log2fcThreshold,]
    }
    # Format output
    deg <- deg[order(deg$FDR, na.last = TRUE),]
    if (length(which(rowSums(is.na(deg)) > 2)) > 0) {
      deg <- deg[-which(rowSums(is.na(deg)) > 2),]
    }
    resultList$result <- deg
    resultList$method <- 'ANOVA'

    if ("diffExp" %in% names(S4Vectors::metadata(inSCE))){
      S4Vectors::metadata(inSCE)$diffExp[[analysisName]] <- resultList
    } else {
      S4Vectors::metadata(inSCE)$diffExp <- list()
      S4Vectors::metadata(inSCE)$diffExp[[analysisName]] <- resultList
    }
    return(inSCE)
}

#' Perform differential expression analysis on SCE with MAST
#'
#' Condition specification allows two methods:
#' 1. Index level selection. Arguments \code{index1} and \code{index2} will be
#' used.
#' 2. Annotation level selection. Arguments \code{class}, \code{classGroup1} and
#' \code{classGroup2} will be used.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object.
#' @param useAssay character. A string specifying which assay to use for MAST
#' Default \code{"logcounts"}.
#' @param index1 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. Specifies
#' which cells are of interests. Default \code{NULL}.
#' @param index2 Any type of indices that can subset a
#' \linkS4class{SingleCellExperiment} inherited object by cells. specifies
#' the control group against those specified by \code{index1}. If
#' \code{NULL} when using index specification, \code{index1} cells will be
#' compared with all other cells. Default \code{NULL}.
#' @param class A vector/factor with \code{ncol(inSCE)} elements, or a character
#' scalar that specifies a column name of \code{colData(inSCE)}. Default
#' \code{NULL}.
#' @param classGroup1 a vector specifying which "levels" given in \code{class}
#' are of interests. Default \code{NULL}.
#' @param classGroup2 a vector specifying which "levels" given in \code{class}
#' is the control group against those specified by \code{classGroup1}. If
#' \code{NULL} when using annotation specification, \code{classGroup1} cells
#' will be compared with all other cells.
#' @param analysisName A character scalar naming the DEG analysis. Required
#' @param groupName1 A character scalar naming the group of interests. Required.
#' @param groupName2 A character scalar naming the control group. Required.
#' @param covariates A character vector of additional covariates to use when
#' building the model. All covariates must exist in
#' \code{names(colData(inSCE))}. Default \code{NULL}.
#' @param onlyPos Whether to only output DEG with positive log2_FC value.
#' Default \code{FALSE}.
#' @param log2fcThreshold Only out put DEGs with the absolute values of log2FC
#' greater than this value. Default \code{0.25}
#' @param fdrThreshold Only out put DEGs with FDR value less than this
#' value. Default \code{0.05}
#' @param overwrite A logical scalar. Whether to overwrite result if exists.
#' Default \code{FALSE}.
#' @param check_sanity Logical scalar. Whether to perform MAST's sanity check
#' to see if the counts are logged. Default \code{TRUE}.
#' @examples
#' data(scExample, package = "singleCellTK")
#' sce <- subsetSCECols(sce, colData = "type != 'EmptyDroplet'")
#' logcounts(sce) <- log(counts(sce) + 1)
#' sce <- runMAST(inSCE = sce, groupName1 = "Sample1",
#'  groupName2 = "Sample2", index1 = 1:20, index2 = 21:40,
#'  analysisName = "MAST")
#'
#' @return The input \linkS4class{SingleCellExperiment} object with
#' \code{metadata(inSCE)$diffExp} updated with the results: a list named by
#' \code{analysisName}, with \code{$groupNames} containing the naming of the
#' two conditions, \code{$useAssay} storing the assay name that was used for
#' calculation, \code{$select} storing the cell selection indices (logical) for
#' each condition, \code{$result} storing a \code{\link{data.frame}} of
#' the DEGs summary, and \code{$method} storing \code{"MAST"}.
#' @export
runMAST <- function(inSCE, useAssay = 'logcounts', index1 = NULL,
                    index2 = NULL, class = NULL, classGroup1 = NULL,
                    classGroup2 = NULL, analysisName, groupName1,
                    groupName2, covariates = NULL, onlyPos = FALSE,
                    log2fcThreshold = NULL, fdrThreshold = 0.05,
                    overwrite = FALSE, check_sanity = TRUE){
    resultList <- .formatDEAList(inSCE, useAssay, index1, index2, class,
                                 classGroup1, classGroup2, groupName1,
                                 groupName2, analysisName, covariates,
                                 overwrite)

    ix1 <- resultList$select$ix1
    ix2 <- resultList$select$ix2
    cells1 <- which(ix1)
    cells2 <- which(ix2)
    subsetIdx <- (ix1 | ix2)
    ## Extract
    mat <-
      SummarizedExperiment::assay(inSCE[,subsetIdx], useAssay)
    if(!inherits(mat, 'matrix')){
      mat <- as.matrix(mat)
    }
    mat <- featureNameDedup(mat)
    cond <- rep(NA, ncol(inSCE))
    cond[ix1] <- 'c1'
    cond[ix2] <- 'c2'
    cond <- cond[subsetIdx]
    cdat <- data.frame(wellKey = colnames(mat),
                       condition = as.factor(cond),
                       ngeneson = rep("", (length(cells1) + length(cells2))),
                       stringsAsFactors = FALSE)
    covariateDat <-
      data.frame(SummarizedExperiment::colData(inSCE[,subsetIdx])[,
                                                                  covariates,
                                                                  drop = FALSE])
    cdat <- cbind(cdat, covariateDat)
    sca <- MAST::FromMatrix(mat, cdat, check_sanity = check_sanity)
    cdr2 <- colSums(SummarizedExperiment::assay(sca) > 0)
    SummarizedExperiment::colData(sca)$cngeneson <- scale(cdr2)
    cond <- factor(SummarizedExperiment::colData(sca)$condition)
    cond <- stats::relevel(cond, "c2")
    SummarizedExperiment::colData(sca)$condition <- cond
    # Calculation
    sca <- sca[which(MAST::freq(sca) > 0),]
    invisible(utils::capture.output(thresh <-
        MAST::thresholdSCRNACountMatrix(SummarizedExperiment::assay(sca),
                                        nbins = 20, min_per_bin = 30)))
    SummarizedExperiment::assays(sca) <-
        list(thresh = thresh$counts_threshold,
             tpm = SummarizedExperiment::assay(sca))
    SummarizedExperiment::colData(sca)$cngeneson <-
        scale(colSums(SummarizedExperiment::assay(sca) > 0))
    if(all(is.na(SummarizedExperiment::colData(sca)$cngeneson))){
        SummarizedExperiment::colData(sca)$cngeneson <- 0
    }
    zlmCond <- MAST::zlm(
        stats::as.formula(
            paste0("~", paste(c("condition", "cngeneson", covariates),
                              collapse = '+'))),
        sca)
    summaryCond <- MAST::summary(zlmCond, doLRT = "conditionc1")
    summaryDt <- summaryCond$datatable
    fcHurdle <- merge(summaryDt[summaryDt$contrast == "conditionc1" &
                                    summaryDt$component == "H",
                                c('primerid', 'Pr(>Chisq)')],
                      summaryDt[summaryDt$contrast == "conditionc1" &
                                    summaryDt$component == "logFC",
                                c('primerid', 'coef', 'ci.hi', 'ci.lo')],
                      by = "primerid")
    fcHurdle$fdr <- stats::p.adjust(fcHurdle$`Pr(>Chisq)`, "fdr")
    fcHurdleSig <- merge(fcHurdle,
                         data.table::as.data.table(S4Vectors::mcols(sca)),
                         by = "primerid")
    if(!is.null(log2fcThreshold)){
      fcHurdleSig <- fcHurdleSig[abs(fcHurdleSig$coef) > log2fcThreshold,]
    }
    if(isTRUE(onlyPos)){
      fcHurdleSig <- fcHurdleSig[fcHurdleSig$coef > 0,]
    }
    if(!is.null(fdrThreshold)){
      fcHurdleSig <- fcHurdleSig[fcHurdleSig$fdr < fdrThreshold,]
    }
    fcHurdleSig <- fcHurdleSig[, -c(4, 5)]
    names(fcHurdleSig)[c(1, 2, 3, 4)] <- c("Gene", "Pvalue",
                                           "Log2_FC", "FDR")
    fcHurdleSig <- fcHurdleSig[order(fcHurdleSig$FDR, na.last = TRUE),
    ]
    # Format output
    deg <- as.data.frame(fcHurdleSig)
    if (length(which(rowSums(is.na(deg)) > 2)) > 0) {
      deg <- deg[-which(rowSums(is.na(deg)) > 2),]
    }
    resultList$result <- deg
    resultList$method <- 'MAST'

    if ("diffExp" %in% names(S4Vectors::metadata(inSCE))){
        S4Vectors::metadata(inSCE)$diffExp[[analysisName]] <- resultList
    } else {
        S4Vectors::metadata(inSCE)$diffExp <- list()
        S4Vectors::metadata(inSCE)$diffExp[[analysisName]] <- resultList
    }
    return(inSCE)
}
Any scripts or data that you put into this service are public.
singleCellTK documentation built on Nov. 8, 2020, 5:21 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
singleCellTK
Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

R/runDEAnalysis.R
In singleCellTK: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

Defines functions runMAST runANOVA runLimmaDE runDESeq2 runDEAnalysis .formatDEAList

Documented in .formatDEAList runANOVA runDEAnalysis runDESeq2 runLimmaDE runMAST

Try the singleCellTK package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

singleCellTK Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

R/runDEAnalysis.R In singleCellTK: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

Defines functions runMAST runANOVA runLimmaDE runDESeq2 runDEAnalysis .formatDEAList

Documented in .formatDEAList runANOVA runDEAnalysis runDESeq2 runLimmaDE runMAST

Try the singleCellTK package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

singleCellTK
Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

R/runDEAnalysis.R
In singleCellTK: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data