R/post.R

Defines functions estimate_dbr_score top_genes de_basic de_ttest_all de_ttest summarize_clusters

Documented in de_basic de_ttest de_ttest_all estimate_dbr_score summarize_clusters top_genes

#' Output summary stats from a DIEM run
#'
#' Return summary statistics from a DIEM run to evaluate clustering 
#' and classification. The function calculates helpful statistics to 
#' evaluate each cluster, including the debris and putatitive cell 
#' types. This returns a data frame with the following 
#' summary stats per cluster:
#' \describe{
#' \item{Type}{Whether the cluster was considered as debris for 
#'  calculating the debris score.}
#' \item{Cluster}{Cluster for which summary stats are shown.}
#' \item{n_droplets}{Number of droplets.}
#' \item{avg_n_genes}{Average number of genes detected.}
#' \item{avg_n_counts}{Average number of counts.}
#' \item{avg_dbr_score}{Average debris score, if available.}
#' \item{genes}{Set of top \code{top_n} marker genes separated by 
#'  semicolons.}
#' }
#' The marker genes are calculated as the genes with the largest 
#' difference in proportion between the 
#'
#' @param x An SCE object.
#' @param top_n Number of top DE genes to return in column.
#'
#' @return A data frame
#'
#' @export
summarize_clusters <- function(x, top_n = 20){
    testdat <- droplet_data(x, type = "test")
    dropdat <- droplet_data(x, type = "all")
    dbr_clust <- x@debris_clusters


    if ( ! "Cluster" %in% colnames(testdat))
        stop("Run DIEM and ", sQuote("assign_clusters"), " before summarize")

    dropdat[,"Cluster"] <- "1"
    dropdat[rownames(testdat),"Cluster"] <- testdat[,"Cluster"]
    clusters_all <- sort(as.numeric(unique(dropdat[,"Cluster"])))
    clusters_all <- as.character(clusters_all)

    means <- Alpha(x)
    means <- means[,clusters_all]
    if (is.null(means))
        stop("Estimate parameters before ", sQuote("summarize_clusters"))

    means <- sweep(means, 2, colSums(means), "/")


    # Measures
    cs <- rep("Clean", ncol(means))
    cs[1] <- "Debris"
    ndrops <- sapply(clusters_all, function(ct){
                     sum(testdat[,"Cluster"] %in% ct) })
    ngenes <- sapply(clusters_all, function(ct){
                     ci <- which(testdat[,"Cluster"] %in% ct)
                     tdc <- testdat[ci, , drop = FALSE]
                     mean(tdc[, "n_genes"], na.rm = TRUE)})
    ncounts <- sapply(clusters_all, function(ct){
                      ci <- which(testdat[,"Cluster"] %in% ct)
                      tdc <- testdat[ci, , drop = FALSE]
                      mean(tdc[, "total_counts"], na.rm = TRUE) })
    if ("score.debris" %in% colnames(testdat)){
        cs[clusters_all %in% dbr_clust] <- "Debris"
        ds <- tapply(testdat[,"score.debris"], 
                     testdat[,"Cluster"],
                     mean, na.rm = TRUE)
    } else {
        ds <- rep(NA, length(clusters_all))
        names(ds) <- clusters_all
    }

    ret <- data.frame("Type" = cs, 
                      "Cluster" = clusters_all, 
                      "n_droplets" = ndrops[clusters_all], 
                      "avg_n_genes" = ngenes[clusters_all],
                      "avg_n_counts" = ncounts[clusters_all], 
                      "avg_dbr_score" = ds[clusters_all]) 

    # Get DE top geneu
    de <- top_genes(x, top_n = top_n)

    gene_ids <- sapply(clusters_all, function(ct){
                       ci <- which(de[,"Cluster"] %in% ct)
                       des <- de[ci, , drop=FALSE]
                       genes <- paste(des[,"gene"], collapse=";")
                       return(genes)
                      })

    ret[,"genes"] <- gene_ids

    return(ret)
}

#' Run a DE between two groups with Welch's t-test
#'
#' Run DE between two groups. \code{x} is a sparse matrix with 
#' count data. These counts can be normalized, or raw counts 
#' can be given. If providiing raw counts, set the \code{normalize} 
#' parameter to TRUE. \code{c1} 
#' and \code{c2} are vectors containing indices or names 
#' of the columns, or boolean indicies for the droplets 
#" that belong to group 1 and 2, respectively. 
#' A Welch's t-test is performed.
#'
#' @param x A sparse matrix
#' @param c1 Column indexes or names of first group
#' @param c2 Column indexes or names of second group
#' @param normalize Whether to scale the droplets to sum to \code{sf} and 
#'  and then log transform.
#' @param sf Scale the counts in each droplet to sum to this value.
#' 
#' @return A data frame of DE results with
#' \describe{
#' \item{gene}{gene name}
#' \item{diff}{difference between mu1 and mu2}
#' \item{logFC}{log2 fold change}
#' \item{mu1}{mean of group 1}
#' \item{mu2}{mean of group 2}
#' \item{t}{t statistic}
#' \item{p}{p-value}
#' }
#'
#' @importFrom Matrix rowMeans
#' @importFrom stats pt
#'
#' @export
de_ttest <- function(x, c1, c2, normalize = FALSE, sf = 1){

    if (normalize){
        x <- divide_by_colsum(x)
        x <- log1p(sf * x)
    }

    mu1 <- rowMeans(x[,c1,drop=FALSE])
    mu2 <- rowMeans(x[,c2,drop=FALSE])

    keep <- mu1 > 0 & mu2 > 0
    x <- x[keep,,drop=FALSE]
    mu1 <- mu1[keep]
    mu2 <- mu2[keep]
    
    xc1 <- x[,c1,drop=FALSE]
    xc2 <- x[,c2,drop=FALSE]
    if ( (ncol(xc1) == 1) | (ncol(xc2) == 1) ){
        return(NA)
    }

    var1 <- fast_varCPP(x = t(xc1), 
                        mu = mu1)
    var2 <- fast_varCPP(x = t(xc2), 
                        mu = mu2)
    n1 <- ncol(x[,c1,drop=FALSE])
    n2 <- ncol(x[,c2,drop=FALSE])

    t <- sapply(1:nrow(x), function(i) {
        r <- (mu1[i] - mu2[i]) / sqrt(var1[i]/n1 + var2[i]/n2)
        return(r)
    })
    p <- sapply(1:length(t), function(i){
                num <- (var1[i]/n1 + var2[i]/n2)^2
                d1 <- (var1[i]/n1)^2 / (n1-1)
                d2 <- (var2[i]/n2)^2 / (n2-1)
                dof <- num / (d1 + d2)
                2 * pt(q = -abs(t[i]), df = dof)
    })
    datf <- data.frame(gene = rownames(x), 
                       diff = mu1 - mu2, 
                       logFC = log2(mu1) - log2(mu2), 
                       mu1 = mu1, 
                       mu2 = mu2, 
                       t = t, 
                       p = p) 
    datf[,"gene"] <- as.character(datf[,"gene"])
    rownames(datf) <- rownames(x)
    return(datf)
}

#' Run a DE across clusters with Welch's t-test
#'
#' Given expression data in \code{counts}, run differential 
#' expression between droplets in one group against all others, for 
#' each group in \code{labels}.
#' These counts can be normalized, or raw counts 
#' can be given. If providiing raw counts, set the \code{normalize} 
#' parameter to TRUE. Returns only genes with an adjust p-value 
#' less than \code{p_thresh} and a log fold change greater than 
#' \code{logFC}.
#'
#' @param counts Gene by droplet sparse matrix of expression data.
#' @param labels Vector of droplet labels giving the cluster that the 
#'  droplet belongs to.
#' @param normalize Whether to scale the droplets to sum to \code{sf} and 
#'  and then log transform.
#' @param sf Scale the counts in each droplet to sum to this value.
#' @param p_thresh Return only genes that have an adjusted p-value less than 
#'  this value.
#' @param logfc_thresh Return genes that have a log fold change of at least 
#'  this value.
#' @param p_method Adjust p-values for multiple testing using this method.
#' 
#' @return A data frame of DE results with
#' \describe{
#' \item{gene}{gene name}
#' \item{diff}{difference between mu1 and mu2}
#' \item{logFC}{log2 fold change}
#' \item{mu1}{mean of group 1}
#' \item{mu2}{mean of group 2}
#' \item{t}{t statistic}
#' \item{p}{p-value}
#' \item{p_adj}{p-value corrected for multiple testing}
#' \item{cluster}{cluster the DE genes belong to}
#' }
#'
#' @importFrom Matrix rowMeans
#' @importFrom stats pt
#'
#' @export
#'
#' @examples
#' \donttest{
#' counts <- raw_counts(sce, type = "test")
#' clusts <- clusters(sce)
#' markers <- de_ttest_all(counts, clusts)
#' }
de_ttest_all <- function(counts, 
                         labels, 
                         normalize = TRUE,
                         sf = 1, 
                         p_thresh = 0.05, 
                         logfc_thresh = 0, 
                         p_method = "bonferroni"){ 

    if (normalize){
        counts_p <- divide_by_colsum(counts)
        counts_s <- log1p(sf * counts_p)
    } else {
        counts_s <- counts
    }

    clusters <- sort(unique(labels))

    de_all <- list()
    for (c1 in clusters){
        c2 <- setdiff(clusters, c1)
        l1 <- labels %in% c1
        l2 <- labels %in% c2

        if ( (sum(l1) == 1) | (sum(l2) == 1) )
            next
        datf <- de_ttest(counts_s, l1, l2)
        datf$cluster <- c1
        datf$p_adj <- p.adjust(datf$p, method = p_method)
        datf <- datf[order(datf$t, decreasing = TRUE),]
        datf <- datf[(datf$p_adj < p_thresh) & (datf$logFC > logfc_thresh),,drop=FALSE]
        de_all[[as.character(c1)]] <- datf
    }
    de <- do.call(rbind, de_all)
    de[,"gene"] <- as.character(de[,"gene"])
    rownames(de) <- NULL
    return(de)
}

#' Run a basic DE between clusters
#'
#' @param means A gene by cluster sata frame of cluster means, 
#'  of which the columns sum to 1.
#' @param weights weight to give each column of means when calculating the 
#'  average proportions in means across columns.
#' @param top_n Number of top DE genes to return.
#' 
#' @return A data frame of DE results
de_basic <- function(means, weights, top_n = 20){
    clusters <- 1:ncol(means)
    if (length(weights) != ncol(means))
        stop("length of weights must be the same as number of columns in means.")
    weights <- weights / sum(weights)
    names(weights) <- clusters
    de <- lapply(clusters, function(cl){
                 cln <- setdiff(clusters, cl)
                 p1 <- means[,cl]
                 if (!is.null(weights)){
                     wn <- weights[cln] / sum(weights[cln])
                     p2 <- means[,cln,drop=FALSE] %*% wn
                 }
                 else {
                     p2 <- rowMeans(means[,cln,drop=FALSE])
                 }
                 di <- p1 - p2
                 logfc <- log2(p1) - log2(p2)
                 datf <- data.frame("diff" = di, 
                                    "logFC" = logfc,
                                    "p1" = p1, 
                                    "p2" = p2, 
                                    "Cluster" = cl, 
                                    "gene" = rownames(means))
                 o <- order(di, decreasing = TRUE)
                 datf <- datf[o,]
                 return(datf[1:top_n,])
    })
    ret <- do.call(rbind, de)
    ret$gene <- as.character(ret$gene)
    rownames(ret) <- NULL
    return(ret)
}

#' Get top up-regulated genes per cluster
#'
#' Extract marker genes for each cluster by ranking them based on the 
#' difference of their proportion means from the multinomial
#' distributions estimated by DIEM. Output the top \code{top_n} 
#' genes for each cluster.
#'
#' @param x An SCE object.
#' @param top_n Number of top DE genes to return.
#'
#' @return An SCE object.
#'
#' @export
top_genes <- function(x, top_n = 20){
    genes.use <- rownames(x@gene_data)[x@gene_data$exprsd]
    means <- Alpha(x)
    if (is.null(means))
        stop("Estimate parameters before getting top DE genes")

    means <- means[genes.use,]
    means <- sweep(means, 2, colSums(means), "/")

    clusters <- 1:ncol(means)
    testdat <- droplet_data(x, type = "test")
    dropdat <- droplet_data(x, type = "all")
    dropdat[,"Cluster"] <- "1"
    dropdat[rownames(testdat),"Cluster"] <- testdat[,"Cluster"]

    clusters_all <- as.character(clusters)

    w <- sapply(clusters_all, function(ct){
                ci <- which(dropdat$Cluster %in% ct)
                sum(dropdat[ci, "total_counts"])
    })
    de <- de_basic(means, top_n = top_n, weights = w)
    return(de)
}

#' Estimate debris score per droplet
#'
#' Create a debris score based on expression of debris-enriched genes.
#' For each droplet, it sums the normalized read counts for these genes. 
#' Debris-enriched genes are calculated using a t-test between 
#' droplets in the debris and non-debris clusters. Only droplets in the 
#' test set are used. Genes with an 
#' adjusted p-value less than \code{thresh_p}, and a log fold change 
#' greater than \code{thresh_logfc} are included in the debris 
#' set. The number of genes included in the debris set can be capped 
#' by setting the \code{max_genes} parameter. This may help ensure 
#' that only the most specific genes are used in the case the 
#' many genes are significant in the DE test.
#' 
#' Differential expression is run by scaling the droplet counts to 
#' sum to \code{sf} and log transforming. Then, a t-test is run 
#' between droplets in the debris set and the cluster being tested.
#'
#' @param x An SCE object.
#' @param dbr_clust Specify additional debris clusters/
#' @param thresh_genes Threshold for cluster mean number of genes 
#'  detected. Clusters with an average number of genes detected 
#'  less than this value are set to debris clusters.
#' @param thresh_p P-value threshold for calculating differential 
#'  expression between droplets in the debris and non-debris clusters.
#' @param thresh_logfc Debris genes must have a log2 fold change greater 
#'  than this value.
#' @param max_genes The maximum number of debris genes to include, ranked 
#'  by difference in proportion. This helps if a large number of genes 
#'  are found significantly DE, and may improve the specificity of the 
#'  debris score.
#' @param p_method Method for p-value correction using 
#'  \code{\link[stats]{p.adjust}}.
#'  expression between droplets in the debris and non-debris clusters.
#' @param sf Scaling factor to multiple normalized counts by before 
#'  log transformation. The normalized counts are used for the 
#'  DE analysis.
#' @param verbose Verbosity, indicating whether to show a progress
#'  bar.
#' 
#' @importFrom Matrix colSums
#' @importFrom stats p.adjust
#'
#' @return An SCE object
#' @export
estimate_dbr_score <- function(x, 
                               dbr_clust = c(1), 
                               thresh_genes = 200, 
                               thresh_p = 0.05, 
                               thresh_logfc = 0, 
                               max_genes = 5000, 
                               p_method = "fdr", 
                               sf = 1,
                               verbose = TRUE){ 
    name <- "score.debris"

    if (thresh_logfc < 0)
        stop(sQuote("thresh_logfc"), " must be at least 0, not ", thresh_logfc)

    gene_mean <- tapply(x@test_data[,"n_genes"], x@test_data[,"Cluster"], mean)
    lc_clust <- which(gene_mean < thresh_genes)
    dbr_clust <- unique(c(1, dbr_clust, lc_clust))
    x@debris_clusters <- dbr_clust

    test_dat <- x@test_data
    test_drop <- rownames(test_dat)
    test_genes <- rownames(x@gene_data)[x@gene_data$exprsd]
    test_counts <- x@counts[test_genes, test_drop, drop = FALSE]
    test_clusters <- test_dat[,"Cluster"]

    c1 <- test_clusters %in% dbr_clust
    c2 <- ! c1

    if (sum(c1) == 0)
        stop("No test droplets in the debris cluster(s)")

    counts_s <- divide_by_colsum(test_counts)
    counts_s <- log1p(sf * counts_s)

    de <- de_ttest(counts_s, c1, c2)
    de[,"Debris"] <- FALSE

    # Specify all clusters
    de <- de[order(de[,"diff"], decreasing = TRUE), , drop = FALSE]
    de[,"p_adj"] <- p.adjust(de[,"p"], method = p_method)
    keep <- (de[,"p_adj"] < thresh_p) & (de[,"logFC"] > thresh_logfc)

    if (sum(keep) == 0){
        x@debris_genes <- de
        x@test_data[,name] <- NA
        message("No DE genes found: cannot calculate debris scores")
        return(x)
    }
    
    de_genes <- de[keep,"gene"]
    nr <- min(length(de_genes), max_genes)
    de_genes <- de_genes[1:nr]
    de[de_genes,"Debris"] <- TRUE

    scores <- colSums(counts_s[de_genes, test_drop, drop = FALSE])
    clust_mean <- tapply(scores, x@test_data[,"Cluster"], mean)

    scores <- scores - min(clust_mean)
    dbr_mean <- mean(scores[x@test_data[,"Cluster"] %in% dbr_clust])
    scores <- scores / dbr_mean

    x@test_data[,name] <- as.numeric(scores)
    x@debris_genes <- de

    if (verbose){
        message("calculated debris scores using ", length(dbr_clust), 
                " debris clusters and ", length(de_genes), " debris genes")
    }

    return(x)
}
marcalva/diem documentation built on Jan. 1, 2023, 2:33 a.m.