R/plotMarkerDiffExp.R

Defines functions plotMarkerDiffExp

Documented in plotMarkerDiffExp

#' Plot a heatmap to visualize the result of \code{\link{findMarkerDiffExp}}
#' @description This function will first reads the result saved in
#' \code{metadata} slot, named by \code{"findMarker"} and generated by
#' \code{\link{findMarkerDiffExp}}. Then it do the filtering on the statistics
#' based on the input parameters and get unique genes to plot. We choose the
#' genes that are identified as up-regulated only. As for the genes identified
#' as up-regulated for multiple clusters, we only keep the belonging towards the
#' one they have the highest Log2FC value.
#' In the heatmap, there will always be a cell annotation for the cluster
#' labeling used when finding the markers, and a feature annotation for which
#' cluster each gene belongs to. And by default we split the heatmap by these
#' two annotations. Additional legends can be added and the splitting can be
#' canceled.
#' @param inSCE \linkS4class{SingleCellExperiment} inherited object.
#' @param useAssay character. A string specifying which assay to use for the
#' expression values. Default \code{"logcounts"}.
#' @param log2fcThreshold Only use DEGs with the absolute values of log2FC
#' larger than this value. Default \code{1}
#' @param fdrThreshold Only use DEGs with FDR value smaller than this value.
#' Default \code{0.05}
#' @param topN An integer. Only to plot this number of top markers for each
#' cluster in maximum, in terms of log2FC value. Use \code{NULL} to cancel the
#' top N subscription. Default \code{10}.
#' @param orderBy The ordering method of the clusters on the splitted heatmap.
#' Can be chosen from \code{"size"} or \code{"name"}, specified with vector of
#' ordered unique cluster labels, or set as \code{NULL} for unsplitted heatmap.
#' Default \code{"size"}.
#' @param decreasing Order the cluster decreasingly. Default \code{TRUE}.
#' @param rowDataName character. The column name(s) in \code{rowData} that need
#' to be added to the annotation. Default \code{NULL}.
#' @param colDataName character. The column name(s) in \code{colData} that need
#' to be added to the annotation. Default \code{NULL}.
#' @param featureAnnotations \code{data.frame}, with \code{rownames} containing
#' all the features going to be plotted. Character columns should be factors.
#' Default \code{NULL}.
#' @param cellAnnotations \code{data.frame}, with \code{rownames} containing
#' all the cells going to be plotted. Character columns should be factors.
#' Default \code{NULL}.
#' @param featureAnnotationColor A named list. Customized color settings for
#' feature labeling. Should match the entries in the \code{featureAnnotations}
#' or \code{rowDataName}. For each entry, there should be a list/vector of
#' colors named with categories. Default \code{NULL}.
#' @param cellAnnotationColor A named list. Customized color settings for
#' cell labeling. Should match the entries in the \code{cellAnnotations} or
#' \code{colDataName}. For each entry, there should be a list/vector of colors
#' named with categories. Default \code{NULL}.
#' @param colSplitBy character vector. Do semi-heatmap based on the grouping of
#' this(these) annotation(s). Should exist in either \code{colDataName} or
#' \code{names(cellAnnotations)}. Default is the value of \code{cluster} in
#' \code{\link{findMarkerDiffExp}} when \code{orderBy} is not \code{NULL}, or
#' \code{NULL} otherwise.
#' @param rowSplitBy character vector. Do semi-heatmap based on the grouping of
#' this(these) annotation(s). Should exist in either \code{rowDataName} or
#' \code{names(featureAnnotations)}. Default \code{"marker"}, which indicates an
#' auto generated annotation for this plot.
#' @param ... Other arguments passed to \code{\link{plotSCEHeatmap}}.
#' @return A \code{\link[ComplexHeatmap]{Heatmap}} object
#' @author Yichen Wang
#' @export
plotMarkerDiffExp <- function(inSCE, useAssay = 'logcounts', orderBy = 'size',
    log2fcThreshold = 1, fdrThreshold = 0.05, topN = 10, decreasing = TRUE,
    rowDataName = NULL, colDataName = NULL, featureAnnotations = NULL,
    cellAnnotations = NULL, featureAnnotationColor = NULL,
    cellAnnotationColor = NULL,
    colSplitBy = ifelse(is.null(orderBy), NULL,
                        colnames(inSCE@metadata$findMarker)[5]),
    rowSplitBy = "marker", ...){
    if(!inherits(inSCE, 'SingleCellExperiment')){
        stop('"inSCE" should be a SingleCellExperiment inherited Object.')
    }
    if(!'findMarker' %in% names(S4Vectors::metadata(inSCE))){
        stop('"findMarker" result not found in metadata. ',
             'Run findMarkerDiffExp() before plotting.')
    }
    if(!is.null(orderBy)){
        if(length(orderBy) == 1){
            if(!orderBy %in% c('size', 'name')){
                stop('Single charater setting for "orderBy" should be chosen',
                     'from "size" or "name".')
            }
        }# else if(any(!SummarizedExperiment::colData(inSCE)[[cluster]] %in%
         #             orderBy)){
         #   stop('Invalid "orderBy", please input a vector of unique ordered ',
         #        'cluster identifiers that match all clusters in ',
         #        'colData(inSCE) specified by "cluster" to adjust the order ',
         #        'of clusters.')
        #}
    }
    # Extract and basic filter
    degFull <- S4Vectors::metadata(inSCE)$findMarker
    if(!all(c("Gene", "Pvalue", "Log2_FC", "FDR") %in%
            colnames(degFull)[seq_len(4)])){
        stop('"findMarker" result cannot be interpreted properly')
    }
    if(length(which(!degFull$Gene %in% rownames(inSCE))) > 0){
      # Remove genes happen in deg table but not in sce. Weird.
      degFull <- degFull[-which(!degFull$Gene %in% rownames(inSCE)),]
    }
    if(!is.null(log2fcThreshold)){
        degFull <- degFull[degFull$Log2_FC > log2fcThreshold,]
    }
    if(!is.null(fdrThreshold)){
        degFull <- degFull[degFull$FDR < fdrThreshold,]
    }
    # Remove duplicate by assigning the duplicated genes to the cluster where
    # their log2FC is the highest.
    # Done by keeping all unique genes and the rows  with highest Log2FC entry
    # for each duplicated gene.
    dup.gene <- unique(degFull$Gene[duplicated(degFull$Gene)])
    for(g in dup.gene){
        deg.gix <- degFull$Gene == g
        deg.gtable <- degFull[deg.gix,]
        toKeep <- which.max(deg.gtable$Log2_FC)
        toRemove <- which(deg.gix)[-toKeep]
        degFull <- degFull[-toRemove,]
    }
    clusterName <- colnames(degFull)[5]
    selected <- character()
    if (!is.null(topN)) {
      for (c in unique(degFull[[clusterName]])) {
        deg.cluster <- degFull[degFull[[clusterName]] == c,]
        deg.cluster <- deg.cluster[order(deg.cluster$Log2_FC,
                                         decreasing = TRUE),]
        if (dim(deg.cluster)[1] > topN) {
          deg.cluster <- deg.cluster[seq_len(topN),]
        }
        selected <- c(selected, deg.cluster$Gene)
      }
    } else {
      selected <- degFull$Gene
    }
    degFull <- degFull[degFull$Gene %in% selected,]
    inSCE <- inSCE[degFull$Gene,]

    z <- SummarizedExperiment::colData(inSCE)[[clusterName]]
    if(is.factor(z)){
        z.order <- levels(z)
        # When 'z' is a subset factor, there would be redundant levels.
        z.order <- z.order[z.order %in% as.vector(unique(z))]
        z <- factor(z, levels = z.order)
        SummarizedExperiment::rowData(inSCE)[[clusterName]] <-
            factor(degFull[[clusterName]], levels = z.order)
    } else {
        SummarizedExperiment::rowData(inSCE)[[clusterName]] <-
          degFull[[clusterName]]
    }
    y <- SummarizedExperiment::rowData(inSCE)[[clusterName]]
    if(!is.null(orderBy)){
        if(length(orderBy) == 1 && orderBy == 'size'){
            z.order <- names(table(z)[order(table(z), decreasing = decreasing)])
        } else if(length(orderBy) == 1 && orderBy == 'name'){
            if(is.factor(z)){
                z.order <- sort(levels(z), decreasing = decreasing)
            } else {
                z.order <- sort(unique(z), decreasing = decreasing)
            }
        } else {
            z.order <- orderBy[-which(!orderBy %in% z)]
        }
    }
    SummarizedExperiment::colData(inSCE)[[clusterName]] <-
        factor(z, levels = z.order)
    SummarizedExperiment::rowData(inSCE)[["marker"]] <-
        factor(y, levels = z.order)
    # Organize plotSCEHeatmap arguments
    colDataName <- c(colDataName, clusterName)
    rowDataName <- c(rowDataName, "marker")
    markerConsistColor <-
        list(marker = dataAnnotationColor(inSCE, 'col')[[clusterName]])
    featureAnnotationColor <- c(featureAnnotationColor, markerConsistColor)
    hm <- plotSCEHeatmap(inSCE, useAssay = useAssay, colDataName = colDataName,
                         rowDataName = rowDataName, colSplitBy = colSplitBy,
                         rowSplitBy = rowSplitBy,
                         featureAnnotations = featureAnnotations,
                         cellAnnotations = cellAnnotations,
                         featureAnnotationColor = featureAnnotationColor,
                         cellAnnotationColor = cellAnnotationColor,
                         cluster_row_slices = FALSE,
                         cluster_column_slices = FALSE, ...)
    return(hm)
}

Try the singleCellTK package in your browser

Any scripts or data that you put into this service are public.

singleCellTK documentation built on Nov. 8, 2020, 5:21 p.m.