R/methods-knownMarkersDetected.R

#' Known Markers Detected
#'
#' @note Both the `all` and `known` objects must contain Ensembl gene
#'   identifiers in the `geneID` column. We must avoid any matching operations
#'   based on the gene names, since these change often and can mismatch
#'   easily.
#'
#' @name knownMarkersDetected
#' @family Clustering Functions
#' @author Michael Steinbaugh
#'
#' @inheritParams general
#' @param object All markers data, generated by [Seurat::FindAllMarkers()]
#'   return and sanitized with [sanitizeMarkers()].
#' @param known Known markers `data.frame` imported by
#'   [readCellTypeMarkers()] or pulled from internal [cellCycleMarkers] data.
#' @param alpha Alpha cutoff (adjusted P value; false discovery rate).
#' @param filterPromiscuous Remove genes with poor specificity, that are present
#'   in as many clusters as defined by `promiscuousCutoff`.
#' @param promiscuousCutoff Minimum number of clusters required to consider a
#'   gene marker promiscuous.
#'
#' @return `grouped_df`, grouped by "`cellType`" column.
#'
#' @examples
#' # grouped_df ====
#' x <- knownMarkersDetected(
#'     object = all_markers_small,
#'     known = cellTypeMarkers[["homoSapiens"]]
#' )
#' head(x)
NULL



# Methods ======================================================================
#' @rdname knownMarkersDetected
#' @export
setMethod(
    "knownMarkersDetected",
    signature("grouped_df"),
    function(
        object,
        known,
        alpha = 0.05,
        filterPromiscuous = FALSE,
        promiscuousCutoff = 5L
    ) {
        stopifnot(.isSanitizedMarkers(object))
        assert_is_tbl_df(known)
        # Check for `geneID` column in both `object` and `known`
        assert_is_subset("geneID", colnames(object))
        assert_is_subset("geneID", colnames(known))
        # Check for `geneID` overlap; avoid accidental organism mismatch
        assert_are_intersecting_sets(known[["geneID"]], object[["geneID"]])
        # Check for valid promiscuous cutoff
        assertIsAnImplicitInteger(promiscuousCutoff)
        assert_all_are_in_left_open_range(promiscuousCutoff, 1L)
        # Group by cell type and arrange by P value
        markers <- object %>%
            ungroup() %>%
            # Apply alpha cutoff, before adding cell type annotations
            filter(!!sym("padj") < !!alpha) %>%
            left_join(known[, c("cellType", "geneID")], by = "geneID") %>%
            select(
                !!!syms(c("cellType", "cluster", "geneID", "geneName")),
                everything()
            ) %>%
            mutate_at(c("cellType", "cluster"), as.factor) %>%
            filter(!is.na(!!sym("cellType"))) %>%
            group_by(!!sym("cellType")) %>%
            arrange(!!sym("padj"), .by_group = TRUE)
        if (isTRUE(filterPromiscuous)) {
            # Filter out promiscuous markers present in multiple clusters
            promiscuous <- markers %>%
                ungroup() %>%
                group_by(!!!syms(c("cellType", "geneID", "geneName"))) %>%
                summarize(n = n()) %>%
                filter(!!sym("n") >= !!promiscuousCutoff) %>%
                pull("geneID")
            if (length(promiscuous)) {
                message(paste(
                    "Promiscuous markers:", toString(promiscuous)
                ))
                markers <- filter(markers, !!sym("geneID") %in% !!!promiscuous)
            }
        }
        markers
    }
)
roryk/bcbioSinglecell documentation built on May 27, 2019, 10:44 p.m.