R/plotting.R

#### plotting functions for slalom models

#' Plot results of a Slalom model
#'
#' @param object an object of class \code{Rcpp_SlalomModel}
#' @param n_active number of terms (factors) to be plotted (default is 20)
#' @param mad_filter numeric(1), filter factors by this mean absolute deviation
#' to exclude outliers. For large datasets this can be set to 0
#' @param annotated logical(1), should annotated factors be plotted? Default is
#' \code{TRUE}
#' @param unannotated_dense logical(1), should dense unannotated factors be
#' plotted? Default is \code{FALSE}
#' @param unannotated_sparse logical(1), should sparse unannotated factors be
#' plotted? Default is \code{FALSE}
#'
#' @return invisibly returns a list containing the two ggplot objects that make
#' up the plot
#'
#' @import ggplot2
#' @import grid
#' @export
#' @examples
#' gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom")
#' genesets <- GSEABase::getGmt(gmtfile)
#' data("mesc")
#' model <- newSlalomModel(mesc, genesets, n_hidden = 5, min_genes = 10)
#' model <- initSlalom(model)
#' model <- trainSlalom(model, nIterations = 10)
#' plotRelevance(model)
plotRelevance <- function(
    object, n_active = 20, mad_filter = 0.4, annotated = TRUE,
    unannotated_dense = FALSE, unannotated_sparse = FALSE) {
    if (!methods::is(object, "Rcpp_SlalomModel"))
        stop("object must be of class Rcpp_SlalomModel")

    df <- topTerms(object, n_active, mad_filter, annotated, unannotated_dense,
                   unannotated_sparse)
    df[["term"]] <- factor(df[["term"]], levels = rev(df[["term"]]))
    df[["n_loss"]] <- -df[["n_loss"]]
    p1 <- ggplot(df, aes_string(x = "relevance", y = "term", size = "n_prior")) +
        geom_segment(aes_string(x = 0, xend = "relevance",
                                y = "term", yend = "term"),
                     colour = "gray60", size = 1, show.legend = FALSE) +
        scale_size(range = c(1, 6), name = "Gene set size") +
        xlab("Relevance") +
        ylab("Active pathways") +
        geom_point() +
        theme_classic() + theme(legend.justification = c(1,0),
                                legend.position = c(0.99,0.01))
    p2 <- ggplot(df) +
        geom_segment(aes(x = 0, xend = n_gain,
                                y = term, yend = term),
                     colour = "firebrick", size = 1) +
        geom_point(aes(x = n_gain, y = term, colour = "firebrick"),
                   size = 3) +
        geom_segment(aes(x = 0, xend = n_loss,
                                y = term, yend = term),
                     colour = "dodgerblue", size = 1) +
        geom_point(aes(x = n_loss, y = term, colour = "dodgerblue"),
                   size = 3) +
        scale_colour_manual(name = "Gene set change", guide = "legend",
                            values = c('firebrick' = 'firebrick',
                                       'dodgerblue' = 'dodgerblue'),
                            labels = c('loss','gain')) +
        guides(size = FALSE) +
        xlab("Gene set augmentation") +
        theme_classic() + theme(legend.justification = c(1,0),
                                legend.position = c(0.99,0.01),
                                axis.text.y = element_blank(),
                                axis.title.y = element_blank(),
                                axis.ticks.y = element_blank())

    grid::grid.newpage()
    grid::grid.draw(cbind(ggplotGrob(p1), ggplotGrob(p2), size = "last"))
    invisible(list(p1, p2))
}


#' Plot highest loadings of a factor
#'
#' @param object an object of class \code{Rcpp_SlalomModel}
#' @param term integer(1) or character(1), providing either index for desired
#' term (if an integer) or the term name (if character)
#' @param n_genes integer(1), number of loadings (genes) to show
#'
#' @details Show the factor loadings for a genes with the highest loadings for
#' a given factor. Absolute weights are shown, with genes ordered by absolute
#' weight. Indications are given on the plot as to whether the gene was
#' originally in the factor geneset or added to it by the slalom model.
#'
#' If more genes are shown (n_genes) than are in the geneset or added to the
#' geneset based on sufficient weight and relevance, then these genes will be
#' shown in the plot in grey, with the "low weight" annotation.
#'
#'
#' @return a ggplot plot object
#'
#' @export
#' @examples
#' gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom")
#' genesets <- GSEABase::getGmt(gmtfile)
#' data("mesc")
#' model <- newSlalomModel(mesc, genesets, n_hidden = 5, min_genes = 10)
#' model <- initSlalom(model)
#' model <- trainSlalom(model, nIterations = 10)
#' plotLoadings(model, term = 2)
plotLoadings <- function(object, term, n_genes = 10) {
    if (!methods::is(object, "Rcpp_SlalomModel"))
        stop("object must be of class Rcpp_SlalomModel")
    if (is.character(term)) {
        term_match <- object$termNames == term
        if (all(!term_match))
            stop("Provided term not found in object.")
        term <- which(term_match)
    }
    Zchanged <- .getZchanged(object, term)
    W <- object$W_E1[, term]
    Z <- .getZ(object, term)
    gene_labels <- object$geneNames

    Wabs <- abs(W) * abs(Z)
    gene_index <- order(Wabs, decreasing = TRUE)[1:n_genes]

    anno <- ifelse(object$I[gene_index, term] > 0.5,
                   "in geneset", "low weight")
    anno[Zchanged[gene_index] == 1] <- "gained"

    df <- data.frame(
        gene = gene_labels[gene_index],
        absolute_weight = Wabs[gene_index],
        Annotation = anno
    )
    df[["gene"]] <- factor(df[["gene"]],
                           levels = rev(df[["gene"]]))
    df[["Annotation"]] <- factor(df[["Annotation"]],
                           levels = c("in geneset", "gained", "low weight"))

    ggplot(df, aes_string(x = "absolute_weight", y = "gene",
                          colour = "Annotation")) +
        geom_segment(aes_string(x = 0, xend = "absolute_weight",
                                y = "gene", yend = "gene"),
                     colour = "gray60", size = 0.5, show.legend = FALSE) +
        scale_size(range = c(1, 6), name = "Gene set size") +
        xlab("Abs. weight") +
        ylab("Genes") +
        geom_point(size = 3) +
        scale_color_manual(values = c("dodgerblue", "firebrick", "gray60"),
                           drop = FALSE) +
        theme_classic() + theme(legend.justification = c(1, 0),
                                legend.position = c(0.99, 0.01))
}


#' Plot relevance for all terms
#'
#'
#'
#' @param object an object of class \code{Rcpp_SlalomModel}
#' @param terms integer or character vector, providing either indices for
#' desired terms (if an integer) or the term names (if character); default is
#' \code{NULL}, in which case all terms are plotted.
#' @param order_terms logical(1), should factors be ordered by relevance (
#' \code{TRUE}, default), or in the order the come
#' @param mad_filter numeric(1), filter factors by this mean absolute deviation
#' to exclude outliers. For large datasets this can be set close to 0; default
#' is \code{0.2}.
#' @param annotated logical(1), should annotated factors be plotted? Default is
#' \code{TRUE}
#' @param unannotated_dense logical(1), should dense unannotated factors be
#' plotted? Default is \code{TRUE}
#' @param unannotated_sparse logical(1), should sparse unannotated factors be
#' plotted? Default is \code{TRUE}
#'
#' @return a ggplot plot object
#'
#' @export
#' @examples
#' gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom")
#' genesets <- GSEABase::getGmt(gmtfile)
#' data("mesc")
#' model <- newSlalomModel(mesc, genesets, n_hidden = 5, min_genes = 10)
#' model <- initSlalom(model)
#' model <- trainSlalom(model, nIterations = 10)
#' plotTerms(model)
#' plotTerms(model, unannotated_dense = FALSE)
plotTerms <- function(
    object, terms = NULL, order_terms = TRUE, mad_filter = 0.2,
    annotated = TRUE, unannotated_dense = TRUE, unannotated_sparse = FALSE) {
    if (!methods::is(object, "Rcpp_SlalomModel"))
        stop("object must be of class Rcpp_SlalomModel")
    if (is.character(terms)) {
        term_match <- object$termNames == terms
        if (all(!term_match))
            stop("None of the provided terms found in object.")
        terms <- which(term_match)
    }

    df <- topTerms(object, n_active = object$K, mad_filter, annotated,
                   unannotated_dense, unannotated_sparse)
    ## trim extreme term names
    df[["term"]] <- substr(df[["term"]], 1, 40)
    ## order terms for nice plotting
    if (order_terms)
        df[["term"]] <- factor(df[["term"]], levels = rev(df[["term"]]))
    else
        df[["term"]] <- factor(df[["term"]], levels = rev(sort(df[["term"]])))

    ggplot(df, aes_string(x = "relevance", y = "term",
                          colour = "type")) +
        geom_segment(aes_string(x = 0, xend = "relevance",
                                y = "term", yend = "term"),
                     colour = "gray80", size = 0.5, show.legend = FALSE) +
        scale_size(range = c(1, 4), name = "Gene set size") +
        xlab("Relevance") +
        ylab("Terms") +
        geom_point(size = 3) +
        scale_color_manual(values = c("dodgerblue", "firebrick"),
                           name = "Term type") +
        theme_classic() + theme(legend.justification = c(1, 0),
                                legend.position = c(0.99,0.01))
}



## Get posterior of Z (Bernoulli part of spike-and-slab prior) :math:`Q(Z)`
## Args: term        (str): optional list of terms for which weights are returned. Default None=all terms.
.getZ <- function(object, terms = NULL) {
    if (is.null(terms))
        object$W_gamma0
    else
        object$W_gamma0[, terms]
}


## get matrix indicating whether the posterior distribution has changed for individual terms/genes
## Args: terms        (str): optional list of terms for which weights are returned. Default None=all terms.
## Rv: matrix [0,-1,1]: 0: no change, -1: loss, +1: gain
.getZchanged <- function(object, terms = NULL, threshold = 0.5) {
    if (is.null(terms)) terms <- seq_len(object$K)
    Z <- .getZ(object, terms)
    Pi <- object$Pi_E1[, terms]
    I <- matrix(0, nrow = object$G, ncol = length(terms))
    Igain <- (Z > threshold) & (Pi < threshold)
    Iloss <- (Z < threshold) & (Pi > threshold)
    I[Igain] <- 1
    I[Iloss] <- -1
    I
}
PMBio/Rslalom documentation built on May 28, 2019, 2:23 p.m.