monocle3: Clustering, Differential Expression, and Trajectory Analysis for Single-Cell RNA-Seq

Documented in plot_cells_3d

monocle_theme_opts <- function()
{
  theme(strip.background = element_rect(colour = 'white', fill = 'white')) +
    theme(panel.border = element_blank()) +
    theme(axis.line.x = element_line(size=0.25, color="black")) +
    theme(axis.line.y = element_line(size=0.25, color="black")) +
    theme(panel.grid.minor.x = element_blank(),
          panel.grid.minor.y = element_blank()) +
    theme(panel.grid.major.x = element_blank(),
          panel.grid.major.y = element_blank()) +
    theme(panel.background = element_rect(fill='white')) +
    theme(legend.key=element_blank())
}

#' Plot a dataset and trajectory in 3 dimensions
#'
#' @param cds cell_data_set to plot
#' @param dims numeric vector that indicates the dimensions used to create the
#'   3D plot, by default it is the first three dimensions.
#' @param reduction_method string indicating the reduction method to plot.
#' @param color_cells_by the cell attribute (e.g. the column of colData(cds))
#'   to map to each cell's color. Default is cluster.
#' @param genes a gene name or gene id to color the plot by.
#' @param show_trajectory_graph a logical used to indicate whether to graph the
#'   principal graph backbone. Default is TRUE.
#' @param trajectory_graph_color the color of graph backbone. Default is black.
#' @param trajectory_graph_segment_size numeric indicating the width of the
#'   graph backbone. Default is 5.
#' @param norm_method string indicating the method used to transform gene
#'   expression when gene markers are provided. Default is "log". "size_only"
#'   is also supported.
#' @param cell_size numeric indicating the size of the point to be plotted.
#'   Default is 25.
#' @param alpha numeric indicating the alpha value of the plotted cells.
#'   Default is 1.
#' @param min_expr numeric indicating the minimum marker gene value to be
#'   colored. Default is 0.1.
#' @param color_palette List of colors to pass to plotly for coloring cells by
#'   categorical variables. Default is NULL. When NULL, plotly uses default
#'   colors.
#' @param color_scale The name of the color scale passed to plotly for coloring
#'   cells by numeric scale. Default is "Viridis".
#' @return a plotly plot object
#' @export
#'
#' @examples
#' \dontrun{
#' plot_cells_3d(cds, markers=c("Rbfox3, Neurod1", "Sox2"))
#' }
#'
#' @importFrom future value
#' @importFrom plyr "."
#' @export
plot_cells_3d <- function(cds,
                          dims = c(1,2,3),
                          reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"),
                          color_cells_by="cluster",
                          #group_cells_by=c("cluster", "partition"), #
                          genes=NULL,
                          show_trajectory_graph=TRUE,
                          trajectory_graph_color="black",
                          trajectory_graph_segment_size=5,
                          norm_method = c("log", "size_only"),
                          color_palette = NULL,
                          color_scale = "Viridis",
                          #label_cell_groups = TRUE,#
                          #label_groups_by_cluster=TRUE,#
                          #group_label_size=2,#
                          #labels_per_group=1,#
                          #label_branch_points=TRUE,#
                          #label_roots=TRUE,#
                          #label_leaves=TRUE,#
                          #graph_label_size=2,#
                          cell_size=25,
                          alpha = 1,
                          min_expr=0.1) {

  reduction_method <- match.arg(reduction_method)
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  assertthat::assert_that(!is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]]),
                          msg = paste("No dimensionality reduction for",
                                      reduction_method, "calculated.",
                                      "Please run reduce_dimension with",
                                      "reduction_method =", reduction_method,
                                      "before attempting to plot."))
  low_dim_coords <- SingleCellExperiment::reducedDims(cds)[[reduction_method]]
  if(!is.null(color_cells_by)) {
    assertthat::assert_that(color_cells_by %in% c("cluster", "partition",
                                                  "pseudotime") |
                              color_cells_by %in% names(colData(cds)),
                            msg = paste("color_cells_by must be a column in",
                                        "the colData table."))
  }

  assertthat::assert_that(!is.null(color_cells_by) || !is.null(markers),
                          msg = paste("Either color_cells_by or markers must",
                                      "be NULL, cannot color by both!"))
  norm_method = match.arg(norm_method)

  if (show_trajectory_graph &&
      is.null(principal_graph(cds)[[reduction_method]])) {
    message("No trajectory to plot. Has learn_graph() been called yet?")
    show_trajectory_graph = FALSE
  }

  gene_short_name <- NA
  sample_name <- NA

  x <- dims[[1]]
  y <- dims[[2]]
  z <- dims[[3]]

  S_matrix <- SingleCellExperiment::reducedDims(cds)[[reduction_method]]
  data_df <- data.frame(S_matrix[,c(dims)])

  colnames(data_df) <- c("data_dim_1", "data_dim_2", "data_dim_3")
  data_df$sample_name <- row.names(data_df)

  data_df <- as.data.frame(cbind(data_df, colData(cds)))

  if (color_cells_by == "cluster"){
    data_df$cell_color <- tryCatch({
      clusters(cds, reduction_method = reduction_method)[data_df$sample_name]},
      error = function(e) {NULL})
  } else if (color_cells_by == "partition") {
    data_df$cell_color <- tryCatch({
      partitions(cds,
                 reduction_method = reduction_method)[data_df$sample_name]},
      error = function(e) {NULL})
  } else if (color_cells_by == "pseudotime") {
    data_df$cell_color <- tryCatch({
      pseudotime(cds,
                 reduction_method = reduction_method)[data_df$sample_name]},
      error = function(e) {NULL})
  } else{
    data_df$cell_color <- colData(cds)[data_df$sample_name,color_cells_by]
  }

  ## Marker genes
  markers_exprs <- NULL
  if (!is.null(genes)) {
    if ((is.null(dim(genes)) == FALSE) && dim(genes)[[2]] >= 2){
      markers <- unlist(genes[,1], use.names=FALSE)
    } else {
      markers <- genes
    }
    markers_rowData <-
      as.data.frame(subset(rowData(cds), gene_short_name %in% markers |
                             row.names(rowData(cds)) %in% markers))
    if (nrow(markers_rowData) >= 1) {
      cds_exprs <- SingleCellExperiment::counts(cds)[row.names(markers_rowData), ,drop=FALSE]    # plot_cells_3d() row selection needs help
      cds_exprs <- Matrix::t(Matrix::t(cds_exprs)/size_factors(cds))

      if ((is.null(dim(genes)) == FALSE) && dim(genes)[[2]] >= 2){
        genes <- as.data.frame(genes)
        row.names(genes) <- genes[,1]
        genes <- genes[row.names(cds_exprs),]
        agg_mat <-
          as.matrix(my.aggregate.Matrix(cds_exprs,
                                        as.factor(genes[,2]),
                                        fun="sum"))
        agg_mat <- t(scale(t(log10(agg_mat + 1))))
        agg_mat[agg_mat < -2] <- -2
        agg_mat[agg_mat > 2] <- 2
        markers_exprs <- agg_mat
        markers_exprs <- reshape2::melt(markers_exprs)
        colnames(markers_exprs)[1:2] <- c('feature_id','cell_id')

        markers_exprs$feature_label <- markers_exprs$feature_id
        #markers_linear <- TRUE
      } else {
          cds_exprs <- round(cds_exprs, digits=4)
        # bge
        if(!is(cds_exprs, 'IterableMatrix')) {
          markers_exprs <- matrix(cds_exprs, nrow=nrow(markers_rowData))
        }
        else {
          markers_exprs <- matrix(as(cds_exprs, 'dgCMatrix'), nrow=nrow(markers_rowData))
        }
        colnames(markers_exprs) <- colnames(SingleCellExperiment::counts(cds))    # plot_cells_3d()   col_names  OK
        row.names(markers_exprs) <- row.names(markers_rowData)
        markers_exprs <- reshape2::melt(markers_exprs)
        colnames(markers_exprs)[1:2] <- c('feature_id','cell_id')
        markers_exprs <- merge(markers_exprs, markers_rowData,
                               by.x = "feature_id", by.y="row.names")
        markers_exprs$feature_label <-
          as.character(markers_exprs$gene_short_name)
        markers_exprs$feature_label[is.na(markers_exprs$feature_label)] <-
          markers_exprs$feature_id
        markers_exprs$feature_label <- factor(markers_exprs$feature_label,
                                              levels = markers)
      }
    }
  }

  if (is.null(markers_exprs) == FALSE && nrow(markers_exprs) > 0){
    data_df <- merge(data_df, markers_exprs, by.x="sample_name",
                     by.y="cell_id")
    data_df$expression <- with(data_df, ifelse(value >= min_expr, value, NA))
    sub1 <- data_df[!is.na(data_df$expression),]
    sub2 <- data_df[is.na(data_df$expression),]
    if(norm_method == "size_only"){
      p <- plotly::plot_ly(sub1) %>%
        plotly::add_trace(x = ~data_dim_1, y = ~data_dim_2, z = ~data_dim_3,
                          type = 'scatter3d', size=I(cell_size), alpha = I(alpha),
                          mode="markers", marker=list(
                            colorbar = list(title = "Expression", len=0.5),
                            color=~expression,
                            colors=color_scale,
                            line=list(width = 1,
                                      color = ~expression,
                                      colorscale=color_scale),
                            colorscale=color_scale)) %>%
        plotly::add_markers(x = sub2$data_dim_1, y = sub2$data_dim_2,
                            z = sub2$data_dim_3, color = I("lightgrey"),
                            size=I(cell_size),
                            marker=list(opacity = .4), showlegend=FALSE)
    } else {
      sub1$log10_expression <- log10(sub1$expression + min_expr)
      p <- plotly::plot_ly(sub1) %>%
        plotly::add_trace(x = ~data_dim_1, y = ~data_dim_2, z = ~data_dim_3,
                          type = 'scatter3d', size=I(cell_size), alpha = I(alpha),
                          mode="markers", marker=list(
                            colorbar = list(title = "Log10\nExpression", len=0.5),
                            color=~log10_expression,
                            colors=color_scale,
                            line=list(width = 1,
                                      color = ~log10_expression,
                                      colorscale=color_scale),
                            colorscale=color_scale)) %>%
        plotly::add_markers(x = sub2$data_dim_1, y = sub2$data_dim_2,
                            z = sub2$data_dim_3, color = I("lightgrey"),
                            size=I(cell_size),
                            marker=list(opacity = .4), showlegend=FALSE)
    }
  } else {
    if(color_cells_by %in% c("cluster", "partition")){
      if (is.null(data_df$cell_color)){
        p <- plotly::plot_ly(data_df, x = ~data_dim_1, y = ~data_dim_2,
                             z = ~data_dim_3, type = 'scatter3d',
                             size=I(cell_size), color=I("gray"),
                             mode="markers", alpha = I(alpha))
        message("cluster_cells() has not been called yet, can't color ",
                "cells by cluster or partition")
      } else{
        if(is.null(color_palette)) {
          N <- length(unique(data_df$cell_color))
          color_palette <- RColorBrewer::brewer.pal(N, "Set2")
        }
        p <- plotly::plot_ly(data_df, x = ~data_dim_1, y = ~data_dim_2,
                             z = ~data_dim_3, type = 'scatter3d',
                             size=I(cell_size), color=~cell_color,
                             colors = color_palette,
                             mode="markers", alpha = I(alpha))
      }
    } else if(methods::is(data_df$cell_color, "numeric")) {
      p <- plotly::plot_ly(data_df) %>%
        plotly::add_trace(x = ~data_dim_1, y = ~data_dim_2, z = ~data_dim_3,
                          type = 'scatter3d', size=I(cell_size), alpha = I(alpha),
                          mode="markers", marker=list(
                            colorbar = list(title = color_cells_by, len=0.5),
                            color=~cell_color,
                            colors=color_scale,
                            line=list(width = 1,
                                      color = ~cell_color,
                                      colorscale=color_scale),
                            colorscale=color_scale))
    } else {
      if(is.null(color_palette)) {
        N <- length(unique(data_df$cell_color))
        color_palette <- RColorBrewer::brewer.pal(N, "Set2")
      }
      p <- plotly::plot_ly(data_df, x = ~data_dim_1, y = ~data_dim_2,
                           z = ~data_dim_3, type = 'scatter3d',
                           size=I(cell_size), color=~cell_color,
                           colors = color_palette,
                           mode="markers", alpha = I(alpha))
    }
  }
  p <- p %>%
    plotly::layout(scene = list(xaxis=list(title=paste("Component", x)),
                                yaxis=list(title=paste("Component", y)),
                                zaxis=list(title=paste("Component", z))))
  ## Graph info
  if (show_trajectory_graph) {

    ica_space_df <- t(cds@principal_graph_aux[[reduction_method]]$dp_mst) %>%
      as.data.frame() %>%
      dplyr::select(prin_graph_dim_1 = {{x}}, prin_graph_dim_2 = {{y}},
                     prin_graph_dim_3 = {{z}}) %>%
      dplyr::mutate(sample_name = rownames(.),
                    sample_state = rownames(.))

    dp_mst <- cds@principal_graph[[reduction_method]]

    edge_df <- dp_mst %>%
      igraph::as_data_frame() %>%
      dplyr::select(source = "from", target = "to") %>%
      dplyr::left_join(ica_space_df %>%
                         dplyr::select(source="sample_name",
                                        source_prin_graph_dim_1="prin_graph_dim_1",
                                        source_prin_graph_dim_2="prin_graph_dim_2",
                                        source_prin_graph_dim_3="prin_graph_dim_3"),
                       by = "source") %>%
      dplyr::left_join(ica_space_df %>%
                         dplyr::select(target="sample_name",
                                        target_prin_graph_dim_1="prin_graph_dim_1",
                                        target_prin_graph_dim_2="prin_graph_dim_2",
                                        target_prin_graph_dim_3="prin_graph_dim_3"),
                       by = "target")

    if(nrow(edge_df) < 1) warning('bad loop: nrow(edge_df) < 1')
    for (i in 1:nrow(edge_df)) {
      p <- p %>%
        plotly::add_trace(
          x = as.vector(t(edge_df[i, c("source_prin_graph_dim_1",
                                       "target_prin_graph_dim_1")])),
          y = as.vector(t(edge_df[i, c("source_prin_graph_dim_2",
                                       "target_prin_graph_dim_2")])),
          z = as.vector(t(edge_df[i, c("source_prin_graph_dim_3",
                                       "target_prin_graph_dim_3")])),
          color = trajectory_graph_color,
          line = list(color = I(trajectory_graph_color),
                      width = trajectory_graph_segment_size), mode = 'lines',
          type = 'scatter3d', showlegend = FALSE)
    }
  }
  p
}

#' Plots the cells along with their trajectories.
#'
#' @param cds cell_data_set for the experiment
#' @param x the column of SingleCellExperiment::reducedDims(cds) to plot on the horizontal axis
#' @param y the column of SingleCellExperiment::reducedDims(cds) to plot on the vertical axis
#' @param cell_size The size of the point for each cell
#' @param cell_stroke The stroke used for plotting each cell - default is 1/2
#'   of the cell_size
#' @param reduction_method The lower dimensional space in which to plot cells.
#'   Must be one of "UMAP", "tSNE", "PCA" and "LSI".
#' @param color_cells_by What to use for coloring the cells. Must be either the
#'   name of a column of colData(cds), or one of "clusters", "partitions", or
#'   "pseudotime".
#' @param group_cells_by How to group cells when labeling them. Must be either
#'   the name of a column of colData(cds), or one of "clusters" or "partitions".
#'   If a column in colData(cds), must be a categorical variable.
#' @param genes Facet the plot, showing the expression of each gene in a facet
#'   panel. Must be either a list of gene ids (or short names), or a dataframe
#'   with two columns that groups the genes into modules that will be
#'   aggregated prior to plotting. If the latter, the first column must be gene
#'   ids, and the second must the group for each gene.
#' @param show_trajectory_graph Whether to render the principal graph for the
#'   trajectory. Requires that learn_graph() has been called on cds.
#' @param trajectory_graph_color The color to be used for plotting the
#'   trajectory graph.
#' @param trajectory_graph_segment_size The size of the line segments used for
#'   plotting the trajectory graph.
#' @param norm_method How to normalize gene expression scores prior to plotting
#'   them. Must be one of "log" or "size_only".
#' @param label_cell_groups Whether to label cells in each group (as specified
#'   by group_cells_by) according to the most frequently occurring label(s) (as
#'   specified by color_cells_by) in the group. If false, plot_cells() simply
#'   adds a traditional color legend.
#' @param label_groups_by_cluster Instead of labeling each cluster of cells,
#'   place each label once, at the centroid of all cells carrying that label.
#' @param group_label_size Font size to be used for cell group labels.
#' @param labels_per_group How many labels to plot for each group of cells.
#'   Defaults to 1, which plots only the most frequent label per group.
#' @param label_branch_points Whether to plot a label for each branch point in
#'   the principal graph.
#' @param label_roots Whether to plot a label for each root in the principal
#'   graph.
#' @param label_leaves Whether to plot a label for each leaf node in the
#'   principal graph.
#' @param graph_label_size How large to make the branch, root, and leaf labels.
#' @param alpha Alpha for the cells. Useful for reducing overplotting.
#' @param min_expr Minimum expression threshold for plotting genes
#' @param rasterize Whether to plot cells as a rastered bitmap. Requires the
#'   ggrastr package.
#' @param scale_to_range Logical indicating whether to scale expression to
#'   percent of maximum expression.
#' @param label_principal_points Logical indicating whether to label roots,
#'   leaves, and branch points with principal point names. This is useful for
#'   order_cells and choose_graph_segments in non-interactive mode.
#'
#' @return a ggplot2 plot object
#' @importFrom plyr "."
#' @importFrom future value
#'
#' @examples
#'   \dontrun{
#'     lung <- load_A549()
#'     plot_cells(lung)
#'     plot_cells(lung, color_cells_by="log_dose")
#'     plot_cells(lung, markers="GDF15")
#'   }
#'
#' @importFrom dplyr n
#' @export
plot_cells <- function(cds,
                       x=1,
                       y=2,
                       reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"),
                       color_cells_by="cluster",
                       group_cells_by="cluster",
                       genes=NULL,
                       show_trajectory_graph=TRUE,
                       trajectory_graph_color="grey28",
                       trajectory_graph_segment_size=0.75,
                       norm_method = c("log", "size_only"),
                       label_cell_groups = TRUE,
                       label_groups_by_cluster=TRUE,
                       group_label_size=2,
                       labels_per_group=1,
                       label_branch_points=TRUE,
                       label_roots=TRUE,
                       label_leaves=TRUE,
                       graph_label_size=2,
                       cell_size=0.35,
                       cell_stroke= I(cell_size / 2),
                       alpha = 1,
                       min_expr=0.1,
                       rasterize=FALSE,
                       scale_to_range=TRUE,
                       label_principal_points = FALSE) {
  feature_label <- min_val_for_feature <- max_val_for_feature <- NULL # no visible binding
  cell_group <- cell_color <- cells_in_cluster <- per <- NULL # no visible binding
  reduction_method <- match.arg(reduction_method)
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  assertthat::assert_that(!is.null(SingleCellExperiment::reducedDims(cds)[[reduction_method]]),
                          msg = paste("No dimensionality reduction for",
                                      reduction_method, "calculated.",
                                      "Please run reduce_dimension with",
                                      "reduction_method =", reduction_method,
                                      "before attempting to plot."))
  low_dim_coords <- SingleCellExperiment::reducedDims(cds)[[reduction_method]]
  assertthat::assert_that(ncol(low_dim_coords) >=max(x,y),
                          msg = paste("x and/or y is too large. x and y must",
                                      "be dimensions in reduced dimension",
                                      "space."))
  if(!is.null(color_cells_by)) {
    assertthat::assert_that(color_cells_by %in% c("cluster", "partition",
                                                  "pseudotime") |
                              color_cells_by %in% names(colData(cds)),
                            msg = paste("color_cells_by must one of",
                                        "'cluster', 'partition', 'pseudotime,",
                                        "or a column in the colData table."))

    if(color_cells_by == "pseudotime") {
      tryCatch({pseudotime(cds, reduction_method = reduction_method)},
               error = function(x) {
                 stop("No pseudotime for ", reduction_method,
                            " calculated. Please run order_cells with ",
                            "reduction_method = ", reduction_method,
                            " before attempting to color by pseudotime.")})

    }
  }
  assertthat::assert_that(!is.null(color_cells_by) || !is.null(markers),
                          msg = paste("Either color_cells_by or markers must",
                                      "be NULL, cannot color by both!"))

  norm_method = match.arg(norm_method)

  assertthat::assert_that((group_cells_by %in% c("cluster", "partition")) ||
                          (group_cells_by %in% names(SummarizedExperiment::colData(cds))),
                          msg = paste("group_cells_by must be one of",
                          "'cluster', 'partition', or a column in the colData table."))

  assertthat::assert_that(!is.null(color_cells_by) || !is.null(genes),
                          msg = paste("Either color_cells_by or genes must be",
                                      "NULL, cannot color by both!"))

  if (show_trajectory_graph &&
      is.null(principal_graph(cds)[[reduction_method]])) {
    message("No trajectory to plot. Has learn_graph() been called yet?")
    show_trajectory_graph = FALSE
  }
  if (label_principal_points &&
      is.null(principal_graph(cds)[[reduction_method]])) {
    message("Cannot label principal points when no trajectory to plot. Has learn_graph() been called yet?")
    label_principal_points = FALSE
  }

  if (label_principal_points) {
    label_branch_points <- FALSE
    label_leaves <- FALSE
    label_roots <- FALSE
  }


  gene_short_name <- NA
  sample_name <- NA
  #sample_state <- colData(cds)$State
  data_dim_1 <- NA
  data_dim_2 <- NA
  if (rasterize){
    plotting_func <- ggrastr::geom_point_rast
  }else{
    plotting_func <- ggplot2::geom_point
  }

  S_matrix <- SingleCellExperiment::reducedDims(cds)[[reduction_method]]
  data_df <- data.frame(S_matrix[,c(x,y)])

  colnames(data_df) <- c("data_dim_1", "data_dim_2")
  data_df$sample_name <- row.names(data_df)

  data_df <- as.data.frame(cbind(data_df, colData(cds)))
  if (group_cells_by == "cluster") {
    data_df$cell_group <-
      tryCatch({clusters(cds,
                         reduction_method = reduction_method)[
                           data_df$sample_name]},
               error = function(e) {NULL})
  } else if (group_cells_by == "partition") {
    data_df$cell_group <-
      tryCatch({partitions(cds,
                           reduction_method = reduction_method)[
                             data_df$sample_name]},
               error = function(e) {NULL})
  } else{
    data_df$cell_group <- SummarizedExperiment::colData(cds)[data_df$sample_name, group_cells_by]
  }

  if (color_cells_by == "cluster"){
    data_df$cell_color <-
      tryCatch({clusters(cds,
                         reduction_method = reduction_method)[
                           data_df$sample_name]},
               error = function(e) {NULL})
  } else if (color_cells_by == "partition") {
    data_df$cell_color <-
      tryCatch({partitions(cds,
                           reduction_method = reduction_method)[
                             data_df$sample_name]},
               error = function(e) {NULL})
  } else if (color_cells_by == "pseudotime") {
    data_df$cell_color <-
      tryCatch({pseudotime(cds,
                           reduction_method = reduction_method)[
                             data_df$sample_name]}, error = function(e) {NULL})
  } else{
    data_df$cell_color <- colData(cds)[data_df$sample_name, color_cells_by]
  }

  ## Graph info
  if (show_trajectory_graph) {

    ica_space_df <- t(cds@principal_graph_aux[[reduction_method]]$dp_mst) %>%
      as.data.frame() %>%
      dplyr::select(prin_graph_dim_1 = {{x}}, prin_graph_dim_2 = {{y}}) %>%
      dplyr::mutate(sample_name = rownames(.),
                    sample_state = rownames(.))

    dp_mst <- cds@principal_graph[[reduction_method]]

    edge_df <- dp_mst %>%
      igraph::as_data_frame() %>%
      dplyr::select(source = "from", target = "to") %>%
      dplyr::left_join(ica_space_df %>%
                         dplyr::select(
                           source="sample_name",
                           source_prin_graph_dim_1="prin_graph_dim_1",
                           source_prin_graph_dim_2="prin_graph_dim_2"),
                       by = "source") %>%
      dplyr::left_join(ica_space_df %>%
                         dplyr::select(
                           target="sample_name",
                           target_prin_graph_dim_1="prin_graph_dim_1",
                           target_prin_graph_dim_2="prin_graph_dim_2"),
                       by = "target")
  }

  ## Marker genes
  markers_exprs <- NULL
  expression_legend_label <- NULL
  if (!is.null(genes)) {
    if (!is.null(dim(genes)) && dim(genes)[[2]] >= 2){
      markers = unlist(genes[,1], use.names=FALSE)
    } else {
      markers = genes
    }
	markers_rowData <- rowData(cds)[(rowData(cds)$gene_short_name %in% markers) |
							        (row.names(rowData(cds)) %in% markers),,drop=FALSE]
	markers_rowData <- as.data.frame(markers_rowData)
    if (nrow(markers_rowData) == 0) {
      stop("None of the provided genes were found in the cds")
    }
    if (nrow(markers_rowData) >= 1) {
      cds_exprs <- SingleCellExperiment::counts(cds)[row.names(markers_rowData), ,drop=FALSE]   # plot_cells    row selection  needs help
      assertthat::assert_that(!is.null(size_factors(cds)))
      cds_exprs <- Matrix::t(Matrix::t(cds_exprs)/size_factors(cds))

      if (!is.null(dim(genes)) && dim(genes)[[2]] >= 2){
        #genes = as.data.frame(genes)
        #row.names(genes) = genes[,1]
        #genes = genes[row.names(cds_exprs),]

        tmp_matrix <- tryCatch(aggregate_gene_expression(cds, genes, norm_method=norm_method, gene_agg_fun="mean", scale_agg_values=FALSE),
                        error = function(c) { stop(paste0(trimws(c), '\n* error in plot_cells')) })
        agg_mat = as.matrix(tmp_matrix)
        markers_exprs = agg_mat
        markers_exprs <- reshape2::melt(markers_exprs)
        colnames(markers_exprs)[1:2] <- c('feature_id','cell_id')
        if (is.factor(genes[,2]))
          markers_exprs$feature_id = factor(markers_exprs$feature_id,
                                            levels=levels(genes[,2]))

        markers_exprs$feature_label <- markers_exprs$feature_id
        norm_method = "size_only"
        expression_legend_label = "Expression score"
      } else {
        cds_exprs = round(cds_exprs, digits=4)
        # bge
        if(!is(cds_exprs, 'IterableMatrix')) {
          markers_exprs = matrix(cds_exprs, nrow=nrow(markers_rowData))   # bge
        }
        else {
          markers_exprs = matrix(as(cds_exprs, 'dgCMatrix'), nrow=nrow(markers_rowData))
        }
        colnames(markers_exprs) = colnames(SingleCellExperiment::counts(cds))     # plot_cells  col_names OK
        row.names(markers_exprs) = row.names(markers_rowData)
        markers_exprs <- reshape2::melt(markers_exprs)
        colnames(markers_exprs)[1:2] <- c('feature_id','cell_id')
        markers_exprs <- merge(markers_exprs, markers_rowData,
                               by.x = "feature_id", by.y="row.names")
        if (is.null(markers_exprs$gene_short_name)) {
          markers_exprs$feature_label <-
            as.character(markers_exprs$feature_id)
        } else {
          markers_exprs$feature_label <-
            as.character(markers_exprs$gene_short_name)
        }

        markers_exprs$feature_label <- ifelse(is.na(markers_exprs$feature_label) | !as.character(markers_exprs$feature_label) %in% markers,
                                              as.character(markers_exprs$feature_id),
                                              as.character(markers_exprs$feature_label))

        markers_exprs$feature_label <- factor(markers_exprs$feature_label,
                                              levels = markers)
        if (norm_method == "size_only")
          expression_legend_label = "Expression"
        else
          expression_legend_label = "log10(Expression)"
      }

      if (scale_to_range){
				markers_exprs = dplyr::group_by(markers_exprs, feature_label) %>%
				  dplyr::mutate(max_val_for_feature = max(value),
								min_val_for_feature = min(value)) %>%
				  dplyr::mutate(value = 100 * (value - min_val_for_feature) / (max_val_for_feature - min_val_for_feature))
				expression_legend_label = "% Max"
			  }
			}
		  }

		  if (label_cell_groups && is.null(color_cells_by) == FALSE){
			if (is.null(data_df$cell_color)){
			  if (is.null(genes)){
				message(color_cells_by, " not found in colData(cds), cells will ",
							  "not be colored")
			  }
			  text_df = NULL
			  label_cell_groups = FALSE
			}else{
			  if(is.character(data_df$cell_color) || is.factor(data_df$cell_color)) {

				if (label_groups_by_cluster && is.null(data_df$cell_group) == FALSE){
				  text_df = data_df %>%
					dplyr::group_by(cell_group) %>%
					dplyr::mutate(cells_in_cluster= dplyr::n()) %>%
					dplyr::group_by(cell_color, .add=TRUE) %>%
					dplyr::mutate(per=dplyr::n()/cells_in_cluster)
				  median_coord_df = text_df %>%
					dplyr::summarize(fraction_of_group = dplyr::n(),
									 text_x = stats::median(x = data_dim_1),
									 text_y = stats::median(x = data_dim_2))
				  text_df = suppressMessages(text_df %>% dplyr::select(per) %>%
											   dplyr::distinct())
				  text_df = suppressMessages(dplyr::inner_join(text_df,
															   median_coord_df))
				  text_df = text_df %>% dplyr::group_by(cell_group) %>%
					dplyr::top_n(labels_per_group, per)
				} else {
				  text_df = data_df %>% dplyr::group_by(cell_color) %>%
					dplyr::mutate(per=1)
				  median_coord_df = text_df %>%
					dplyr::summarize(fraction_of_group = dplyr::n(),
									 text_x = stats::median(x = data_dim_1),
									 text_y = stats::median(x = data_dim_2))
				  text_df = suppressMessages(text_df %>% dplyr::select(per) %>%
											   dplyr::distinct())
				  text_df = suppressMessages(dplyr::inner_join(text_df,
															   median_coord_df))
				  text_df = text_df %>% dplyr::group_by(cell_color) %>%
					dplyr::top_n(labels_per_group, per)
				}

				text_df$label = as.character(text_df %>% dplyr::pull(cell_color))
				# I feel like there's probably a good reason for the bit below, but I
				# hate it and I'm killing it for now.
				# text_df$label <- paste0(1:nrow(text_df))
				# text_df$process_label <- paste0(1:nrow(text_df), '_',
				# as.character(as.matrix(text_df[, 1])))
				# process_label <- text_df$process_label
				# names(process_label) <- as.character(as.matrix(text_df[, 1]))
				# data_df[, group_by] <-
				#  process_label[as.character(data_df[, group_by])]
				# text_df$label = process_label
			  } else {
				message("Cells aren't colored in a way that allows them to ",
                        "be grouped.")
				text_df = NULL
				label_cell_groups = FALSE
			  }
			}
		  }

		  if (!is.null(markers_exprs) && nrow(markers_exprs) > 0){
			data_df <- merge(data_df, markers_exprs, by.x="sample_name",
							 by.y="cell_id")
			data_df$value <- with(data_df, ifelse(value >= min_expr, value, NA))
			ya_sub <- data_df[!is.na(data_df$value),]
			na_sub <- data_df[is.na(data_df$value),]
			if(norm_method == "size_only"){
			  g <- ggplot(data=data_df, aes(x=data_dim_1, y=data_dim_2)) +
				plotting_func(aes(data_dim_1, data_dim_2), size=I(cell_size),
							  stroke = I(cell_stroke), color = "grey80", alpha = alpha,
							  data = na_sub) +
				plotting_func(aes(color=value), size=I(cell_size),
							  stroke = I(cell_stroke),
							  data = ya_sub[order(ya_sub$value),]) +
				viridis::scale_color_viridis(option = "viridis",
											 name = expression_legend_label,
											 na.value = NA, end = 0.8,
											 alpha = alpha) +
				guides(alpha = 'none') + facet_wrap(~feature_label)
    } else {
      g <- ggplot(data=data_df, aes(x=data_dim_1, y=data_dim_2)) +
        plotting_func(aes(data_dim_1, data_dim_2), size=I(cell_size),
                      stroke = I(cell_stroke), color = "grey80",
                      data = na_sub, alpha = alpha)
      if (scale_to_range){
        g <- g + plotting_func(aes(color=value),
                               size=I(cell_size), stroke = I(cell_stroke),
                               data = ya_sub[order(ya_sub$value),],
                               alpha = alpha)
      }else{
        g <- g + plotting_func(aes(color=log10(value+min_expr)),
                               size=I(cell_size), stroke = I(cell_stroke),
                               data = ya_sub[order(ya_sub$value),],
                               alpha = alpha)
      }

      g <- g +
        viridis::scale_color_viridis(option = "viridis",
                                     name = expression_legend_label,
                                     na.value = NA, end = 0.8,
                                     alpha = alpha) +
        guides(alpha = 'none') + facet_wrap(~feature_label)
    }
  } else {
    g <- ggplot(data=data_df, aes(x=data_dim_1, y=data_dim_2))

    g <- g + geom_point(color=I("black"), size=1.5*cell_size,
                        stroke = I(cell_stroke), na.rm = TRUE,
                        alpha = I(alpha))

    # We don't want to force users to call order_cells before even being able
    # to look at the trajectory, so check whether it's null and if so, just
    # don't color the cells
    if(color_cells_by %in% c("cluster", "partition")){
      if (is.null(data_df$cell_color)){
        g <- g + geom_point(color=I("gray"), size=I(cell_size),
                            stroke = I(cell_stroke), na.rm = TRUE,
                            alpha = I(alpha))
        message("cluster_cells() has not been called yet, can't ",
                      "color cells by cluster")
      } else{
        g <- g + geom_point(aes(color = cell_color), size=I(cell_size),
                            stroke = I(cell_stroke), na.rm = TRUE,
                            alpha = alpha)
      }
      g <- g + guides(color = guide_legend(title = color_cells_by,
                                           override.aes = list(size = 4)))
    } else if (methods::is(data_df$cell_color, "numeric")) {
      g <- g + geom_point(aes(color = cell_color), size=I(cell_size),
                          stroke = I(cell_stroke), na.rm = TRUE, alpha = alpha)
      g <- g + viridis::scale_color_viridis(name = color_cells_by, option="C")
    } else {
      g <- g + geom_point(aes(color = cell_color), size=I(cell_size),
                          stroke = I(cell_stroke), na.rm = TRUE, alpha = alpha)
      g <- g + guides(color = guide_legend(title = color_cells_by,
                                           override.aes = list(size = 4)))
    }

  }
  if (show_trajectory_graph){
    g <- g + geom_segment(aes_string(x="source_prin_graph_dim_1",
                                     y="source_prin_graph_dim_2",
                                     xend="target_prin_graph_dim_1",
                                     yend="target_prin_graph_dim_2"),
                          size=trajectory_graph_segment_size,
                          color=I(trajectory_graph_color),
                          linetype="solid",
                          na.rm=TRUE,
                          data=edge_df)


    if (label_principal_points) {
      mst_branch_nodes <- branch_nodes(cds, reduction_method)
      mst_leaf_nodes <- leaf_nodes(cds, reduction_method)
      mst_root_nodes <- root_nodes(cds, reduction_method)
      pps <- c(mst_branch_nodes, mst_leaf_nodes, mst_root_nodes)
      princ_point_df <- ica_space_df %>%
        dplyr::slice(match(names(pps), sample_name))

      g <- g +
        geom_point(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2"),
                   shape = 21, stroke=I(trajectory_graph_segment_size),
                   color="white",
                   fill="black",
                   size=I(graph_label_size * 1.5),
                   na.rm=TRUE, princ_point_df) +
        ggrepel::geom_text_repel(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2",
                             label="sample_name"),
                  size=I(graph_label_size * 1.5), color="Black", na.rm=TRUE,
                  princ_point_df)
    }
    if (label_branch_points){
      mst_branch_nodes <- branch_nodes(cds, reduction_method)
      branch_point_df <- ica_space_df %>%
        dplyr::slice(match(names(mst_branch_nodes), sample_name)) %>%
        dplyr::mutate(branch_point_idx = seq_len(dplyr::n()))

      g <- g +
        geom_point(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2"),
                   shape = 21, stroke=I(trajectory_graph_segment_size),
                   color="white",
                   fill="black",
                   size=I(graph_label_size * 1.5),
                   na.rm=TRUE, branch_point_df) +
        geom_text(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2",
                             label="branch_point_idx"),
                  size=I(graph_label_size), color="white", na.rm=TRUE,
                  branch_point_df)
    }

    if (label_leaves){
      mst_leaf_nodes <- leaf_nodes(cds, reduction_method)
      leaf_df <- ica_space_df %>%
        dplyr::slice(match(names(mst_leaf_nodes), sample_name)) %>%
        dplyr::mutate(leaf_idx = seq_len(dplyr::n()))

      g <- g +
        geom_point(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2"),
                   shape = 21, stroke=I(trajectory_graph_segment_size),
                   color="black",
                   fill="lightgray",
                   size=I(graph_label_size * 1.5),
                   na.rm=TRUE,
                   leaf_df) +
        geom_text(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2",
                             label="leaf_idx"),
                  size=I(graph_label_size), color="black", na.rm=TRUE, leaf_df)
    }

    if (label_roots){
      mst_root_nodes <- root_nodes(cds, reduction_method)
      root_df <- ica_space_df %>%
        dplyr::slice(match(names(mst_root_nodes), sample_name)) %>%
        dplyr::mutate(root_idx = seq_len(dplyr::n()))

      g <- g +
        geom_point(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2"),
                   shape = 21, stroke=I(trajectory_graph_segment_size),
                   color="black",
                   fill="white",
                   size=I(graph_label_size * 1.5),
                   na.rm=TRUE,
                   root_df) +
        geom_text(aes_string(x="prin_graph_dim_1", y="prin_graph_dim_2",
                             label="root_idx"),
                  size=I(graph_label_size), color="black", na.rm=TRUE, root_df)
    }
  }

  if(label_cell_groups) {
    g <- g + ggrepel::geom_text_repel(data = text_df,
                                      mapping = aes_string(x = "text_x",
                                                           y = "text_y",
                                                           label = "label"),
                                      size=I(group_label_size))
    # If we're coloring by gene expression, don't hide the legend
    if (is.null(markers_exprs))
      g <- g + theme(legend.position="none")
  }

  g <- g +
    #scale_color_brewer(palette="Set1") +
    monocle_theme_opts() +
    xlab(paste(reduction_method, x)) +
    ylab(paste(reduction_method, y)) +
    #guides(color = guide_legend(label.position = "top")) +
    theme(legend.key = element_blank()) +
    theme(panel.background = element_rect(fill='white'))
  g
}



#' Plots expression for one or more genes as a function of pseudotime
#'
#' @param cds_subset subset cell_data_set including only the genes to be
#'   plotted.
#' @param min_expr the minimum (untransformed) expression level to plot.
#' @param cell_size the size (in points) of each cell used in the plot.
#' @param nrow the number of rows used when laying out the panels for each
#'   gene's expression.
#' @param ncol the number of columns used when laying out the panels for each
#'   gene's expression
#' @param panel_order vector of gene names indicating the order in which genes
#'   should be laid out (left-to-right, top-to-bottom). If
#'   \code{label_by_short_name = TRUE}, use gene_short_name values, otherwise
#'   use feature IDs.
#' @param color_cells_by the cell attribute (e.g. the column of colData(cds))
#'   to be used to color each cell.
#' @param trend_formula the model formula to be used for fitting the expression
#'   trend over pseudotime.
#' @param label_by_short_name label figure panels by gene_short_name (TRUE) or
#'   feature ID (FALSE).
#' @param vertical_jitter A value passed to ggplot to jitter the points in the
#'   vertical dimension. Prevents overplotting, and is particularly helpful for
#'   rounded transcript count data.
#' @param horizontal_jitter A value passed to ggplot to jitter the points in
#'   the horizontal dimension. Prevents overplotting, and is particularly
#'   helpful for rounded transcript count data.
#' @return a ggplot2 plot object
#' @export
plot_genes_in_pseudotime <-function(cds_subset,
                                    min_expr=NULL,
                                    cell_size=0.75,
                                    nrow=NULL,
                                    ncol=1,
                                    panel_order=NULL,
                                    color_cells_by="pseudotime",
                                    trend_formula="~ splines::ns(pseudotime, df=3)",
                                    label_by_short_name=TRUE,
                                    vertical_jitter=NULL,
                                    horizontal_jitter=NULL){
  assertthat::assert_that(methods::is(cds_subset, "cell_data_set"))
  tryCatch({pseudotime(cds_subset)}, error = function(x) {
    stop("No pseudotime calculated. Must call order_cells first.")})
  colData(cds_subset)$pseudotime <- pseudotime(cds_subset)
  if(!is.null(min_expr)) {
    assertthat::assert_that(assertthat::is.number(min_expr))
  }
  assertthat::assert_that(assertthat::is.number(cell_size))

  if(!is.null(nrow)) {
    assertthat::assert_that(assertthat::is.count(nrow))
  }

  assertthat::assert_that(assertthat::is.count(ncol))
  assertthat::assert_that(is.logical(label_by_short_name))
  if (label_by_short_name) {
    assertthat::assert_that("gene_short_name" %in% names(rowData(cds_subset)),
                            msg = paste("When label_by_short_name = TRUE,",
                                        "rowData must have a column of gene",
                                        "names called gene_short_name."))
  }
  assertthat::assert_that(color_cells_by %in% c("cluster", "partition") |
                            color_cells_by %in% names(colData(cds_subset)),
                          msg = paste("color_cells_by must be a column in the",
                                      "colData table."))

  if(!is.null(panel_order)) {
    if (label_by_short_name) {
      assertthat::assert_that(all(panel_order %in%
                                    rowData(cds_subset)$gene_short_name))
    } else {
      assertthat::assert_that(all(panel_order %in%
                                    row.names(rowData(cds_subset))))
    }
  }
  assertthat::assert_that(nrow(rowData(cds_subset)) <= 100,
                          msg = paste("cds_subset has more than 100 genes -",
                                      "pass only the subset of the CDS to be",
                                      "plotted."))

  assertthat::assert_that(methods::is(cds_subset, "cell_data_set"))
  assertthat::assert_that("pseudotime" %in% names(colData(cds_subset)),
                          msg = paste("pseudotime must be a column in",
                                      "colData. Please run order_cells",
                                      "before running",
                                      "plot_genes_in_pseudotime."))
  if(!is.null(min_expr)) {
    assertthat::assert_that(assertthat::is.number(min_expr))
  }
  assertthat::assert_that(assertthat::is.number(cell_size))
  assertthat::assert_that(!is.null(size_factors(cds_subset)))
  if(!is.null(nrow)) {
    assertthat::assert_that(assertthat::is.count(nrow))
  }

  assertthat::assert_that(assertthat::is.count(ncol))
  assertthat::assert_that(is.logical(label_by_short_name))
  if (label_by_short_name) {
    assertthat::assert_that("gene_short_name" %in% names(rowData(cds_subset)),
                            msg = paste("When label_by_short_name = TRUE,",
                                        "rowData must have a column of gene",
                                        "names called gene_short_name."))
  }
  assertthat::assert_that(color_cells_by %in% c("cluster", "partition") |
                            color_cells_by %in% names(colData(cds_subset)),
                          msg = paste("color_cells_by must be a column in the",
                                      "colData table."))

  if(!is.null(panel_order)) {
    if (label_by_short_name) {
      assertthat::assert_that(all(panel_order %in%
                                    rowData(cds_subset)$gene_short_name))
    } else {
      assertthat::assert_that(all(panel_order %in%
                                    row.names(rowData(cds_subset))))
    }
  }
  assertthat::assert_that(nrow(rowData(cds_subset)) <= 100,
                          msg = paste("cds_subset has more than 100 genes -",
                                      "pass only the subset of the CDS to be",
                                      "plotted."))

  f_id <- NA
  Cell <- NA
  cds_subset = cds_subset[,is.finite(colData(cds_subset)$pseudotime)]

  cds_exprs <- SingleCellExperiment::counts(cds_subset)                          #  plot_genes_in_pseudotime()
  cds_exprs <- Matrix::t(Matrix::t(cds_exprs)/size_factors(cds_subset))          #  scale
  cds_exprs <- reshape2::melt(round(as.matrix(cds_exprs)))                       #  convert to dense

  if (is.null(min_expr)) {
    min_expr <- 0
  }
  colnames(cds_exprs) <- c("f_id", "Cell", "expression")
  cds_colData <- colData(cds_subset)
  cds_rowData <- rowData(cds_subset)
  cds_exprs <- merge(cds_exprs, cds_rowData, by.x = "f_id", by.y = "row.names")
  cds_exprs <- merge(cds_exprs, cds_colData, by.x = "Cell", by.y = "row.names")

  cds_exprs$adjusted_expression <- cds_exprs$expression

  if (label_by_short_name == TRUE) {
    if (is.null(cds_exprs$gene_short_name) == FALSE) {
      cds_exprs$feature_label <- as.character(cds_exprs$gene_short_name)
      cds_exprs$feature_label[is.na(cds_exprs$feature_label)] <- cds_exprs$f_id
    }
    else {
      cds_exprs$feature_label <- cds_exprs$f_id
    }
  }
  else {
    cds_exprs$feature_label <- cds_exprs$f_id
  }
  cds_exprs$f_id <- as.character(cds_exprs$f_id)
  cds_exprs$feature_label <- factor(cds_exprs$feature_label)


  new_data <- as.data.frame(colData(cds_subset))
  new_data$Size_Factor = 1
  model_tbl = fit_models(cds_subset, model_formula_str = trend_formula)

  model_expectation <- model_predictions(model_tbl,
                                         new_data = new_data)

  colnames(model_expectation) <- colnames(cds_subset)
  expectation <- plyr::ddply(cds_exprs, plyr::.(f_id, Cell),
                             function(x) {
                               data.frame(
                                 "expectation"=model_expectation[x$f_id,
                                                                 x$Cell])
                             })
  cds_exprs <- merge(cds_exprs, expectation)

  cds_exprs$expression[cds_exprs$expression < min_expr] <- min_expr
  cds_exprs$expectation[cds_exprs$expectation < min_expr] <- min_expr
  if (!is.null(panel_order)) {
    cds_exprs$feature_label <- factor(cds_exprs$feature_label,
                                      levels = panel_order)
  }
  q <- ggplot(aes(pseudotime, expression), data = cds_exprs)

  if (!is.null(color_cells_by)) {
    q <- q + geom_point(aes_string(color = color_cells_by),
                        size = I(cell_size),
                        position=position_jitter(horizontal_jitter,
                                                 vertical_jitter))
    if (methods::is(colData(cds_subset)[,color_cells_by], "numeric")) {
      q <- q + viridis::scale_color_viridis(option="C")
    }
  }
  else {
    q <- q + geom_point(size = I(cell_size),
                        position=position_jitter(horizontal_jitter,
                                                 vertical_jitter))
  }

  q <- q + geom_line(aes(x = pseudotime, y = expectation), data = cds_exprs)

  q <- q + scale_y_log10() + facet_wrap(~feature_label, nrow = nrow,
                                        ncol = ncol, scales = "free_y")
  if (min_expr < 1) {
    q <- q + expand_limits(y = c(min_expr, 1))
  }

  q <- q + ylab("Expression")

  q <- q + xlab("pseudotime")
  q <- q + monocle_theme_opts()
  q
}

#' Plots the fraction of variance explained by the each component based on
#' PCA from the normalized expression data determined using preprocess_cds.
#' This is the fraction of the component variance relative to the variance
#' of the components retained in the PCA; not the total variance.
#'
#' @param cds cell_data_set of the experiment.
#' @return ggplot object.
#'
#' @examples
#'   \donttest{
#'     cds <- load_a549()
#'     cds <- preprocess_cds(cds)
#'     plot_pc_variance_explained(cds)
#'   }
#'
#' @export
plot_pc_variance_explained <- function(cds) {
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  assertthat::assert_that(!is.null(SingleCellExperiment::reducedDims(cds)[["PCA"]]),
                          msg = paste("Data has not been preprocessed with",
                                      "PCA. Please run preprocess_cds with",
                                      "method = 'PCA' before running",
                                      "plot_pc_variance_explained."))

  prop_varex <- cds@reduce_dim_aux[['PCA']][['model']][['prop_var_expl']]

  if(length(prop_varex) < 1) warning('bad loop: length(prop_varex) < 1')
  p <- qplot(1:length(prop_varex), prop_varex, alpha = I(0.5)) +
    monocle_theme_opts() +
    theme(legend.position="top", legend.key.height=grid::unit(0.35, "in")) +
    theme(panel.background = element_rect(fill='white')) +
    xlab('PCA components') +
    ylab('Variance explained \n by each component')
  return(p)
}

#' Plot expression for one or more genes as a violin plot
#'
#' @description Accepts a subset of a cell_data_set and an attribute to group
#' cells by, and produces a ggplot2 object that plots the level of expression
#' for each group of cells. The median value is plotted as a black dot.
#'
#' @param cds_subset Subset cell_data_set to be plotted.
#' @param group_cells_by NULL of the cell attribute (e.g. the column of
#'   colData(cds)) to group cells by on the horizontal axis. If NULL, all cells
#'   are plotted together.
#' @param min_expr the minimum (untransformed) expression level to be plotted.
#'   Default is 0.
#' @param nrow the number of panels per row in the figure.
#' @param ncol the number of panels per column in the figure.
#' @param panel_order the order in which genes should be laid out
#'   (left-to-right, top-to-bottom). Should be gene_short_name if
#'   \code{label_by_short_name = TRUE} or feature ID if
#'   \code{label_by_short_name = FALSE}.
#' @param label_by_short_name label figure panels by gene_short_name (TRUE) or
#'   feature id (FALSE). Default is TRUE.
#' @param normalize Logical, whether or not to normalize expression by size
#'   factor. Default is TRUE.
#' @param log_scale Logical, whether or not to scale data logarithmically.
#'   Zero count cells are excluded from the plot and median when log_scale
#'   is TRUE. Default is TRUE.
#' @param pseudocount A pseudo-count added to the gene expression. A
#'   pseudocount value greater than 0 is reset to 1. Default is 0.
#' @return a ggplot2 plot object
#' @import ggplot2
#'
#' @examples
#'   \donttest{
#'     cds <- load_a549()
#'     cds_subset <- cds[row.names(subset(rowData(cds),
#'                      gene_short_name %in% c("ACTA1", "ID1", "CCNB2"))),]
#'     plot_genes_violin(cds_subset, group_cells_by="culture_plate", ncol=2,
#'                       min_expr=0.1)
#'   }
#'
#' @export
plot_genes_violin <- function (cds_subset,
                               group_cells_by = NULL,
                               min_expr = 0,
                               nrow = NULL,
                               ncol = 1,
                               panel_order = NULL,
                               label_by_short_name = TRUE,
                               normalize = TRUE,
                               log_scale = TRUE,
                               pseudocount = 0) {

  assertthat::assert_that(methods::is(cds_subset, "cell_data_set"))

  if(!is.null(group_cells_by)) {
    assertthat::assert_that(group_cells_by %in% names(colData(cds_subset)),
                            msg = paste("group_cells_by must be a column in",
                                        "the colData table"))
  }

  assertthat::assert_that(assertthat::is.number(min_expr))

  if(!is.null(nrow)) {
    assertthat::assert_that(assertthat::is.count(nrow))
  }

  assertthat::assert_that(assertthat::is.count(ncol))
  assertthat::assert_that(assertthat::is.number(pseudocount))
  assertthat::assert_that(is.logical(label_by_short_name))
  if (label_by_short_name) {
    assertthat::assert_that("gene_short_name" %in% names(rowData(cds_subset)),
                            msg = paste("When label_by_short_name = TRUE,",
                                        "rowData must have a column of gene",
                                        "names called gene_short_name."))
  }
  if(!is.null(panel_order)) {
    if (label_by_short_name) {
      assertthat::assert_that(all(panel_order %in%
                                    rowData(cds_subset)$gene_short_name))
    } else {
      assertthat::assert_that(all(panel_order %in%
                                    row.names(rowData(cds_subset))))
    }
  }

  assertthat::assert_that(is.logical(normalize))
  assertthat::assert_that(is.logical(log_scale))

  assertthat::assert_that(nrow(rowData(cds_subset)) <= 100,
                          msg = paste("cds_subset has more than 100 genes -",
                                      "pass only the subset of the CDS to be",
                                      "plotted."))
  if (pseudocount > 0) {
    cds_exprs <- SingleCellExperiment::counts(cds_subset) + 1  # plot_genes_violin()
  } else {
    cds_exprs <- SingleCellExperiment::counts(cds_subset)      # plot_genes_violin()
  }

  if (normalize) {
    cds_exprs <- Matrix::t(Matrix::t(cds_exprs)/size_factors(cds_subset))     #  scale
    cds_exprs <- reshape2::melt(as.matrix(cds_exprs))                         #  convert to dense
  } else {
    cds_exprs <- reshape2::melt(as.matrix(cds_exprs))                         #  convert to dense
  }

  colnames(cds_exprs) <- c("f_id", "Cell", "expression")
  cds_exprs$expression[cds_exprs$expression < min_expr] <- min_expr


  cds_exprs <- merge(cds_exprs, rowData(cds_subset), by.x = "f_id",
                     by.y = "row.names")
  cds_exprs <- merge(cds_exprs, colData(cds_subset), by.x = "Cell",
                     by.y = "row.names")

  if (label_by_short_name) {
    if (!is.null(cds_exprs$gene_short_name)) {
      cds_exprs$feature_label <- cds_exprs$gene_short_name
      cds_exprs$feature_label[is.na(cds_exprs$feature_label)] <- cds_exprs$f_id
    } else {
      cds_exprs$feature_label <- cds_exprs$f_id
    }
  } else {
    cds_exprs$feature_label <- cds_exprs$f_id
  }

  if (!is.null(panel_order)) {
    cds_exprs$feature_label = factor(cds_exprs$feature_label,
                                     levels = panel_order)
  }

  cds_exprs[,group_cells_by] <- as.factor(cds_exprs[,group_cells_by])

  q <- ggplot(aes_string(x = group_cells_by, y = "expression"),
              data = cds_exprs) +
    monocle_theme_opts()

  cds_exprs[,group_cells_by] <- as.factor(cds_exprs[,group_cells_by])
  q <- q + geom_violin(aes_string(fill = group_cells_by), scale="width") +
    guides(fill='none')
  q <- q + stat_summary(fun=median, geom="point", size=1, color="black")
  q <- q + facet_wrap(~feature_label, nrow = nrow,
                      ncol = ncol, scales = "free_y")
  if (min_expr < 1) {
    q <- q + expand_limits(y = c(min_expr, 1))
  }

  q <- q + ylab("Expression") + xlab(group_cells_by)

  if (log_scale){
    q <- q + scale_y_log10()
  }
  q
}


#' Plot expression for one or more genes as a hybrid histogram-interval with
#' a Sina plot overlay.
#'
#' @description Accepts a subset of a cell_data_set and an attribute to group
#' cells by, and produces a ggplot2 object that plots the level of expression
#' for each group of cells. The cells appear as red dots in a Sina plot
#' and the cell distribution appears as a histogram with green bars and
#' a blue median_qi interval. See help for ggforce::geom_sina and
#' ggdist::stat_histinterval for additional information.
#'
#' @param cds_subset Subset cell_data_set to be plotted.
#' @param group_cells_by NULL of the cell attribute (e.g. the column of
#'   colData(cds)) to group cells by on the horizontal axis. If NULL, all cells
#'   are plotted together.
#' @param min_expr the minimum (untransformed) expression level to be plotted.
#'   Default is 0.
#' @param nrow the number of panels per row in the figure.
#' @param ncol the number of panels per column in the figure.
#' @param panel_order the order in which genes should be laid out
#'   (left-to-right, top-to-bottom). Should be gene_short_name if
#'   \code{label_by_short_name = TRUE} or feature ID if
#'   \code{label_by_short_name = FALSE}.
#' @param label_by_short_name label figure panels by gene_short_name (TRUE) or
#'   feature id (FALSE). Default is TRUE.
#' @param normalize Logical, whether or not to normalize expression by size
#'   factor. Default is TRUE.
#' @param log_scale Logical, whether or not to scale data logarithmically.
#'   Zero count cells are excluded from the plot, interval, and median
#'   when log_scale is TRUE. Default is TRUE.
#' @param pseudocount A pseudo-count added to the gene expression. A
#'   pseudocount value greater than 0 is reset to 1. Default is 0.
#' @return a ggplot2 plot object
#' @import ggplot2
#'
#' @examples
#'   \donttest{
#'     cds <- load_a549()
#'     cds_subset <- cds[row.names(subset(rowData(cds),
#'                      gene_short_name %in% c("ACTA1", "ID1", "CCNB2"))),]
#'     plot_genes_hybrid(cds_subset, group_cells_by="culture_plate", ncol=2,
#'                       min_expr=0.1)
#'   }
#'
#' @export
plot_genes_hybrid <- function (cds_subset,
                               group_cells_by = NULL,
                               min_expr = 0,
                               nrow = NULL,
                               ncol = 1,
                               panel_order = NULL,
                               label_by_short_name = TRUE,
                               normalize = TRUE,
                               log_scale = TRUE,
                               pseudocount = 0) {

  assertthat::assert_that(methods::is(cds_subset, "cell_data_set"))

  if(!is.null(group_cells_by)) {
    assertthat::assert_that(group_cells_by %in% names(colData(cds_subset)),
                            msg = paste("group_cells_by must be a column in",
                                        "the colData table"))
  }

  assertthat::assert_that(assertthat::is.number(min_expr))

  if(!is.null(nrow)) {
    assertthat::assert_that(assertthat::is.count(nrow))
  }

  assertthat::assert_that(assertthat::is.count(ncol))
  assertthat::assert_that(assertthat::is.number(pseudocount))
  assertthat::assert_that(is.logical(label_by_short_name))
  if (label_by_short_name) {
    assertthat::assert_that("gene_short_name" %in% names(rowData(cds_subset)),
                            msg = paste("When label_by_short_name = TRUE,",
                                        "rowData must have a column of gene",
                                        "names called gene_short_name."))
  }
  if(!is.null(panel_order)) {
    if (label_by_short_name) {
      assertthat::assert_that(all(panel_order %in%
                                    rowData(cds_subset)$gene_short_name))
    } else {
      assertthat::assert_that(all(panel_order %in%
                                    row.names(rowData(cds_subset))))
    }
  }

  assertthat::assert_that(is.logical(normalize))
  assertthat::assert_that(is.logical(log_scale))

  assertthat::assert_that(nrow(rowData(cds_subset)) <= 100,
                          msg = paste("cds_subset has more than 100 genes -",
                                      "pass only the subset of the CDS to be",
                                      "plotted."))
  if (pseudocount > 0) {
    cds_exprs <- SingleCellExperiment::counts(cds_subset) + 1
  } else {
    cds_exprs <- SingleCellExperiment::counts(cds_subset)
  }

  if (normalize) {
    cds_exprs <- Matrix::t(Matrix::t(cds_exprs)/size_factors(cds_subset))
    cds_exprs <- reshape2::melt(as.matrix(cds_exprs))
  } else {
    cds_exprs <- reshape2::melt(as.matrix(cds_exprs))
  }

  colnames(cds_exprs) <- c("f_id", "Cell", "expression")
  cds_exprs$expression[cds_exprs$expression < min_expr] <- min_expr


  cds_exprs <- merge(cds_exprs, rowData(cds_subset), by.x = "f_id",
                     by.y = "row.names")
  cds_exprs <- merge(cds_exprs, colData(cds_subset), by.x = "Cell",
                     by.y = "row.names")

  if (label_by_short_name) {
    if (!is.null(cds_exprs$gene_short_name)) {
      cds_exprs$feature_label <- cds_exprs$gene_short_name
      cds_exprs$feature_label[is.na(cds_exprs$feature_label)] <- cds_exprs$f_id
    } else {
      cds_exprs$feature_label <- cds_exprs$f_id
    }
  } else {
    cds_exprs$feature_label <- cds_exprs$f_id
  }

  if (!is.null(panel_order)) {
    cds_exprs$feature_label = factor(cds_exprs$feature_label,
                                     levels = panel_order)
  }

  cds_exprs[,group_cells_by] <- as.factor(cds_exprs[,group_cells_by])

  #
  # For log-scaled plots, Drop cells with zero counts.
  if(log_scale) {
    cds_exprs <- cds_exprs[cds_exprs[['expression']] > 0,]
  }

  q <- ggplot(data=cds_exprs, aes(x = .data[[group_cells_by]], y = .data[['expression']])) +
       monocle_theme_opts()
  q <- q + facet_wrap(~feature_label, nrow = nrow, ncol = ncol, scales = "free_y")

  q <- q + ggdist::stat_histinterval(mapping=aes(x=.data[[group_cells_by]], y=.data[['expression']]),
                                     breaks=ggdist::breaks_fixed(width=.05),
                                     linewidth=20,
                                     color='blue',
                                     alpha=0.1,
                                     size=1,
                                     normalize='groups',
                                     point_interval = 'median_qi',
                                     point_alpha=1.0,
                                     outline_bars=TRUE,
                                     slab_color='black',
                                     slab_linewidth=0.1,
                                     slab_fill='green',
                                     slab_alpha=0.3)

  q <- q + ggforce::geom_sina(maxwidth = .6,
                              scale = "count",
                              size = 0.2,
                              alpha = 0.3,
                              seed = 0,
                              color='red',
                              position='dodge')

  if (min_expr < 1) {
    q <- q + expand_limits(y = c(min_expr, 1))
  }

  q <- q + ylab("Expression") + xlab(group_cells_by)

  if (log_scale){
    q <- q + scale_y_log10()
  }
  q
}


#' Plots the number of cells expressing one or more genes above a given value
#' as a barplot
#'
#'  @description Accepts a subset cell_data_set and the parameter
#'  \code{group_cells_by}, used for dividing cells into groups. Returns one or
#'  more bar graphs (one graph for each gene in the cell_data_set). Each graph
#'  shows the percentage (or number) of cells that express a gene in each
#'  sub-group in the cell_data_set.
#'
#' @param cds_subset Subset cell_data_set to be plotted.
#' @param group_cells_by the cell attribute (e.g. the column of colData(cds))
#'   to group cells by on the horizontal axis. If NULL, all cells plotted as
#'   one group.
#' @param min_expr the minimum (untransformed) expression level to consider the
#'   gene 'expressed'. Default is 0.
#' @param nrow the number of panels per row in the figure.
#' @param ncol the number of panels per column in the figure.
#' @param panel_order the order in which genes should be laid out
#'   (left-to-right, top-to-bottom). Should be gene_short_name if
#'   \code{label_by_short_name = TRUE} or feature ID if
#'   \code{label_by_short_name = FALSE}.
#' @param plot_as_count Logical, whether to plot as a count of cells rather
#'   than a percent. Default is FALSE.
#' @param label_by_short_name label figure panels by gene_short_name (TRUE) or
#'   feature id (FALSE). Default is TRUE.
#' @param normalize Logical, whether or not to normalize expression by size
#'   factor. Default is TRUE.
#' @param plot_limits A pair of number specifying the limits of the y axis. If
#'   \code{NULL}, scale to the range of the data. Example \code{c(0,100)}.
#' @param bootstrap_samples The number of bootstrap replicates to generate when
#'   plotting error bars. Default is 100.
#' @param conf_int_alpha The size of the confidence interval to use when plotting
#'   error bars. Default is 0.95.
#' @return a ggplot2 plot object
#' @import ggplot2
#'
#' @examples
#'   \donttest{
#'     cds <- load_a549()
#'     cds_subset <- cds[row.names(subset(rowData(cds),
#'                                       gene_short_name %in% c("NDRG4", "HBG2"))),]
#'     plot_percent_cells_positive(cds_subset, group_cells_by="culture_plate")
#'   }
#'
#' @export
plot_percent_cells_positive <- function(cds_subset,
                                        group_cells_by = NULL,
                                        min_expr = 0,
                                        nrow = NULL,
                                        ncol = 1,
                                        panel_order = NULL,
                                        plot_as_count = FALSE,
                                        label_by_short_name=TRUE,
                                        normalize = TRUE,
                                        plot_limits = NULL,
                                        bootstrap_samples=100,
                                        conf_int_alpha = .95){
  splits <- summary_stats <- target <- NULL # no visible binding
  target_fraction <- target_fraction_low <- target_fraction_high <- NULL # no visible binding
  assertthat::assert_that(methods::is(cds_subset, "cell_data_set"))

  if(!is.null(group_cells_by)) {
    assertthat::assert_that(group_cells_by %in% names(colData(cds_subset)),
                            msg = paste("group_cells_by must be a column in",
                                        "the colData table"))
  }
  assertthat::assert_that(assertthat::is.number(min_expr))

  if(!is.null(nrow)) {
    assertthat::assert_that(assertthat::is.count(nrow))
  }

  assertthat::assert_that(assertthat::is.count(ncol))
  assertthat::assert_that(is.logical(plot_as_count))
  assertthat::assert_that(is.logical(label_by_short_name))
  assertthat::assert_that(is.logical(normalize))

  assertthat::assert_that(nrow(rowData(cds_subset)) <= 100,
                          msg = paste("cds_subset has more than 100 genes -",
                                      "pass only the subset of the CDS to be",
                                      "plotted."))

  marker_exprs <- SingleCellExperiment::counts(cds_subset)   # plot_percent_cells_positive()

  if (normalize) {
    marker_exprs <- Matrix::t(Matrix::t(marker_exprs)/size_factors(cds_subset))
    marker_exprs_melted <- reshape2::melt(round(as.matrix(marker_exprs), digits=4))
  } else {
    marker_exprs_melted <- reshape2::melt(as.matrix(marker_exprs))
  }

  colnames(marker_exprs_melted) <- c("f_id", "Cell", "expression")

  marker_exprs_melted <- merge(marker_exprs_melted, colData(cds_subset),
                               by.x="Cell", by.y="row.names")
  marker_exprs_melted <- merge(marker_exprs_melted, rowData(cds_subset),
                               by.x="f_id", by.y="row.names")

  if (label_by_short_name) {
    if (!is.null(marker_exprs_melted$gene_short_name)){
      marker_exprs_melted$feature_label <- marker_exprs_melted$gene_short_name
      marker_exprs_melted$feature_label[
        is.na(marker_exprs_melted$feature_label)] <- marker_exprs_melted$f_id
    } else {
      marker_exprs_melted$feature_label <- marker_exprs_melted$f_id
    }
  } else {
    marker_exprs_melted$feature_label <- marker_exprs_melted$f_id
  }

  if (!is.null(panel_order)) {
    marker_exprs_melted$feature_label <-
      factor(marker_exprs_melted$feature_label, levels=panel_order)
  }

  if(is.null(group_cells_by)) {
    marker_exprs_melted$all_cell <- "All"
    group_cells_by <- "all_cell"
  }

  # marker_counts <-
  #   plyr::ddply(marker_exprs_melted,
  #               c("feature_label", group_cells_by),
  #               function(x) {
  #                 data.frame(target = sum(x$expression > min_expr),
  #                            target_fraction = sum(x$expression >
  #                                                    min_expr)/nrow(x))
  #               })

  marker_counts_bootstrap = rsample::bootstraps(marker_exprs_melted, times = bootstrap_samples)

  group_mean_bootstrap <- function(split) {
    rsample::analysis(split) %>%
      dplyr::group_by(!!as.name("feature_label"), !!as.name(group_cells_by)) %>%
      dplyr::summarize(target = sum(expression > min_expr),
                       target_fraction = sum(expression > min_expr)/dplyr::n())
  }

  marker_counts <-
    marker_counts_bootstrap %>%
    dplyr::mutate(summary_stats = purrr::map(splits, group_mean_bootstrap)) %>%
    tidyr::unnest(summary_stats)
  marker_counts <- marker_counts %>% dplyr::ungroup() %>%
    dplyr::group_by(!!as.name("feature_label"), !!as.name(group_cells_by)) %>%
    dplyr::summarize(target_mean = mean(target),
                     target_fraction_mean = mean(target_fraction),
                     target_low = stats::quantile(target, conf_int_alpha / 2),
                     target_high = stats::quantile(target, 1 - conf_int_alpha / 2),
                     target_fraction_low = stats::quantile(target_fraction, (1 - conf_int_alpha) / 2),
                     target_fraction_high = stats::quantile(target_fraction, 1 - (1 - conf_int_alpha) / 2))


  # marker_counts <-
  #   marker_exprs_melted %>% dplyr::group_by(!!as.name("feature_label"), !!as.name(group_cells_by)) %>%
  #   dplyr::summarize(target = sum(expression > min_expr),
  #             target_fraction = sum(expression > min_expr)/dplyr::n())

  if (!plot_as_count){
    marker_counts$target_fraction_mean <- marker_counts$target_fraction_mean * 100
    marker_counts$target_fraction_low <- marker_counts$target_fraction_low * 100
    marker_counts$target_fraction_high <- marker_counts$target_fraction_high * 100
    qp <- ggplot(aes_string(x=group_cells_by, y="target_fraction_mean",
                            fill=group_cells_by),
                 data=marker_counts) +
      ylab("Cells (percent)")
  } else {
    qp <- ggplot(aes_string(x=group_cells_by, y="target_mean", fill=group_cells_by),
                 data=marker_counts) +
      ylab("Cells")
  }
  if (group_cells_by == "all_cell") {
    group_cells_by <- ""
    qp <- qp + theme(legend.title = element_blank())
  }
  qp <- qp + xlab(group_cells_by)

  if (is.null(plot_limits) == FALSE) {
    qp <- qp + scale_y_continuous(limits=plot_limits)
  }

  qp <- qp + facet_wrap(~feature_label, nrow=nrow, ncol=ncol, scales="free_y")
  qp <-  qp +
    geom_bar(stat="identity") +
    geom_linerange(aes(ymin=target_fraction_low, ymax=target_fraction_high))+
    monocle_theme_opts()

  return(qp)
}

#' Create a dot plot to visualize the mean gene expression and percentage of
#' expressed cells in each group of cells
#'
#' @param cds A cell_data_set for plotting.
#' @param markers A list of gene ids (or short names) to show in the plot
#' @param group_cells_by How to group cells when labeling them. Must be either
#'   the name of a column of colData(cds), or one of "clusters" or "partitions".
#'   If a column in colData(cds), must be a categorical variable.
#' @param reduction_method The dimensionality reduction method used for clusters
#'   and partitions.
#' @param norm_method Determines how to transform expression values prior to
#'   plotting. Options are "log" and "size_only". Default is "log".
#' @param lower_threshold The lowest gene expressed treated as expressed. By
#'   default, zero.
#' @param max.size The maximum size of the dot. By default, it is 10.
#' @param ordering_type How to order the genes / groups on the dot plot. Only
#'   accepts 'cluster_row_col' (use biclustering to cluster the rows and
#'   columns), 'maximal_on_diag' (position each column so that the maximal color
#'   shown on each column on the diagonal, if the current maximal is used in
#'   earlier columns, the next largest one is position), and 'none' (preserve
#'   the ordering from the input gene or alphabetical ordering of groups).
#'   Default is 'cluster_row_col'.
#' @param axis_order Whether to put groups on x-axis, genes on y-axis (option
#'   'group_marker') or the reverse order (option 'marker_group'). Default is
#'   "group_marker".
#' @param flip_percentage_mean Logical indicating whether to use color of the
#'   dot to represent the percentage (by setting flip_percentage_mean = FALSE,
#'   default) and size of the dot the mean expression, or the opposite (by
#'   setting flip_percentage_mean = TRUE).
#' @param pseudocount A pseudo-count added to the average gene expression.
#' @param scale_max The maximum value (in standard deviations) to show in the
#'   heatmap. Values larger than this are set to the max.
#' @param scale_min The minimum value (in standard deviations) to show in the
#'   heatmap. Values smaller than this are set to the min.
#' @param color_by_group Color cells by the group to which they belong.
#'
#' @return a ggplot2 plot object
#' @import ggplot2
#' @export
plot_genes_by_group <- function(cds,
                                markers,
                                group_cells_by="cluster",
                                reduction_method = "UMAP",
                                norm_method = c("log", "size_only"),
                                lower_threshold = 0,
                                max.size = 10,
                                # maybe be also do the maximum color on the
                                # diagonal; the axis change be switched too
                                ordering_type = c('cluster_row_col',
                                                  'maximal_on_diag',
                                                  'none'),
                                axis_order = c('group_marker', 'marker_group'),
                                flip_percentage_mean = FALSE,
                                pseudocount = 1,
                                scale_max = 3,
                                scale_min = -3,
                                color_by_group=FALSE) {
  rowname <- gene_short_name <- Group <- Gene <- Expression <- percentage <- NULL # no visible binding
  assertthat::assert_that(methods::is(cds, "cell_data_set"))

  if(!is.null(group_cells_by)) {
    assertthat::assert_that(group_cells_by %in% c("cluster", "partition") |
                              group_cells_by %in% names(colData(cds)),
                            msg = paste("group_cells_by must be a column in",
                                        "the colData table."))
  }
  assertthat::assert_that("gene_short_name" %in% names(rowData(cds)), msg =
                            paste("This function requires a gene_short_name",
                                  "column in your rowData. If you do not have",
                                  "gene symbols, you can use other gene ids",
                                  "by running",
                                  "rowData(cds)$gene_short_name <- row.names(rowData(cds))"))

  assertthat::assert_that(
    tryCatch(expr = ifelse(match.arg(norm_method) == "",TRUE, TRUE),
             error = function(e) FALSE),
    msg = "method must be one of 'log' or 'size_only'")
  norm_method = match.arg(norm_method)

  assertthat::assert_that(
    tryCatch(expr = ifelse(match.arg(ordering_type) == "",TRUE, TRUE),
             error = function(e) FALSE),
    msg = "method must be one of 'cluster_row_col', 'maximal_on_diag', or 'none'")
  ordering_type <- match.arg(ordering_type)

  assertthat::assert_that(
    tryCatch(expr = ifelse(match.arg(axis_order) == "",TRUE, TRUE),
             error = function(e) FALSE),
    msg = "method must be one of 'group_marker' or 'marker_group'")
  axis_order <- match.arg(axis_order)

  gene_ids = as.data.frame(fData(cds)) %>%
    tibble::rownames_to_column() %>%
    dplyr::filter(rowname %in% markers | gene_short_name %in% markers) %>%
    dplyr::pull(rowname)
  if(length(gene_ids) < 1)
    stop('Please make sure markers are included in the gene_short_name ",
               "column of the rowData!')

  if(flip_percentage_mean == FALSE){
    major_axis <- 1
    minor_axis <- 2
  } else if (flip_percentage_mean == TRUE){
    major_axis <- 2
    minor_axis <- 1
  }

  exprs_mat <- t(as.matrix(normalized_counts(cds=cds, norm_method=norm_method)[gene_ids, ]))
  exprs_mat <- reshape2::melt(exprs_mat)
  colnames(exprs_mat) <- c('Cell', 'Gene', 'Expression')
  exprs_mat$Gene <- as.character(exprs_mat$Gene)


  if (group_cells_by == "cluster"){
    cell_group <- tryCatch({clusters(cds,
                                     reduction_method = reduction_method)},
                           error = function(e) {NULL})
  } else if (group_cells_by == "partition") {
    cell_group <- tryCatch({partitions(cds,
                                       reduction_method = reduction_method)},
                           error = function(e) {NULL})
  } else{
    cell_group <- colData(cds)[,group_cells_by]
  }

  if (length(unique(cell_group)) < 2) {
    stop("Only one type in group_cells_by. To use plot_genes_by_group, ",
               "please specify a group with more than one type. ")
  }

  names(cell_group) = colnames(cds)

  exprs_mat$Group <- cell_group[exprs_mat$Cell]
  exprs_mat = exprs_mat %>% dplyr::filter(is.na(Group) == FALSE)
  ExpVal <- exprs_mat %>% dplyr::group_by(Group, Gene) %>%
    dplyr::summarize(mean = mean(log(Expression + pseudocount)),
                     percentage = sum(Expression > lower_threshold) /
                       length(Expression))
  ExpVal$mean <- ifelse(ExpVal$mean < scale_min, scale_min, ExpVal$mean)
  ExpVal$mean <- ifelse(ExpVal$mean > scale_max, scale_max, ExpVal$mean)

  ExpVal$Gene <- fData(cds)[ExpVal$Gene, 'gene_short_name']

  res <- reshape2::dcast(ExpVal[, 1:4], Group ~ Gene,
                         value.var = colnames(ExpVal)[2 + major_axis])
  group_id <- res[, 1]
  res <- res[, -1]
  row.names(res) <- group_id

  if(major_axis == 1){
    ExpVal = ExpVal %>% dplyr::group_by(Gene) %>% dplyr::mutate(max_value = max(mean),
                                                         is_max = max_value == mean) %>% dplyr::ungroup()
  }
  else{
    ExpVal = ExpVal %>% dplyr::group_by(Gene) %>% dplyr::mutate(max_value = max(percentage),
                                                         is_max = max_value == percentage) %>% dplyr::ungroup()
  }

  ExpVal = ExpVal %>% dplyr::mutate(group_color_class = ifelse(is_max, Group, NA_character_))

  if(ordering_type == 'cluster_row_col') {
    row_dist <- stats::as.dist((1 - stats::cor(t(res)))/2)
    row_dist[is.na(row_dist)] <- 1

    col_dist <- stats::as.dist((1 - stats::cor(res))/2)
    col_dist[is.na(col_dist)] <- 1

    ph <- pheatmap::pheatmap(res,
                             useRaster = TRUE,
                             cluster_cols=TRUE,
                             cluster_rows=TRUE,
                             show_rownames=FALSE,
                             show_colnames=FALSE,
                             clustering_distance_cols=col_dist,
                             clustering_distance_rows=row_dist,
                             clustering_method = 'ward.D2',
                             silent=TRUE,
                             filename=NA)


    ExpVal$Gene <- factor(ExpVal$Gene,
                          levels = colnames(res)[ph$tree_col$order])
    ExpVal$Group <- factor(ExpVal$Group,
                           levels = row.names(res)[ph$tree_row$order])

  } else if(ordering_type == 'maximal_on_diag'){
    group_ordering_df = ExpVal %>% dplyr::filter(is_max) %>% dplyr::group_by(Group) %>% dplyr::summarize(num_genes = n())
    ExpVal = dplyr::left_join(ExpVal, group_ordering_df, by=c("Group")) %>% dplyr::arrange(dplyr::desc(num_genes), dplyr::desc(max_value))

    ExpVal$Group <- factor(ExpVal$Group,
                           levels = ExpVal %>% dplyr::select(Group) %>% dplyr::distinct() %>% dplyr::pull(Group))

    gene_ordering = ExpVal %>% dplyr::filter(is_max) %>% dplyr::group_by(Gene) %>% dplyr::slice_head(n=1) %>% dplyr::arrange(Group, dplyr::desc(max_value)) %>% dplyr::pull(Gene)
    ExpVal$Gene <- factor(ExpVal$Gene,
                          levels = gene_ordering)

  } else if(ordering_type == 'none'){
    ExpVal$Gene <- factor(ExpVal$Gene, levels = markers)
  }

  if(flip_percentage_mean){
    if (color_by_group){
      g <- ggplot(ExpVal, aes(y = Gene,  x = Group)) +
        geom_point(aes(colour = group_color_class,  size = mean)) +
        #viridis::scale_color_viridis(name = 'percentage') +
        scale_size(name = paste0('log(mean + ', pseudocount, ')'), range = c(0, max.size))
    }else{
      g <- ggplot(ExpVal, aes(y = Gene,  x = Group)) +
        geom_point(aes(colour = percentage,  size = mean)) +
        viridis::scale_color_viridis(name = 'percentage') +
        scale_size(name = paste0('log(mean + ', pseudocount, ')'), range = c(0, max.size))
    }
  } else {
    if (color_by_group){
      g <- ggplot(ExpVal, aes(y = Gene,  x = Group)) +
        geom_point(aes(colour = group_color_class,  size = percentage)) +
        #viridis::scale_color_viridis(name = paste0('log(mean + ', pseudocount, ')')) +
        scale_size(name = 'percentage', range = c(0, max.size))
    }else{
      g <- ggplot(ExpVal, aes(y = Gene,  x = Group)) +
        geom_point(aes(colour = mean,  size = percentage)) +
        viridis::scale_color_viridis(name = paste0('log(mean + ', pseudocount, ')')) +
        scale_size(name = 'percentage', range = c(0, max.size))
    }
  }

  if (group_cells_by == "cluster"){
    g <- g + xlab("Cluster")
  } else if (group_cells_by == "partition") {
    g <- g + xlab("Partition")
  } else{
    g <- g + xlab(group_cells_by)
  }

  g <- g + ylab("Gene") + monocle_theme_opts() +
    theme(axis.text.x = element_text(angle = 30, hjust = 1))
  if(axis_order == 'marker_group') {
    g <- g + coord_flip()
  }

  g
}

#' Plot a histogram of the number of UMIs per cell with an optional vertical line showing the cutoff for outliers.
#'
#' @description If \code{max_zscore} is specified, then there is both a high and a low cutoff, both of which are determined by Z-scoring the log-transformed UMI counts. Otherwise if \code{min_rna_umi} is specified, then there is only a low cutoff. If neither is specified, then no cutoff is shown.
#'
#' @param cds A cell_data_set for plotting.
#' @param max_zscore Cutoff for the maximum Z-score of the log-transformed UMIs per cell. If NULL, then \code{min_rna_umi} must be specified.
#' @param min_rna_umi Cutoff for the minimum number of UMIs per cell. Ignored if \code{max_zscore} is specified.
#'
#' @returns A ggplot2 object.
#' @import ggplot2
#' @export
plot_umi_per_cell <- function(cds,
                              max_zscore = NULL,
                              min_rna_umi = NULL) {
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  assertthat::assert_that(
    is.null(max_zscore) || is.numeric(max_zscore) && max_zscore > 0,
    msg = "max_zscore must be a positive number."
  )
  assertthat::assert_that(
    is.null(min_rna_umi) || is.numeric(min_rna_umi) && min_rna_umi > 0,
    msg = "min_rna_umi must be a positive number."
  )

  tryCatch(
    assertthat::assert_that("log.n.umi" %in% colnames(colData(cds))),
    error = function(e) {
      colData(cds)$log.n.umi <- log10(colData(cds)$n.umi)
    }
  )

  g <- colData(cds) %>%
    as.data.frame() %>%
    dplyr::select(cell, log.n.umi) %>%
    dplyr::distinct() %>%
    ggplot() +
    geom_histogram(aes(x = log.n.umi),
                   fill = "grey80",
                   color = "black",
                   binwidth = 0.2,
                   linewidth = 0.1,
                   bins = 50) +
    scale_x_continuous(name = "RNA UMIs",
                       breaks = c(0, 1, 2, 3, 4),
                       labels = as.character(c(0, 10, 100, 1000, 10000))) +
    ylab("Number of Cells")

  if (is.null(max_zscore) == FALSE) {
    mean_umi <- mean(colData(cds)$log.n.umi)
    sd_umi <- sd(colData(cds)$log.n.umi)
    g <- g + geom_vline(xintercept = mean_umi - max_zscore * sd_umi,
                        color = "red",
                        linewidth = 0.5) +
      geom_vline(xintercept = mean_umi + max_zscore * sd_umi,
                 color = "red",
                 linewidth = 0.5)
  } else if (is.null(min_rna_umi) == FALSE) {
    g <- g + geom_vline(xintercept = log10(min_rna_umi),
                        color = "red",
                        linewidth = 0.5)
  }

  g <- g + monocle_theme_opts()

  return(g)
}

#' Plot a histogram of the percentage of UMIs mapping to mitochondria for each cell.
#' @param cds A cell_data_set for plotting.
#' @param max_mito_pct Cutoff for the maximum percentage of UMIs mapping to mitochondria.
#' @param pseudocount Value to add to the percentages before taking the log. Note that this is in terms of percent, not fractional.
#' @returns A ggplot2 object.
#' @import ggplot2
#' @export
plot_mito_umi_per_cell <- function(cds,
                                   max_mito_pct = .max_mito_pct,
                                   pseudocount = 1e-3) {
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  assertthat::assert_that(all(c("cell", "perc_mitochondrial_umis") %in% colnames(colData(cds))))

  g <- colData(cds) %>%
    as.data.frame() %>%
    dplyr::select(cell, perc_mitochondrial_umis) %>%
    dplyr::distinct() %>%
    ggplot() +
    geom_histogram(aes(x = log10(perc_mitochondrial_umis + pseudocount)),
                   fill = "grey80",
                   color = "black",
                   binwidth = 0.2,
                   linewidth = 0.1,
                   bins = 50) +
    scale_x_continuous(name = "Percent Mitochondrial UMIs",
                       breaks = c(log10(pseudocount), -2, -1, 0, 1, 2),
                       labels = as.character(c(0, 0.001, 0.1, 1, 10, 100))) +
    ylab("Number of Cells") +
    geom_vline(xintercept = log10(max_mito_pct),
               color = "red",
               linewidth = 0.5) +
    monocle_theme_opts()

  return(g)
}

#' Make a boxplot of the number of UMIs per cell, with each perturbation as a separate box. Optionally facet by a second value.
#' @param cds A cell_data_set for plotting.
#' @param perturbation_col How to group the cells for the plot. Must be a column of the colData.
#' @param facet_by Facet the plot by this column of the colData. Each unique value is its own facet.
#' @param color_palette List of colors to color perturbation groups. Default is NULL. When NULL, use a default set.
#' @param yticks List of numeric values to put on the y-axis ticks.
#' @returns A ggplot2 object.
#' @import ggplot2
#' @export
plot_umi_per_cell_and_perturbation <- function(cds,
                                               perturbation_col = "perturbation",
                                               facet_by = NULL,
                                               color_palette = NULL,
                                               yticks = NULL) {
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  assertthat::assert_that(perturbation_col %in% colnames(colData(cds)),
                          msg = "perturbation_col is not in the colData of the CDS.")
  assertthat::assert_that(is.null(facet_by) || (facet_by %in% colnames(colData(cds))),
                          msg = "facet_by is not in the colData of the CDS.")
  assertthat::assert_that(is.null(yticks) || is.numeric(yticks),
                          msg = "yticks are not numeric.")

  coldata_df <- colData(cds) %>%
    as.data.frame() %>%
    dplyr::filter(!is.na(perturbation_col))

  if (is.null(color_palette)) {
    N <- length(unique(coldata_df[[perturbation_col]]))
    color_palette <- get_n_colors(N)
  }

  if (is.null(yticks)) {
    yticks <- c(100, 250, 500, 1000, 2500, 5000, 10000)
  }

  g <- coldata_df %>%
    ggplot() +
    geom_boxplot(aes(x = reorder(!!sym(perturbation_col), log.n.umi),
                     y = log.n.umi,
                     fill = !!sym(perturbation_col)),
                 color = "black",
                 outlier.stroke = 0.1,
                 outlier.size = 0.1,
                 size = 0.2) +
    scale_fill_manual(values = color_palette, name = stringr::str_to_title(perturbation_col)) +
    scale_y_continuous(breaks = log10(yticks),
                       labels = as.character(yticks))

  if (!is.null(facet_by)) {
    g <- g +
      facet_grid(cols = vars(!!sym(facet_by))) +
      ggtitle(stringr::str_to_title(facet_by))
  }

  g <- g +
    ylab("UMIs per Cell") +
    monocle_theme_opts() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          plot.title = element_text(hjust = 0.5))

  return(g)
}

#' Make a boxplot of the number of cells per sample, with each perturbation as a separate box. Optionally facet by a second value.
#' @param cds A cell_data_set for plotting.
#' @param perturbation_col How to group the cells for the plot. Must be a column of the colData.
#' @param count_per_sample_col Which column of the colData to put in the boxplots.
#' @param zscore logical value indicating whether to z-score counts before plotting. If \code{TRUE}, then the values in \code{count_per_sample_col} are grouped by the \code{perturbation_col} and z-scored.
#' @param cutoff The minimum number of cells per sample, drawn as a horizontal line. If \code{zscore == TRUE} then this corresponds to the absolute value of the z-score cutoff, and two horizontal lines are drawn.
#' @param facet_by Facet the plot by this column of the colData. Each unique value is its own facet.
#' @param color_palette List of colors to color perturbation groups. Default is NULL. When NULL, use a default set.
#' @param yticks List of numeric values to put on the y-axis ticks.
#' @returns A ggplot2 object.
#' @import ggplot2
#' @export
plot_cells_per_sample_and_perturbation <- function(cds,
                                                   perturbation_col = "perturbation",
                                                   count_per_sample_col = "count_per_embryo",
                                                   zscore = FALSE,
                                                   cutoff = NULL,
                                                   facet_by = NULL,
                                                   color_palette = NULL,
                                                   yticks = NULL) {
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  assertthat::assert_that(all(c(perturbation_col, count_per_sample_col) %in% colnames(colData(cds))))
  assertthat::assert_that(is.null(facet_by) || (facet_by %in% colnames(colData(cds))),
                          msg = "facet_by is not in the colData of the CDS.")
  assertthat::assert_that(is.logical(zscore),
                          msg = "zscore must be a logical value.")
  assertthat::assert_that(is.null(cutoff) || is.numeric(cutoff) && cutoff > 0,
                          msg = "cutoff must be a positive numeric value.")
  assertthat::assert_that(is.null(yticks) || is.numeric(yticks),
                          msg = "yticks are not numeric.")

  counts_per_sample_df <- colData(cds) %>%
    as.data.frame() %>%
    dplyr::select(count_per_sample_col, perturbation_col, facet_by) %>%
    dplyr::filter(!is.na(perturbation_col))

  if (zscore) {
    counts_per_sample_df <- counts_per_sample_df %>%
      group_by(!!sym(perturbation_col)) %>%
      mutate(
        !!sym(count_per_sample_col) := as.vector(scale(!!sym(count_per_sample_col)))
      )
    ylabel <- "Z-scored Cells per Sample"
    if (is.null(yticks)) {
      yticks <- seq(-3, 3)
      yticklabels <- as.character(yticks)
    }
  } else {
    counts_per_sample_df <- counts_per_sample_df %>%
      mutate(
        !!sym(count_per_sample_col) := log10(!!sym(count_per_sample_col))
      )
    ylabel <- "Cells per Sample"
    if (is.null(cutoff) == FALSE) {
      cutoff <- log10(cutoff)
    }
    if (is.null(yticks)) {
      yticks <- c(100, 500, 1000, 2500, 5000, 10000, 20000, 40000)
      yticklabels <- as.character(yticks)
      yticks <- log10(yticks)
    }
  }

  if (is.null(color_palette)) {
    N <- length(unique(counts_per_sample_df[[perturbation_col]]))
    color_palette <- get_n_colors(N)
  }

  g <- counts_per_sample_df %>%
    ggplot() +
    geom_boxplot(aes(x = reorder(!!sym(perturbation_col), !!sym(count_per_sample_col)),
                     y = !!sym(count_per_sample_col),
                     # y = log10(!!sym(count_per_sample_col)),
                     fill = !!sym(perturbation_col)),
                 color = "black",
                 outlier.stroke = 0.1,
                 outlier.size = 0.1,
                 size = 0.2) +
    scale_fill_manual(values = color_palette, name = stringr::str_to_title(perturbation_col)) +
    scale_y_continuous(breaks = yticks,
                       labels = yticklabels)

  if (is.null(cutoff) == FALSE) {
    g <- g +
      geom_hline(yintercept = cutoff,
                 color = "red",
                 linewidth = 0.5)
    if (zscore) {
      g <- g +
        geom_hline(yintercept = -cutoff,
                   color = "red",
                   linewidth = 0.5)
    }
  }

  if (!is.null(facet_by)) {
    g <- g +
      facet_grid(cols = vars(!!sym(facet_by))) +
      ggtitle(stringr::str_to_title(facet_by))
  }
  g <- g +
    ylab(ylabel) +
    monocle_theme_opts() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          plot.title = element_text(hjust = 0.5))

  return(g)
}

#' Get a list of color hex codes from a custom palette.
#' @noRd
get_n_colors <- function(n) {
  vibrant_colors <- c('#EE7733',
                      '#0077BB',
                      '#228833',
                      '#33BBEE',
                      '#EE3377',
                      '#CC3311',
                      '#AA3377',
                      '#009988',
                      '#004488',
                      '#DDAA33',
                      '#99CC66',
                      '#D590DD')

  bright_colors <- c('#4477AA',
                     '#EE6677',
                     '#228833',
                     '#CCBB44',
                     '#66CCEE',
                     '#AA3377',
                     '#BBBBBB')

  palette <- colorRampPalette(c(vibrant_colors, bright_colors))
  return(palette(n))
}
cole-trapnell-lab/monocle3 documentation built on June 11, 2025, 11:22 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
cole-trapnell-lab/monocle3
Clustering, Differential Expression, and Trajectory Analysis for Single-Cell RNA-Seq

R/plotting.R
In cole-trapnell-lab/monocle3: Clustering, Differential Expression, and Trajectory Analysis for Single-Cell RNA-Seq

Defines functions plot_cells_3d monocle_theme_opts

Documented in plot_cells_3d

R Package Documentation

Browse R Packages

We want your feedback!

cole-trapnell-lab/monocle3 Clustering, Differential Expression, and Trajectory Analysis for Single-Cell RNA-Seq

R/plotting.R In cole-trapnell-lab/monocle3: Clustering, Differential Expression, and Trajectory Analysis for Single-Cell RNA-Seq

Defines functions plot_cells_3d monocle_theme_opts

Documented in plot_cells_3d

R Package Documentation

Browse R Packages

We want your feedback!

cole-trapnell-lab/monocle3
Clustering, Differential Expression, and Trajectory Analysis for Single-Cell RNA-Seq

R/plotting.R
In cole-trapnell-lab/monocle3: Clustering, Differential Expression, and Trajectory Analysis for Single-Cell RNA-Seq