R/cumulative_fraction_reads_per_cell.R

Defines functions cumfrac_rpc plot_cumfrac_rpc

Documented in cumfrac_rpc plot_cumfrac_rpc

#' Cumulative fraction of reads per cell barcode
#'
#' Calculate cumulative fraction of reads per detected cell barcode. The
#' cumulative fraction can be plotted to estimate the number of sequenced cells
#' by identifying the "knee" of the distribution. For further details on
#' selection of cells, see the Drop-seq alignment cookbook provided by the
#' \href{http://mccarrolllab.com/dropseq/}{McCarroll lab}. This function is
#' implemented in \link[dropseqr]{plot_cumfrac_rpc} and is only required when
#' an alternative plot or analysis of the cumulative fraction is needed.
#'
#' @param read_counts A data.frame containing the number of reads per cell
#' barcode. Should consist of two columns, whereas the number of reads must in
#' the first and the barcode identity in the second column. This is typically
#' produced by running BAMTagHistogram from Drop-seq tools.
#'
#' @return A data.frame containing the original data in read_counts with an
#' added column containing the cumulative fraction.
#'
#' @examples
#' library(dropseqr)
#'
#' # load example dataset
#' data(reads_per_cell_barcode)
#'
#' # calculate cumulative fraction
#' cf <- cumfrac_rpc(reads_per_cell_barcode)
#'
#' @export
cumfrac_rpc <- function(read_counts){

  # abort if read_counts is not a data.frame
  if (class(read_counts) != "data.frame"){

    stop("read_counts not of class data.frame!")

  }

  # set colnames for read_counts
  colnames(read_counts)[1:2] <- c("reads", "cell_barcode")

  # make sure that read_counts is ordered correctly
  read_counts <- read_counts[order(read_counts[, 1], decreasing = TRUE), ]

  # calculate cumulative sum
  reads_cumsum <- cumsum(read_counts[, 1])

  # calculate cumulative fraction
  cumfrac <- reads_cumsum / max(reads_cumsum)

  # add to read_counts
  cbind(read_counts, cumfrac)

}

#' Plot cumulative fraction of reads per cell barcode
#'
#' Calculate and plot cumulative fraction of reads per detected cell barcode.
#' The cumulative fraction can be plotted to estimate the number of sequenced
#' cells by identifying the "knee" of the distribution. For further details on
#' selection of cells, see the Drop-seq alignment cookbook provided by the
#' \href{http://mccarrolllab.com/dropseq/}{McCarroll lab}.
#'
#' @param read_counts A data.frame containing the number of reads per cell
#' barcode. Should consist of two columns, whereas the number of reads must in
#' the first and the barcode identity in the second column. This is typically
#' produced by running BAMTagHistogram from Drop-seq tools.
#'
#' @param nbcs Number of cell barcodes to be plotted on the x-axis.
#'
#' @param title String containing plot title.
#'
#' @param size Size used for plotting the cumulative fraction and optional
#' ncell line.
#'
#' @param cumfrac_col Color for plotting the cumulative fraction line
#'
#' @param ncells (optional) Estimate of how many cells are inculed in sample.
#' A vertical line will be drawn at x = ncells.
#'
#' @param ncells_col Color for plotting the vertical line if ncells is
#' specified.
#'
#' @examples
#' library(dropseqr)
#'
#' # load example dataset
#' data(reads_per_cell_barcode)
#'
#' # cumulative fraction with default settings
#' plot_cumfrac_rpc(reads_per_cell_barcode, title = "Experiment 42")
#'
#' # zoom in and draw line at estimated number of cells
#' plot_cumfrac_rpc(reads_per_cell_barcode, title = "Experiment 42",
#'               nbcs = 30000, ncells = 3500)
#'
#' @export
plot_cumfrac_rpc <- function(read_counts, nbcs = 50000, title = NULL, size = 1,
                          cumfrac_col = "steelblue", ncells = NULL,
                          ncells_col = "red"){

  # calculate cumulative fraction
  cumfrac <- cumfrac_rpc(read_counts)

  # only retain nbcs number of barcodes
  cumfrac <- cumfrac[1:nbcs, ]

  # add index used for plotting
  cumfrac$index <- 1:nrow(cumfrac)

  # plot cumulative fraction vs cell barcodes
  if (is.null(ncells)){

    ggplot2::ggplot(cumfrac, ggplot2::aes_string(x = "index", y = "cumfrac")) +
      ggplot2::geom_line(color = cumfrac_col, size = size) +
      ggplot2::scale_x_continuous(name = "Cell barcodes sorted by number of reads [descending]") +
      ggplot2::scale_y_continuous(name = "Cumulative fraction of reads",
                                  limits = c(0, 1),
                                  breaks = c(0, 0.25, 0.5, 0.75, 1)) +
      ggplot2::ggtitle(title)

  }else{

    ggplot2::ggplot(cumfrac, ggplot2::aes_string(x = "index", y = "cumfrac")) +
      ggplot2::geom_line(color = cumfrac_col, size = size) +
      ggplot2::scale_x_continuous(name = "Cell barcodes sorted by number of reads [descending]") +
      ggplot2::scale_y_continuous(name = "Cumulative fraction of reads",
                                  limits = c(0, 1),
                                  breaks = c(0, 0.25, 0.5, 0.75, 1)) +
      ggplot2::ggtitle(title) +
      ggplot2::geom_vline(xintercept = ncells, linetype = "dashed",
                          size = size, color = ncells_col)

  }

}
argschwind/dropseqr documentation built on May 23, 2019, 4:24 p.m.