R/plot_functions.R

Defines functions plot_length_dist plot_Shannon_index plot_treemap plot_N50_N75 plot_counts plot_exon_dist jellyfish_plot get_colors

#' HCL Colors Wrapper
#'
#' Generates a set number of colors, for use by plot functions such as
#' jellyfish_plot which can re-use colors for different subsets of the data.
#'
#' @param n Number of colors to generate
#' @return n colors, generated by hcl, from R base's grDevices package.
#' @references
#' [1] R Core Team (2017). R: A language and environment for statistical
#' computing. R Foundation for Statistical Computing, Vienna, Austria.
#' URL https://www.R-project.org/.
#' # TODO: citations for other packages involved elsewhere
get_colors <- function(n) {
  grDevices::hcl(h = (seq(15, 375, length = n)), c = 100, l = 65)
}

#' Jellyfish Plot
#'
#' Generates a plot showing the log cumulative isoform abundance
#' distribution for one or more genes. Normalized read count is along the
#' x-axis, and log cumulative isoform count is along the y-axis. Each gene is
#' represented by one line, allowing for comparison between the skewness of
#' relative abundance of isoforms between multiple genes.
#'
#' @param database A compiled Database object.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param dot_size Size of points to plot. Default is to scale this value by
#' the number of genes being plotted, but can be manually set for appearance.
#' @param stabilize_colors Logical. Set to TRUE if you would like to plot a
#' subset of genes, and have the colors match the colors used when all genes
#' are plotted. Default is FALSE, and colors are generated according to the
#' number of genes currently being plotted.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param insert_title String to customize the title of the plot.
#' @return Jellyfish plot (ggplot object).
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1"), Name = c("Gene1"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' jellyfish <- jellyfish_plot(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
jellyfish_plot <- function(database,
                           genes_to_include = unique(database$GeneDB$Name),
                           dot_size = -1, stabilize_colors = F, use_ORFs = F,
                           insert_title = NULL) {

  if (!requireNamespace("scales", quietly = T)) {
    stop("Error: missing the scales package.")
    return(NULL)
  }
  if (use_ORFs) {
    db <- database$OrfDB
    if (is.null(insert_title)) insert_title <- "ORF Jellyfish Plot"
  } else {
    db <- database$TranscriptDB
    if (is.null(insert_title)) insert_title <- "Isoform Jellyfish Plot"
  }
  # recommended dot_size is 1.5 for a few genes, default 0.75 for many genes
  if (dot_size == -1) {
    dot_size <- ifelse(length(genes_to_include) >= 6, 0.75, 1.5)
  }
  genes_sorted <- sort(unique(db$Gene))
  all_colors <- get_colors(length(genes_sorted))

  # for overriding R's color recalculation by # of genes
  included_colors <- all_colors[sapply(genes_to_include,
                                       function(x) {which(genes_sorted == x)})]

  # reformat as lists, add 0 (one isoform exists until 0% cutoff)
  data_reformat <- lapply(genes_to_include, function(name) {
    c(0,sort(db$CumPercAbundance[db$Gene == name]))
  })
  data_reformat <- lapply(data_reformat, function(x) {
    cbind(Counts = seq_along(x), Percents = x)
  })
  # get number of isoforms for each gene
  rowcounts <- sapply(data_reformat, nrow)
  data_reformat <- as.data.frame(do.call("rbind", data_reformat))
  # add column for gene names in order to group genes in plot
  data_reformat$Genes <- rep(genes_to_include, rowcounts)

  yaxis <- c(2)
  while (yaxis[length(yaxis)] < max(rowcounts)) {
    yaxis <- c(yaxis, yaxis[length(yaxis)] * 4)
  }

  jellyfish <- ggplot(data_reformat,
                      aes_string(x = "Percents", y = "Counts",
                                 colour = "Genes")) +
    scale_y_continuous(trans = scales::log2_trans(),
                       breaks = yaxis + 1,
                       labels = yaxis,
                       limits = c(1,max(yaxis)),
                       expand = c(0, 0)) +
    geom_line(size = 0.4) + geom_point(size = dot_size) +
    labs(title = insert_title) +
    xlab("Percent of Gene's Total Reads") +
    ylab("Log Isoform Count") + theme_classic()

  if (stabilize_colors) {
    jellyfish <- jellyfish + scale_color_manual(values = included_colors)
  }
  return(jellyfish)
}


#' Exon-Abundance Distribution Plots
#'
#' Generates a bar plot showing the abundances of normalized exon
#' counts for one or more genes. If each transcript is represented as the
#' fraction of exons it contains out of the maximum number of exons found in a
#' gene, this plot is merely a histogram of those representations, weighted by
#' the read count for each transcript. Normalized exon percent is along the
#' x-axis, and abundance is along the y-axis. If only one gene name is given, a
#' second plot is generated where the x-axis is not normalized, instead showing
#' the exon count of each transcript individually, and the y-axis is
#' log-transformed.
#'
#' @param database A compiled Database object.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param sum_dist Logical. If TRUE, the result is a histogram-like bar plot,
#' where the x-axis is binned. Otherwise, individual isoforms are plotted
#' as points, and the y-axis is log-transformed (single gene only).
#' @param bin_width The histogram bin width, used only when multiple genes
#' are input and the x-axis is the fraction of total exons per gene.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' # one plot, x-axis normalized
#' plot_exon_dist(DB)
#' # two plots, x-axis not normalized
#' plot_exon_dist(DB, genes_to_include = c("Gene1"))
#' }
#' @seealso Database
#' @export
plot_exon_dist <- function(database,
                           genes_to_include = unique(database$GeneDB$Name),
                           sum_dist = T, bin_width = 0.02,
                           insert_title = "Exon-Abundance Distribution") {
  transcriptDB <- database$TranscriptDB
  if (length(genes_to_include) == 1) {
    # plot just one gene's exon distribution
    gene <- genes_to_include[1]
    db <- transcriptDB[transcriptDB$Gene == gene, ]
    if (insert_title == "Exon-Abundance Distribution") { # default
      insert_title <- paste(insert_title, gene, sep = ": ")
    }

    if (sum_dist) {
      gene_max_exons <- max(database$GeneDB$Exons[database$GeneDB$Name == gene])
      exon_abundances <- sapply(1:gene_max_exons, function(i) {
        sum(db$FL_reads[db$ExonCount == i])
      })
      df <- data.frame(Exons = 1:gene_max_exons, Reads = exon_abundances)
      ggplot(df, aes_string(x = "Exons", y = "Reads")) +
        geom_bar(stat = "identity", fill = "blue", color = "blue4") +
        scale_y_continuous(expand = c(0, 0)) + theme_classic() +
        labs(title = insert_title)
    } else {
      df <- data.frame(xaxis = db$ExonCount, yaxis = db$FL_reads)

      ggplot(df, aes_string(x = "xaxis", y = "yaxis")) +
        geom_jitter(width = 0.15, size = 3, col = grDevices::rgb(0, 0, 1, 0.4)) +
        scale_y_continuous(trans = "log2") +
        xlab("Exons") + ylab("Log Reads per Transcript") +
        labs(title = insert_title) + theme_classic()
    }

  } else {
    db <- transcriptDB[transcriptDB$Gene %in% genes_to_include, ]

    norm_exon_counts <- c()
    for (i in 1:nrow(db)) {
      gene_max <- database$GeneDB$Exons[database$GeneDB$Name == db$Gene[i]]
      norm_count <- db$ExonCount[i] / gene_max
      norm_exon_counts <- c(norm_exon_counts, rep(norm_count, db$FL_reads[i]))
    }

    ggplot(data.frame(Counts = norm_exon_counts), aes_string(x = "Counts")) +
      geom_histogram(binwidth = bin_width, fill = "blue", color = "blue4") +
      scale_y_continuous(expand = c(0, 0)) + theme_classic() +
      xlab("Fraction of Gene's Total Exons") +
      ylab("Reads") +
      labs(title = insert_title)
  }
}

#' Unique Isoforms Barplot
#'
#' Generates a bar plot showing the number of unique isoform
#' transcripts and unique ORFs for each gene.
#'
#' @param database A compiled Database object.
#' @param use_log Logical. If true, y-axis is plotted on a log scale (base 2).
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param use_counts One of both of the strings "Isoforms" and "ORFs",
#' indicating which whould be included in the plot. Default is both and to
#' give a warning if no ORF information is in the database.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1"), Name = c("Gene1"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_isoform_orf_counts(DB)
#' }
#' @seealso Database
#' @export
plot_counts <- function(database, use_log = F,
                        genes_to_include = unique(database$GeneDB$Name),
                        use_counts = c("Isoforms", "ORFs"),
                        insert_title = NULL) {
  # TODO: standardize filtering by gene
  df <- database$GeneDB[database$GeneDB$Name %in% genes_to_include, ]
  if ("Isoforms" %in% use_counts) {
    if ("ORFs" %in% use_counts & "UniqueORFs" %in% colnames(df)) {
      df <- df[order(df$UniqueORFs),]
      df <- df[, c("Name", "Isoforms", "UniqueORFs")]
      cols <- c("blue4", "cornflowerblue")
      if (is.null(insert_title)) {
        insert_title <- "Unique Isoforms & ORFs Per Gene"
        ylabel <- "Counts"
      }
    } else {
      if ("ORFs" %in% use_counts) {
        warning("The Database is missing ORF counts,
              so none will be plotted.")
      }
      df <- df[order(df$Isoforms),]
      df <- df[, c("Name", "Isoforms")]
      cols <- c("blue4")
      insert_title <- "Unique Isoforms Per Gene"
      ylabel <- "Isoforms"
    }
  } else {
    if ("ORFs" %in% use_counts & "UniqueORFs" %in% colnames(df)) {
      df <- df[order(df$UniqueORFs),]
      df <- df[, c("Name", "UniqueORFs")]
      cols <- c("cornflowerblue")
      insert_title <- "Unique ORFs Per Gene"
      ylabel <- "ORFs"
    } else {
      stop("Ensure that the use_counts argument is correctly set,
            and that if ORF coutns are included, the database has an OrfDB.")
      return(NULL)
    }
  }

  melted <- reshape::melt(df, id.vars = 1)
  colnames(melted) <- c("Gene", "Statistic", "Count")
  melted$Statistic <- paste(" ", melted$Statistic) # fix legend spacing

  if (use_log) {
    upper_lim <- max(melted$Count) * 2
  } else {
    upper_lim <- max(melted$Count) * 1.1
  }

  ggplot(melted, aes_string(fill = "Statistic", x = "Gene", y = "Count")) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(title = insert_title) + xlab("Gene") +
    ylab(ifelse(use_log, paste("Log", ylabel, sep = " "), ylabel)) +
    scale_fill_manual(values = cols) +
    scale_y_continuous(expand = c(0, 0), limits = c(NA, upper_lim),
                       trans = ifelse(use_log, "log2", "identity")) +
    theme(axis.text.x = element_text(angle = 60, hjust = 1),
          axis.ticks = element_blank(), legend.title = element_blank(),
          legend.position = ifelse("Counts" %in% ylabel, "right", "none"),
          panel.background = element_blank(),
          axis.line = element_line(colour = "black", size = 0.2)) +
  geom_text(aes_string(group = "Statistic", label = "Count"), vjust = -0.5,
            color = "black", size = 2, position = position_dodge(width = 0.9))
}

#' Gene N50/N75 Barplot
#'
#' Generates a bar plot showing the N50 and N75 statistics for each
#' gene (transcripts or ORFs).
#'
#' @param database A compiled Database object.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_N50_N75(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
plot_N50_N75 <- function(database, use_ORFs = F,
                         genes_to_include = unique(database$GeneDB$Name),
                         insert_title = NULL) {
  if (!(requireNamespace("reshape", quietly = T) |
      requireNamespace("reshape2", quietly = T))) {
    stop("Error: reshape/reshape2 package required.")
    return(NULL)
  } else {
    if (use_ORFs) {
      select_cols <- c("Name", "ORFN50", "ORFN75")
      y_axis_label <- "Unique ORFs"
      if (is.null(insert_title)) {
        insert_title <- "Gene ORF Statistics"
      }
    } else {
      select_cols <- c("Name", "N50", "N75")
      y_axis_label <- "Isoforms"
      if (is.null(insert_title)) {
        insert_title <- "Gene Isoform Statistics"
      }
    }
    select_genes <- sapply(database$GeneDB$Name, function(gene) {
      return(gene %in% genes_to_include)
    })
    if (requireNamespace("reshape", quietly = T)) {
      melted <- reshape::melt(database$GeneDB[select_genes, select_cols],
                            id.vars = 1)
    } else {
      melted <- reshape2::melt(database$GeneDB[select_genes, select_cols],
                            id.vars = 1)
    }

    colnames(melted) <- c("Gene", "Statistic", "Count")
    melted$Statistic <- paste(" ", melted$Statistic) # fixes legend spacing

    ggplot(melted, aes_string(x = "Gene", y = "Count")) +
      geom_bar(aes_string(fill = "Statistic"),
               stat = "identity", position = "dodge") +
      geom_text(aes_string(fill = "Statistic", label = "Count"), vjust = -0.5,
                color = "black", size = 2, position = position_dodge(width = 0.9)) +
      xlab("Gene") + ylab(y_axis_label) + labs(title = insert_title) +
      scale_y_continuous(expand = c(0, 0)) +
      theme(axis.text.x = element_text(angle = 60, hjust = 1),
            axis.ticks = element_blank(), legend.title = element_blank(),
            panel.background = element_blank(),
            axis.line = element_line(colour = "black", size = 0.2))
  }
}

#' Treemap Plot
#'
#' Generates a treemap plot showing how individual transcripts/ORFs and genes
#'account for abundance within the dataset as a whole.
#'
#' @param database A compiled Database object.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_length_dist(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
plot_treemap <- function(database, use_ORFs = F,
                         genes_to_include = unique(database$GeneDB$Name),
                         insert_title = NULL) {
  if (!(requireNamespace("treemapify", quietly = T))) {
    stop("Error: treemapify packages required.")
    return(NULL)
  }
  if (use_ORFs) {
    db <- database$OrfDB[, c("PBID", "FL_reads", "Gene")]
    db <- db[db$Gene %in% genes_to_include, ]
    db$PBID <- as.character(seq_len(nrow(db)))
    if (is.null(insert_title)) insert_title <- "ORF Distributions"
  } else {
    db <- database$TranscriptDB[, c("PBID", "FL_reads", "Gene")]
    db <- db[db$Gene %in% genes_to_include, ]
    if (is.null(insert_title)) insert_title <- "Isoform Distributions"
  }

  ggplot(db, aes_string(area = "FL_reads", group = "Gene",
                        fill = "Gene", subgroup = "Gene",
                        label = "PBID", subgroup2 = "PBID")) +
    treemapify::geom_treemap() +
    treemapify::geom_treemap_subgroup2_border(colour = "black", size = 0.25) +
    treemapify::geom_treemap_subgroup_border(colour = "black", size = 0.75) +
    treemapify::geom_treemap_subgroup_text(colour = "white",
                                           place = "center",
                                           fontface = "italic") +
    theme(legend.position = "none") + labs(title = insert_title)
}

#' Shannon Index Plot
#'
#' Generates a plot showing the Shannon Index for isoform/ORF diversity on
#' the x-axis, and each gene on the y-axis.
#'
#' @param database A compiled Database object.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_Shannon_index(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
plot_Shannon_index <- function(database, use_ORFs = F,
                               genes_to_include = unique(database$GeneDB$Name),
                               insert_title = NULL) {
  if (use_ORFs) {
    if (!("ORFShannon" %in% colnames(database$GeneDB))) {
      stop("Error: ORF Shannon index not calculated in this database.")
      return(NULL)
    }
    db <- database$GeneDB[, c("Name", "ORFShannon")]
    db <- db[db$Name %in% genes_to_include, ]
    db$PBID <- as.character(seq_len(nrow(db)))
    if (is.null(insert_title)) insert_title <- "ORF Shannon Index"
  } else {
    if (!("Shannon" %in% colnames(database$GeneDB))) {
      stop("Error: isoform Shannon index not calculated in this database.")
      return(NULL)
    }
    db <- database$GeneDB[, c("Name", "Shannon")]
    db <- db[db$Name %in% genes_to_include, ]
    if (is.null(insert_title)) insert_title <- "Isoform Shannon Index"
  }
  names(db) <- c("Gene", "Shannon_Index")
  buffer <- (max(db$Shannon_Index) - min(db$Shannon_Index)) * 0.025
  db$Max <- rep(max(db$Shannon_Index) + buffer, nrow(db))
  db$Min <- rep(min(db$Shannon_Index) - buffer, nrow(db))
  db <- db[order(db$Shannon_Index), ]
  db$Gene <- factor(db$Gene, levels = db$Gene)

  ggplot(db, aes_string(x = "Gene",
                        y = "Shannon_Index",
                        colour = "Shannon_Index")) +
    geom_segment(aes_string(x = "Gene", xend = "Gene",
                            y = "Min",  yend = "Max"),
                 linetype = "dashed", size = 0.1, colour = "black") +
    geom_point(size = 3) + scale_y_continuous(expand = c(0, 0)) +
    scale_colour_gradientn(colours = c("blue", "red")) +
    ylab("Shannon Index") +
    labs(title = insert_title) + coord_flip() + theme_classic() +
    theme(axis.ticks.y = element_blank(), legend.position = "none")
}

#' Isoform Length Distribution Plot
#'
#' Generates a dot plot showing how transcripts/ORFs are distributed in length
#' for each gene in the database.
#'
#' @param database A compiled Database object.
#' @param use_ORFs Logical. Set to TRUE to use abundances from OrfDB instead
#' of abundances from TranscriptDB. Note that OrfDB collapses isoforms with
#' non-unique transcripts, so abundances may differ significantly.
#' @param bins The number of bins to use in segmenting the range of lengths
#' over the entire dataset. This parameter determines vertical dot spread.
#' @param genes_to_include Vector of gene names to subset from the database.
#' Default is to plot all genes in the database.
#' @param horiz_spread Numeric. This parameter determines horizontal dot
#' spread across all the genes visualized.
#' @param insert_title String to customize the title of the plot.
#' @return A ggplot object.
#' @examples
#' \dontrun{
#' gene_ID_table <- data.frame(ID = c("PB.1", "PB.2"),
#' Name = c("Gene1", "Gene2"))
#' rawDB <- compile_raw_db(transcript_file, abundance_file, gff_file, ORF_file)
#' DB <- process_db(rawDB, gene_ID_table)
#' plot_length_dist(DB)
#' }
#' @seealso Database
#' @import ggplot2
#' @export
plot_length_dist <- function(database, use_ORFs = F, bins = 200,
                             genes_to_include = unique(database$GeneDB$Name),
                             horiz_spread = 0.3, insert_title = NULL) {
  if (use_ORFs) {
    if (is.null(database$OrfDB)) {
      stop("Error: ORFs not included in this database.")
      return(NULL)
    }
    db <- database$OrfDB[, c("Gene", "ORFLength")]
    db <- db[db$Gene %in% genes_to_include, ]
    if (is.null(insert_title)) insert_title <- "Unique ORF Length Distributions"
  } else {
    db <- database$TranscriptDB[, c("Gene", "TranscriptLength")]
    db <- db[db$Gene %in% genes_to_include, ]
    if (is.null(insert_title)) insert_title <- "Transcript Length Distributions"
  }
  names(db) <- c("Gene", "Length")
  len_range <- max(db$Length) - min(db$Length)
  binwidth <- ceiling(len_range / bins)

  ggplot(db, aes_string(x = "Gene", y = "Length")) +
    geom_dotplot(binaxis = 'y', dotsize = 1, binwidth = binwidth,
                 method = "histodot", stackdir = 'center',
                 stackratio = horiz_spread) +
    labs(title = insert_title) + theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}
kellycochran/IsoDB documentation built on July 7, 2020, 2:17 p.m.