R/RNAseq.R

Defines functions ProcessDEGs RunDESeq2

Documented in ProcessDEGs RunDESeq2

#' Run DESeq2 analyses
#'
#' \code{RunDESeq2} performs a high-level differential gene expression analysis 
#' with \code{DESeq2}. It, along with \link{ProcessDEGs}, form the crux of the
#' RNA-seq portion of this package. 
#'
#' The default parameters should be an adequate starting place for most users,
#' but lazy folks can provide multiple thresholds to \code{padj.thresh}
#' and/or \code{fc.thresh} if they aren't sure how stringent or lenient they 
#' need to be with their data.
#'
#' It's generally best to provide an empty directory as the output path, as
#' several directories will be generated.
#'
#' @param outpath Path to directory to be used for output. Additional 
#'   directories will be generated within this folder.
#' @param quants.path Path to directory containing a directory for each sample
#'   with a salmon-generated \code{quant.sf} file inside.
#' @param samplesheet Path to samplesheet containing sample metadata.
#' @param tx2gene Path to file with transcript IDs in first column and gene 
#'   identifiers in second. Used for gene-level summarization.
#' @param level String defining variable of interest.
#' @param padj.thresh Number or numeric scalar indicating the adjusted p-value 
#'   cutoff(s) to be used for determining "significant" differential expression.
#'   If multiple are given, multiple tables/plots will be generated using all 
#'   combinations of \code{padj.thresh} and \code{fc.thresh}.
#' @param fc.thresh Number or numeric scalar indicating the log2 fold-change 
#'   cutoff(s) to be used for determining "significant" differential expression.
#'   If multiple are given, multiple tables/plots will be generated using all 
#'   combinations of \code{padj.thresh} and \code{fc.thresh}.
#' @param plot.annos String or character vector defining the column(s) in 
#'   \code{samplesheet} to use to annotate figures.
#' @param plot.box Boolean indicating whether box plots for DEGs should be 
#'   created for each comparison. If so, the \code{top.n} genes will be plotted.
#'   This step is quite time-consuming with many genes.
#' @param plot.enrich Boolean indicating whether enrichment analyses for DEGs 
#'   should be run and plotted for each comparison. 
#' @param enrich.libs Vector of valid \code{enrichR} libraries to test the 
#'   genes against.
#'
#'   Available libraries can be viewed with \code{listEnrichrDbs} from the
#'   \code{enrichR} package.
#' @param top.n Number of differentially expressed genes to create boxplots for, 
#'   ranked by adj. p-value after applying \code{padj.thresh} and 
#'   \code{fc.thresh} thresholds. If multiple thresholds are provided, the 
#'   lowest fold-change and highest adj. p-value thresholds will be used. 
#' @param block String or character vector defining the column(s) in 
#'   \code{samplesheet} to use to block for unwanted variance, e.g. batch or 
#'   technical effects.
#' @param count.filt Number indicating read threshold. Genes with fewer counts 
#'   than this number summed across all samples will be removed from the
#'   analysis.
#' @return Named List containing a list named 'res.list' containing 
#'   \linkS4class{DESeqResults} objects for all comparisons generated by 
#'   \code{\link{ProcessDEGs}}, a \linkS4class{DESeqDataSet} object named 'dds'
#'   from running \code{\link[DESeq2]{DESeq}}, a 
#'   \linkS4class{RangedSummarizedExperiment} object named 'rld' from running 
#'   \code{\link[DESeq2]{rlog}}, and a \linkS4class{RangedSummarizedExperiment} 
#'   object from running \code{\link[DESeq2]{vst}} named 'vsd'.
#'   
#' @import DESeq2
#' @importFrom tximport tximport
#' @importFrom utils read.table
#'
#' @export
#'
#' @author Jared Andrews
#'
#' @seealso
#' \code{\link[DESeq2]{DESeq}}, for more about differential expression analysis.
#' \code{\link{ProcessDEGs}}, for analyzing and visualizing the results.
#'
RunDESeq2 <- function(outpath, quants.path, samplesheet, tx2gene, level,  
  padj.thresh = 0.05, fc.thresh = 2, plot.annos = NULL, plot.box = TRUE, 
  plot.enrich = TRUE, enrich.libs = c("GO_Molecular_Function_2018", 
  "GO_Cellular_Component_2018", "GO_Biological_Process_2018", "KEGG_2019_Human",
  "Reactome_2016", "BioCarta_2016", "Panther_2016"), top.n = 100, block = NULL, 
  count.filt = 10) {
    
  message("### EXPLORATORY DATA ANALYSIS ###\n")

  ### CREATING DIRECTORY STRUCTURE ###
  message("# CREATING DIRECTORY STRUCTURE AND MODEL DESIGN #\n")
  # Create directory structure and set design formula.
  base <- outpath
  setup <- CreateOutputStructure(block, level, base)
  base <- setup$base
  design <- setup$design
  message(paste0("\nDesign is: ", design, "\n"))
  
  if (is.null(plot.annos)) {
      plot.annos <- level
  }
  
  ### FILE LOADING & PRE-FILTERING ###
  message("# FILE LOADING & PRE-FILTERING #\n")
  tx2gene <- read.table(tx2gene, sep = "\t")    
  samples <- read.table(samplesheet, header = TRUE)
  rownames(samples) <- samples$name
  files <- file.path(quants.path, samples$name, "quant.sf")
  names(files) <- samples$name

  # Read in our actual count files now.
  txi <- tximport(files, type = "salmon", tx2gene = tx2gene)

  # Create the DESeqDataSet object.
  dds <- DESeqDataSetFromTximport(txi, colData = samples, design = design)

  # Pre-filter transcripts with really low read counts.
  message(paste0("\ndds starting with ", nrow(dds), " genes."))
  dds <- dds[rowSums(counts(dds)) >= count.filt,]
  message(paste0("\ndds has ", nrow(dds), 
    " genes after removing genes with under ", count.filt, " counts total.\n"))

  ### VARIANCE STABILIZATION COMPARISONS ###
  message("\n# VARIANCE STABILIZATION COMPARISONS #\n")
  vst.out <- paste0(base,"/EDAFigures/VarianceTransformations.pdf")
  message(paste0("Output: ", vst.out))
  trans <- PlotRNAVarianceTransformations(dds, vst.out)
  rld <- trans$rld
  vsd <- trans$vsd

  ### SAMPLE DISTANCES ###
  message("\n# PLOTTING SAMPLE DISTANCES #\n")
  dists.out <- paste0(base, "/EDAFigures/SampleDistances.pdf")
  message(paste0("Output: ", dists.out))
  PlotRNASampleDistances(rld, vsd, dists.out, level, plot.annos)

  ### PCA PLOTS ###
  message("\n# PCA PLOTS #\n")
  pca.out <- paste0(base, "/EDAFigures/PCA.pdf")
  message(paste0("Output: ", pca.out))
  PlotRNAEDAPCAs(rld, vsd, pca.out, level, plot.annos)
  
  #======================================#
  ### DIFFERENTIAL EXPRESSION ANALYSIS ###
  message("\n### DIFFERENTIAL EXPRESSION ANALYSIS ###\n")
  dds <- DESeq(dds)

  # All the actual processing occurs here.
  full <- ProcessDEGs(dds, rld, vsd, base, level, plot.annos, padj.thresh,
    fc.thresh, plot.box, plot.enrich, enrich.libs, top.n)

  ### SAVING OBJECTS ###
  message("\n# SAVING ROBJECTS #\n")
  message(paste0("DESeq2: ", paste0(base, "/Robjects/dds.rds")))
  message(paste0("Regularized log transformation: ", 
    paste0(base, "/Robjects/rld.rds")))
  message(paste0("Variance stabilized transformation: ", 
    paste0(base, "/Robjects/vsd.rds")))
  saveRDS(dds, file=paste0(base, "/Robjects/dds.rds"))
  saveRDS(rld, file=paste0(base, "/Robjects/rld.rds"))
  saveRDS(vsd, file=paste0(base, "/Robjects/vsd.rds"))

  return(full)
}


#' Extract, visualize, and save DEG lists
#'
#' \code{ProcessDEGs} wraps several plotting functions to generate figures 
#' specifically for the differentially expressed genes between each possible
#' comparison. 
#' 
#' \code{ProcessDEGs} is called by \link{RunDESeq2} but can also be re-run with 
#' the \link{RunDESeq2} output or its own output if you want to save time and 
#' don't need to generate all of the exploratory data analysis figures again.
#'
#' This function will generate many figures in addition to saving the counts
#' and results tables.
#' 
#' @param dds A \linkS4class{DESeqDataSet} object as returned by 
#'   \code{\link[DESeq2]{DESeq}}.
#' @param rld A \linkS4class{RangedSummarizedExperiment} object of 
#'   \code{\link[DESeq2]{rlog}} transformed counts as returned by
#'   \link{RunDESeq2}.
#' @param vsd A \linkS4class{RangedSummarizedExperiment} object of 
#'   \code{\link[DESeq2]{vst}} transformed counts as returned by
#'   \link{RunDESeq2}.
#' @param outpath Path to directory to be used for output.
#' @param level String defining variable of interest.
#' @param plot.annos String or character vector defining the column(s) in 
#'   \code{samplesheet} to use to annotate figures.
#' @param padj.thresh Number or numeric scalar indicating the adjusted p-value 
#'   cutoff(s) to be used for determining "significant" differential expression.
#'   If multiple are given, multiple tables/plots will be generated using all 
#'   combinations of \code{padj.thresh} and \code{fc.thresh}.
#' @param fc.thresh Number or numeric scalar indicating the log2 fold-change 
#'   cutoff(s) to be used for determining "significant" differential expression.
#'   If multiple are given, multiple tables/plots will be generated using all 
#'   combinations of \code{padj.thresh} and \code{fc.thresh}.
#' @param plot.box Boolean indicating whether box plots for DEGs should be 
#'   created for each comparison. If so, the \code{top.n} genes will be plotted.
#' @param plot.enrich Boolean indicating whether enrichment analyses for DEGs 
#'   should be run and plotted for each comparison.
#' @param enrich.libs A vector of valid \code{enrichR} libraries to test the 
#'   genes against.
#'
#'   Available libraries can be viewed with 
#'   \code{\link[enrichR]{listEnrichrDbs}} from the \code{enrichR} package.
#' @param top.n Number of differentially expressed genes to create boxplots for, 
#'   ranked by adj. p-value after applying \code{padj.thresh} and 
#'   \code{fc.thresh} thresholds. If multiple thresholds are provided, the 
#'   lowest fold-change and highest adj. p-value thresholds will be used.
#' @return Named List containing a list named 'res.list' containing named lists
#'   for each p-value threshold containing
#'   \linkS4class{DESeqResults} objects for all comparisons generated by 
#'   \code{\link{ProcessDEGs}}, a \linkS4class{DESeqDataSet} object named 'dds'
#'   from running \code{\link[DESeq2]{DESeq}}, a 
#'   \linkS4class{RangedSummarizedExperiment} object named 'rld' from running 
#'   \code{\link[DESeq2]{rlog}}, and a \linkS4class{RangedSummarizedExperiment} 
#'   object from running \code{\link[DESeq2]{vst}} named 'vsd'.
#'
#' @importFrom DESeq2 lfcShrink counts plotCounts plotMA
#'
#' @export
#'
#' @author Jared Andrews
#'
#' @seealso
#' \code{\link{RunDESeq2}}, for generating input for this function.
#'
ProcessDEGs <- function(dds, rld, vsd, outpath, level, plot.annos, 
  padj.thresh = 0.05, fc.thresh = 2, plot.box = TRUE, plot.enrich = TRUE, 
  enrich.libs = c("GO_Molecular_Function_2018", 
  "GO_Cellular_Component_2018", "GO_Biological_Process_2018", "KEGG_2019_Human",
  "Reactome_2016", "BioCarta_2016", "Panther_2016"), top.n = 100) {

  message("\n# COLLECTING RESULTS #\n")
  # Get all possible sample comparisons.
  combs <- combn(levels(colData(rld)[,level]), 2)
  combs.seq <- seq(1, length(combs), by = 2)

  res.list <- list()
  for (p in padj.thresh) {
    rezzes <- list()
    for (samp in combs.seq) {
      g1 <- combs[samp]
      g2 <- combs[samp + 1]
      r <- results(dds, contrast = c(level, g1, g2), alpha = p)
      res <- suppressMessages(lfcShrink(dds, res = r, type='ashr'))
      rezzes[[paste0(g1, "-v-", g2)]] <- res
    }
    res.list[[toString(p)]] <- rezzes
  }
  
  ##### PLOTTING #####

  ### DEG BOXPLOTS, PCAS, VOLCANOES, HEATMAPS ###
  message("\n# GENERATING PLOTS #\n")
  for (p in padj.thresh) {
    for (fc in fc.thresh) {
      PlotRNADEGPCAs(res.list[[toString(p)]], rld, vsd, outpath, level, 
        plot.annos, p, fc)
      PlotRNAVolcanoes(res.list[[toString(p)]], dds, outpath, p, fc)
      PlotRNAHeatmaps(res.list[[toString(p)]], rld, vsd, level, outpath, p, fc, 
        plot.annos)
      PlotRNACombinedHeatmaps(res.list[[toString(p)]], rld, vsd, outpath, p, fc, 
        plot.annos)
      if (plot.enrich) {
        PlotEnrichments(res.list[[toString(p)]], outpath, p, fc, enrich.libs)
      }
    }
  }

  if (plot.box) {
    p <- max(padj.thresh)
    fc <- min(fc.thresh)
    PlotRNABoxplots(res.list[[toString(p)]], dds, rld, outpath, p, fc, top.n, 
      level)
  }
  
  ### MA PLOTs ###
  for (p in padj.thresh) {
    rez <- res.list[[toString(p)]]
    for (r in seq_along(rez)) {
      res <- rez[[r]]
      comp <- names(rez)[r]
      message("Comparison set: ", comp)

      message("Generating MA plots with p.adj cutoff of ", p)
      pdf(paste0(outpath,"/MAPlots/", comp, ".padj.", p, ".MA_plot.pdf"))
      plotMA(res, ylim = c(-6, 6), alpha = p)
      dev.off()
    }
  }

  ### SAVING TABLES ###
  message("\n# SAVING RESULTS TABLES #\n")
  SaveResults(results = res.list, outpath = outpath, dds = dds)

  return(list(res.list = res.list, dds = dds, rld = rld, vsd = vsd))
}
j-andrews7/OneLinerOmics documentation built on Sept. 9, 2021, 11 a.m.