#' Run DiffBind analysis on a samplesheet
#'
#' \code{RunDiffBind} performs a high-level differential binding analysis
#' with \code{DiffBind}. It, along with \link{ProcessDBRs}, form the crux of the
#' ChIP-seq portion of this package.
#'
#' The default parameters should be an adequate starting place for most users,
#' but lazy folks can provide multiple thresholds to \code{fdr.thresh}
#' and/or \code{fc.thresh} if they aren't sure how stringent or lenient they
#' need to be with their data.
#'
#' Providing the resulting DBA object as input to this function can be useful
#' when running multiple times with different \code{level}s and \code{block}s
#' or thresholds, as it skips bam loading, which is by far the most
#' time-intensive part.
#'
#' It's generally best to provide an empty directory as the output path, as
#' several directories will be generated.
#'
#' @param outpath Path to directory to be used for output. Additional
#' directories will be generated within this folder.
#' @param samplesheet Path to samplesheet containing sample metadata.
#' @param txdb \code{TxDb} object to use for annotation.
#' @param dba DBA object as returned by \code{\link[DiffBind]{dba.count}},
#' \code{\link[DiffBind]{dba.analyze}} or from this function itself. If
#' provided, \code{samplesheet} and \code{n.consensus} are ignored.
#' @param level String defining variable of interest from \code{samplesheet}.
#' Must be one of: "Treatment", "Condition", "Tissue", or "Factor".
#' @param se Path to file containing consensus SEs, which will be used to
#' to annotate whether individual peaks fall within an SE or not.
#' @param fdr.thresh Number or numeric scalar indicating the false discovery
#' rate (FDR) cutoff(s) to be used for determining "significant" differential
#' binding. If multiple are given, multiple tables/plots will be generated
#' using all combinations of \code{fdr.thresh} and \code{fc.thresh}.
#' @param fc.thresh Number or numeric scalar indicating the log2 fold-change
#' cutoff(s) to be used for determining "significant" differential binding.
#' If multiple are given, multiple tables/plots will be generated using all
#' combinations of \code{padj.thresh} and \code{fc.thresh}.
#' @param block String or character vector defining the column(s) in
#' \code{samplesheet} to use to block for unwanted variance, e.g. batch or
#' technical effects. Must be one of: "Treatment", "Condition", "Tissue",
#' or "Factor".
#' @param n.consensus Number of samples in which peaks must overlap for the
#' peaks to be merged and included in the consensus peak set.
#' @param breaks Vector of sequences to be used as breaks for signal heatmaps.
#' @param heatmap.colors Character vector containing custom colors to use for
#' heatmaps in hex (e.g. \code{c("#053061", "#f5f5f5", "#67001f")}).
#' @param heatmap.preset String indicating which of the color presets to use in
#' heatmaps.
#'
#' Available presets (low to high) are:
#' \itemize{
#' \item "BuRd" Blue to red.
#' \item "OrPu" Orange to purple.
#' \item "BrTe" Brown to teal.
#' \item "PuGr" Purple to green.
#' \item "BuOr" Sea blue to orange.
#' }
#' @param reverse Boolean indicating whether to flip heatmap color scheme
#' (high color will become low, etc).
#' @param plot.enrich Boolean indicating whether enrichment analyses for DBRs
#' should be run and plotted for each comparison.
#' @param enrich.libs Vector of valid \code{enrichR} libraries to test the
#' genes against.
#'
#'
#' Available libraries can be viewed with
#' \code{\link[enrichR]{listEnrichrDbs}} from the \code{enrichR} package.
#' @param promoters Scalar vector containing how many basepairs up and
#' downstream of the TSS should be used to define gene promoters.
#' @param method String indicating method to be used for differential expression
#' analysis. Can be "DESeq2" or "edgeR".
#' @param scale.full Boolean indicating whether the full library size (total
#' number of reads) for each sample is used for scaling normalization. If
#' \code{FALSE}, the total number of reads present in the peaks for each
#' sample is used (preferable if overall binding levels are expected to be
#' similar between samples).
#' @param flank.anno Boolean indicating whether flanking gene information for
#' each peak should be retrieved. Useful for broad peaks and super enhancers.
#' @param flank.distance Integer for distance from edges of peak to search for
#' flanking genes. Ignored if \code{flank.anno = FALSE}.
#' @return A \code{DBA} object from \code{\link[DiffBind]{dba.analyze}}.
#'
#' @importFrom utils read.table
#' @import DiffBind
#'
#' @export
#'
#' @author Jared Andrews
#'
#' @seealso
#' \code{\link[DiffBind]{dba}}, \code{\link[DiffBind]{dba.count}},
#' \code{\link[DiffBind]{dba.contrast}}, \code{\link[DiffBind]{dba.analyze}},
#' \code{\link[DiffBind]{dba.report}} for more about ChIP-seq differential
#' binding analysis.
#'
#' \code{\link{ProcessDBRs}}, for analyzing and visualizing the results.
#'
RunDiffBind <- function(outpath, samplesheet, txdb,
dba = NULL,
level = c("Treatment", "Condition", "Tissue", "Factor"),
se = NULL,
fdr.thresh = 0.05,
fc.thresh = 2,
block = NULL,
heatmap.colors = NULL,
heatmap.preset = NULL,
reverse = FALSE,
n.consensus = 2,
breaks = c(seq(-3, -1.0001, length = 250), seq(-1, -0.1, length = 250),
seq(-0.0999, 0.0999,length = 1), seq(0.1, 1, length = 250),
seq(1.0001, 3, length = 250)),
plot.enrich = TRUE,
enrich.libs = c("GO_Molecular_Function_2018",
"GO_Cellular_Component_2018", "GO_Biological_Process_2018", "KEGG_2019_Human",
"Reactome_2016", "BioCarta_2016", "Panther_2016"),
promoters = c(-2000, 2000),
method = c("DESeq2", "edgeR"),
scale.full = TRUE,
flank.anno = TRUE,
flank.dist = 5000) {
message("### SETUP ###\n")
rblock = NULL
# CHECKING ARGUMENTS #
message("\n# CHECKING ARGUMENTS #\n")
level <- match.arg(level)
if (level == "Treatment") {
rlevel <- DBA_TREATMENT
} else if (level == "Condition") {
rlevel <- DBA_CONDITION
} else if (level == "Tissue") {
rlevel <- DBA_TISSUE
} else if (level == "Factor") {
rlevel <- DBA_FACTOR
}
if (!is.null(block)) {
block <- match.arg(block, choices = c("Treatment", "Condition", "Tissue",
"Factor"))
if (block == "Treatment") {
rblock <- DBA_TREATMENT
} else if (block == "Condition") {
rblock <- DBA_CONDITION
} else if (block == "Tissue") {
rblock <- DBA_TISSUE
} else if (block == "Factor") {
rblock <- DBA_FACTOR
}
}
method <- match.arg(method)
if (!is.null(rblock)) {
if (method == "DESeq2") {
method <- DBA_DESEQ2_BLOCK
} else {
method <- DBA_EDGER_BLOCK
}
} else {
if (method == "DESeq2") {
method <- DBA_DESEQ2
} else {
method <- DBA_EDGER
}
}
# Set up colors/breaks for heatmaps later.
hmap.colors <- .heatmap_colors(breaks = breaks, preset = heatmap.preset,
custom.colors = heatmap.colors, reverse = reverse)
# CREATE DIRECTORY STRUCTURE #
message("# CREATING DIRECTORY STRUCTURE #\n")
setup <- CreateOutputStructure(block, level, base = outpath, chip = TRUE)
base <- setup$base
# LOAD SUPER ENHANCERS #
if (!is.null(se)) {
message("# LOADING SUPER ENHANCERS #\n")
se <- read.table(se, sep = '\t', header = TRUE, quote = NULL)
}
# FINDING DIFFERENTIALLY BOUND REGIONS #
message("### FINDING DIFFERENTIALLY BOUND REGIONS ###\n\n")
if (is.null(dba)) {
samps <- dba(sampleSheet = samplesheet, minOverlap = n.consensus)
count <- dba.count(samps, minOverlap = n.consensus)
} else {
count <- dba
}
if (!is.null(rblock)) {
cont <- dba.contrast(count, categories = rlevel, block = rblock)
} else {
cont <- dba.contrast(count, categories = rlevel)
}
results <- dba.analyze(cont, bFullLibrarySize = scale.full)
# CONSENSUS PEAKS #
message("# DEALING WITH CONSENSUS PEAKS #\n")
report <- dba.report(results, th = 1, bCalled = TRUE,
bCounts = TRUE, method = method, bCalledDetail = TRUE)
# ANNOTATION #
message("# ANNOTATING & GENERATING CONSENSUS PLOTS #\n")
peak.anno <- annotatePeak(report, tssRegion = promoters, TxDb = txdb,
annoDb = "org.Hs.eg.db", level = "gene", addFlankGeneInfo = flank.anno,
flankDistance = flank.dist)
PlotChIPAnnos(list(peak.anno), outpath = base)
PlotChIPPCAs(results, outpath = base, method = method)
PlotChIPHeatmaps(results, outpath = base, method = method, breaks = breaks,
colors = hmap.colors)
# DB PEAKS #
message("# ANNOTATING & GENERATING DBR PLOTS #\n")
ProcessDBRs(results = results, outpath = base, txdb = txdb,
fdr.thresh = fdr.thresh, fc.thresh = fc.thresh, method = method,
breaks = breaks, heatmap.colors = heatmap.colors,
heatmap.preset = heatmap.preset, reverse = reverse,
plot.enrich = plot.enrich, enrich.libs = enrich.libs,
flank.anno = flank.anno, flank.dist = flank.dist)
message("# SAVING RESULTS TABLES #\n")
SaveResults(results, outpath = base, chip = TRUE, method = method,
promoters = promoters, se = se, txdb = txdb, flank.anno = flank.anno,
flank.dist = flank.dist)
message("# SAVING ROBJECTS #")
saveRDS(results, file=paste0(base, "/Robjects/results.rds"))
return(results)
}
#' Extract, visualize, and save DBRs
#'
#' \code{ProcessDBRs} wraps several plotting functions to generate figures
#' specifically for differentially bound regions in a given comparison.
#'
#' \code{ProcessDBRs} is called by \link{RunDiffBind} but can also be re-run
#' with the \link{RunDiffBind} output if you want to save time and don't need to
#' generate all of the EDA/consensus figures again.
#'
#' This function will generate many figures in addition to saving the results as
#' tables.
#'
#' @param results \code{DBA} object as returned by
#' \code{\link[DiffBind]{dba.analyze}}.
#' @param outpath Path to directory to be used for output.
#' @param txdb \code{TxDb} object to use for annotation.
#' @param fdr.thresh Number or numeric scalar indicating the false discovery
#' rate (FDR) cutoff(s) to be used for determining "significant" differential
#' binding. If multiple are given, multiple tables/plots will be generated
#' using all combinations of \code{fdr.thresh} and \code{fc.thresh}.
#' @param fc.thresh Number or numeric scalar indicating the log2 fold-change
#' cutoff(s) to be used for determining "significant" differential binding.
#' If multiple are given, multiple tables/plots will be generated using all
#' combinations of \code{padj.thresh} and \code{fc.thresh}.
#' @param method Method used for differential binding e.g. \code{DBA_DESEQ2}.
#'
#' Do not quote this parameter, it is a global variable from \code{DiffBind}.
#' If a block was used, be sure to include that
#' (e.g. \code{DBA_DESEQ2_BLOCK}).
#' @param promoters Scalar vector containing how many basepairs up and
#' downstream of the TSS should be used to define gene promoters.
#' @param breaks Vector of sequences to be used as breaks for signal heatmaps.
#' @param heatmap.colors Character vector containing custom colors to use for
#' heatmaps in hex (e.g. \code{c("#053061", "#f5f5f5", "#67001f")}).
#' @param heatmap.preset String indicating which of the color presets to use in
#' heatmaps.
#'
#' Available presets (low to high) are:
#' \itemize{
#' \item "BuRd" Blue to red.
#' \item "OrPu" Orange to purple.
#' \item "BrTe" Brown to teal.
#' \item "PuGr" Purple to green.
#' \item "BuOr" Sea blue to orange.
#' }
#' @param reverse Boolean indicating whether to flip heatmap color scheme
#' (high color will become low, etc).
#' @param plot.enrich Boolean indicating whether enrichment analyses for DBRs
#' should be run and plotted for each comparison.
#' @param enrich.libs A vector of valid \code{enrichR} libraries to test the
#' genes against.
#' @param flank.anno Boolean indicating whether flanking gene information for
#' each peak should be retrieved. Useful for broad peaks and super enhancers.
#' @param flank.distance Integer for distance from edges of peak to search for
#' flanking genes. Ignored if \code{flank.anno = FALSE}.
#'
#' Available libraries can be viewed with
#' \code{\link[enrichR]{listEnrichrDbs}} from the \code{enrichR} package.
#'
#' @import DiffBind
#' @import parallel
#' @import GenomeInfoDb
#' @importFrom ChIPseeker annotatePeak
#' @importFrom GenomicRanges GRangesList
#'
#' @export
#'
#' @author Jared Andrews
#'
#' @seealso
#' \code{\link{RunDiffBind}}, for generating input for this function.
#'
ProcessDBRs <- function(results, outpath, txdb,
fdr.thresh = 0.05,
fc.thresh = 1,
method = NULL,
promoters = c(-2000, 2000),
breaks = c(seq(-3, -1.0001, length = 250), seq(-1, -0.1, length = 250),
seq(-0.0999, 0.0999,length = 1), seq(0.1, 1, length = 250),
seq(1.0001, 3, length = 250)),
heatmap.colors = NULL,
heatmap.preset = NULL,
reverse = FALSE,
plot.enrich = TRUE,
enrich.libs = c("GO_Molecular_Function_2018",
"GO_Cellular_Component_2018", "GO_Biological_Process_2018", "KEGG_2019_Human",
"Reactome_2016", "BioCarta_2016", "Panther_2016"),
flank.anno = TRUE,
flank.dist = 5000) {
hmap.colors <- .heatmap_colors(breaks = breaks, preset = heatmap.preset,
custom.colors = heatmap.colors, reverse = reverse)
for (fc in fc.thresh) {
for (fdr in fdr.thresh) {
for (i in seq_along(results$contrasts)) {
repper <- try({
reportdb <- dba.report(results, th = fdr, fold = fc, bCalled = TRUE,
bCounts = TRUE, method = method, contrast = i, bCalledDetail = TRUE)
})
g1 <- results$contrasts[[i]]$name1
g2 <- results$contrasts[[i]]$name2
if (class(repper) == "character") {
message(paste0(g1, "-v-", g2, " contrast has no DB sites, skipping."))
next
}
# Check for no DB peaks and skip if TRUE.
if (length(reportdb) == 0) {
message(paste0("No DB regions in ", g1, " v ", g2, ", skipping."))
next
}
g1up <- reportdb[(reportdb$Fold > 0)]
g2up <- reportdb[(reportdb$Fold < 0)]
# Create a named list of our subsets.
files <- GRangesList(reportdb, g1up, g2up)
names(files) <- c(paste0(g1, "-v-", g2), paste0(g1, ".up"),
paste0(g2, ".up"))
# Filter those with no elements, as annotatePeak will error otherwise.
keep <- lengths(files) != 0
files <- files[keep]
peak.anno.list <- lapply(files, annotatePeak, TxDb = txdb,
tssRegion = promoters, verbose = FALSE, annoDb = "org.Hs.eg.db",
addFlankGeneInfo = flank.anno, flankDistance = flank.dist)
PlotChIPAnnos(peak.anno.list, outpath, consensus = FALSE,
comp = names(files)[1], fc = fc, fdr = fdr)
pdf(paste0(outpath, "/DBRFigures/MAPlots/", g1, "-v-", g2, ".fdr.",
fdr, ".log2fc.", fc, ".MAplots.pdf"), width = 5, height = 5)
dba.plotMA(results, th = fdr, fold = fc, contrast = i, method = method)
dba.plotMA(results, th = fdr, fold = fc, contrast = i, method = method,
bXY = TRUE)
dev.off()
pdf(paste0(outpath, "/DBRFigures/SignalBoxPlots/", g1, "-v-", g2,
".fdr.", fdr, ".log2fc.", fc, ".BoxPlots.pdf"), width = 7, height = 5)
# Handle 0 DB peaks in one direction or another.
inc <- FALSE
dec <- FALSE
if (length(g1up) > 0) {
inc <- TRUE
}
if (length(g2up) > 0) {
dec <- TRUE
}
dba.plotBox(results, report = reportdb, contrast = i, method = method,
bReversePos = TRUE, bDBIncreased = inc, bDBDecreased = dec)
dev.off()
pdf(paste0(outpath, "/DBRFigures/VolcanoPlots/", g1, "-v-", g2,
".fdr.", fdr, ".log2fc.", fc, ".VolcanoPlots.pdf"), width = 5,
height = 5)
dba.plotVolcano(results, th = fdr, fold = fc, contrast = i,
method = method)
dev.off()
if (plot.enrich) {
PlotEnrichments(peak.anno.list, outpath, padj.thresh = fdr,
fc.thresh = fc, libraries = enrich.libs, chip = TRUE)
}
}
PlotChIPPCAs(results, outpath, method = method, fdr.thresh = fdr,
fc.thresh = fc, consensus = FALSE)
PlotChIPHeatmaps(results, outpath, method = method, fdr.thresh = fdr,
fc.thresh = fc, breaks = breaks, colors = hmap.colors,
consensus = FALSE)
}
}
}
#' @importFrom GenomicRanges makeGRangesFromDataFrame findOverlaps
#' @importFrom S4Vectors queryHits
.categorize_peaks <- function(dfs) {
# dfs = Named list of two dataframes - first for peaks, second for SEs.
# Make them GRanges objects for easy overlap.
peaks <- makeGRangesFromDataFrame(dfs$peaks, keep.extra.columns = TRUE)
peaks$cat <- "PROMOTER"
peaks$cat[((startsWith(peaks$annotation, "Intron") |
startsWith(peaks$annotation, "Downstream") |
startsWith(peaks$annotation, "Distal") |
startsWith(peaks$annotation, "5'") |
startsWith(peaks$annotation, "3'")))] <- "ENHANCER"
peaks$cat[((startsWith(peaks$annotation, "Exon")))] <- "EXON"
if (!is.null(dfs$ses)) {
ses <- makeGRangesFromDataFrame(dfs$ses, keep.extra.columns = TRUE)
peaks$inSE <- "FALSE"
hits <- findOverlaps(peaks, ses)
peaks$inSE[queryHits(hits)] <- "TRUE"
# Add a column for the peak category.
peaks$cat[((startsWith(peaks$annotation, "Promoter")) &
peaks$inSE == "TRUE")] <- "PROMOTER.SE"
peaks$cat[((startsWith(peaks$annotation, "Intron") |
startsWith(peaks$annotation, "Downstream") |
startsWith(peaks$annotation, "Distal") |
startsWith(peaks$annotation, "5'") |
startsWith(peaks$annotation, "3'")) &
peaks$inSE == "TRUE")] <- "ENHANCER.SE"
peaks$cat[((startsWith(peaks$annotation, "Exon")) &
peaks$inSE == "TRUE")] <- "EXON.SE"
}
return(as.data.frame(peaks))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.