knitr::opts_chunk$set(echo = TRUE)
In this case study, we perform differential peak calling on ChIP-seq data of the histone H3K4Me3 for samples from two cell lines (K562 and Gm12878) using publicly available data generated by the ENCODE Project. The same data set is used for all ChIP-seq differential testing case studies.
While several approaches have been proposed for differential peak calling, in this case study we examine a simple approach of testing for differential peaks in the promoter regions of all genes. While using pre-defined regions can reduce the power of the test to detect differences in non-promoter regions, since H3K4Me3 is an active marker of gene expression, restricting tests to promoter regions is not unreasonable. Differential peaks are tested using DESeq2.
library(dplyr) library(ggplot2) library(SummarizedBenchmark) library(BiocParallel) library(GenomicRanges) library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(DESeq2) library(Rsubread) ## load helper functions for (f in list.files("../R", "\\.(r|R)$", full.names = TRUE)) { source(f) } ## project data/results folders datdir <- "data" resdir <- "results" sbdir <- "../../results/ChIPseq" dir.create(datdir, showWarnings = FALSE) dir.create(resdir, showWarnings = FALSE) dir.create(sbdir, showWarnings = FALSE) ## intermediary files we create below count_file <- file.path(resdir, "h3k4me3-promoters-counts.rds") result_file <- file.path(resdir, "h3k4me3-promoters-results.rds") bench_file <- file.path(sbdir, "h3k4me3-promoters-benchmark.rds") ## set up parallel backend cores <- as.numeric(Sys.getenv("SLURM_NTASKS")) multicoreParam <- MulticoreParam(workers = cores)
We downloaded the bam files directly from UCSC ENCODE portal.
broad_url <- "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHistone/" broad_bams <- c("wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1.bam", "wgEncodeBroadHistoneGm12878H3k04me3StdAlnRep2V2.bam", "wgEncodeBroadHistoneK562H3k4me3StdAlnRep1.bam", "wgEncodeBroadHistoneK562H3k4me3StdAlnRep2.bam") for (i_bam in broad_bams) { if (!file.exists(file.path(datdir, i_bam))) { download.file(paste0(broad_url, i_bam), destfile = file.path(datdir, i_bam)) } i_bai <- paste0(i_bam, ".bai") if (!file.exists(file.path(datdir, i_bai))) { download.file(paste0(broad_url, i_bai), destfile = file.path(datdir, i_bai)) } } uw_url <- "http://hgdownload.soe.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeUwHistone/" uw_bams <- c("wgEncodeUwHistoneGm12878H3k4me3StdAlnRep1.bam", "wgEncodeUwHistoneGm12878H3k4me3StdAlnRep2.bam", "wgEncodeUwHistoneK562H3k4me3StdAlnRep1.bam", "wgEncodeUwHistoneK562H3k4me3StdAlnRep2.bam") for (i_bam in uw_bams) { if (!file.exists(file.path(datdir, i_bam))) { download.file(paste0(uw_url, i_bam), destfile = file.path(datdir, i_bam)) } i_bai <- paste0(i_bam, ".bai") if (!file.exists(file.path(datdir, i_bai))) { download.file(paste0(uw_url, i_bai), destfile = file.path(datdir, i_bai)) } }
We determine sample metadata from the file names.
bamfiles <- c(broad_bams, uw_bams) labs <- gsub("wgEncode(.*)Histone.*", "\\1", basename(bamfiles)) cells <- gsub("wgEncode.*Histone(.*)H3k.*", "\\1", basename(bamfiles)) meta <- data.frame(cellline = cells, lab = labs, file = bamfiles)
Next, we identify promotor regions using the UCSC "Known Gene" annotations for human genome assembly hg19 (GRCh37).
prom <- promoters(genes(TxDb.Hsapiens.UCSC.hg19.knownGene)) prom <- keepSeqlevels(prom, paste0("chr", c(1:22, "X", "Y", "M")), pruning.mode = "coarse")
We count reads for all promoter regions.
if (file.exists(count_file)) { rc <- readRDS(count_file) } else { anno <- data.frame(GeneID = seq_len(length(prom)), Chr = seqnames(prom), Start = start(prom), End = end(prom), Strand = strand(prom)) rc <- featureCounts(files = file.path(datdir, bamfiles), annot.ext = anno, allowMultiOverlap = TRUE, minOverlap = 50, readExtension3 = 150, ignoreDup = TRUE)$counts saveRDS(rc, file = count_file) }
We test for differential binding at each of the promoters.
if (file.exists(result_file)) { dat <- readRDS(result_file) } else { dds <- DESeqDataSetFromMatrix(countData = rc, colData = as.data.frame(meta), design = ~ lab + cellline) dds <- DESeq(dds, fitType = "mean") dat <- as.data.frame(results(dds, independentFiltering = FALSE)) colnames(dat) <- c('ind_covariate', 'effect_size', 'SE', 'test_statistic', 'pval', 'padj') dat <- dat[!is.na(dat$effect_size), ] saveRDS(dat, file = result_file) }
Here, we can check diagnostic plots to assess whether candidate covariates are actually informative.
We consider the average coverage across conditions and replicates as a potential covariate of interest. This is motivated by our prior assumption that ChIP-seq data always involves two sets of regions: signal and background. Sometimes, there are a few regions with signal coverage between signal and background. In this example, we see that p-values are differently distributed under low, median and high coverage.
rank_scatter(dat, pvalue = "pval", covariate = "ind_covariate")
strat_hist(dat, pvalue = "pval", covariate = "ind_covariate", maxy = 30)
We use the common BenchDesign
with the set of multiple testing correction
methods already included. Now, we're ready to construct the SummarizedBenchmark
object, which will run the functions specified in each method.
We also add in Scott's FDR Regression (both
nulltype = "empirical"
and nulltype = "theoretical"
)
since our test statistics are normally-distributed.
if (file.exists(bench_file)) { sb <- readRDS(bench_file) } else { bd <- initializeBenchDesign() bd <- addBMethod(bd, "fdrreg-t", FDRreg::FDRreg, function(x) { x$FDR }, z = test_statistic, features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1), nulltype = 'theoretical', control = list(lambda = 0.01)) bd <- addBMethod(bd, "fdrreg-e", FDRreg::FDRreg, function(x) { x$FDR }, z = test_statistic, features = model.matrix( ~ splines::bs(ind_covariate, df = 3) - 1), nulltype = 'empirical', control = list(lambda = 0.01)) sb <- buildBench(bd, data = dat, ftCols = c("ind_covariate"), parallel = TRUE, BPPARAM = multicoreParam) saveRDS(sb, file = bench_file) }
Next, we'll add the default performance metric for q-value assays. First, we have to rename the assay to 'qvalue'.
assayNames(sb) <- "qvalue" sb <- addDefaultMetrics(sb)
Now, we'll plot the results.
rejections_scatter(sb, supplementary = FALSE) rejection_scatter_bins(sb, covariate = "ind_covariate", bins = 4, supplementary = FALSE)
plotFDRMethodsOverlap(sb, alpha = 0.05, nsets = ncol(sb), order.by = "freq", decreasing = TRUE, supplementary = FALSE)
covariateLinePlot(sb, alpha = 0.05, covname = "ind_covariate")
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.