knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

In this vignette for the MiniChip package we will generate a GRanges object of ChIP-seq peaks for Adnp, as well as a matched Granges object of randomized peaks across the mouse genome. Then we will plot the distribution of Adnp read counts (per Million reads) across the peak summits.

suppressPackageStartupMessages({
  library(MiniChip)
  library(ComplexHeatmap)
  library(GenomicFeatures)
  library(ggplot2)
  library(reshape2)
  library(dplyr)
  library(BSgenome.Mmusculus.UCSC.mm10)
  #library(TxDb.Mmusculus.UCSC.mm10.knownGene)
  library(TxDb.Mmusculus.UCSC.mm10.ensGene)
  library(EnsDb.Mmusculus.v79)
})

Generate a peaks GRanges object.

First we load the narrowPeak output files generated by MACS peak finding for all 3 Adnp ChIP replicates, name the culumns in a meaningful way, and turn them into GRanges objects.

peaks1.d <- read.table(system.file("extdata", "Adnp_rep1_chr11_peaks.narrowPeak", package = "MiniChip"),header=FALSE)
peaks2.d <- read.table(system.file("extdata", "Adnp_rep2_chr11_peaks.narrowPeak", package = "MiniChip"),header=FALSE)
peaks3.d <- read.table(system.file("extdata", "Adnp_rep3_chr11_peaks.narrowPeak", package = "MiniChip"),header=FALSE)

names(peaks1.d) <- c("chr","start","end","name","score","empty","foldchange","pvalue","qvalue","summit")
names(peaks2.d) <- c("chr","start","end","name","score","empty","foldchange","pvalue","qvalue","summit")
names(peaks3.d) <- c("chr","start","end","name","score","empty","foldchange","pvalue","qvalue","summit")

peaks1 <- makeGRangesFromDataFrame(peaks1.d,
                                   keep.extra.columns=TRUE,
                                   ignore.strand=TRUE,
                                   seqinfo=NULL,
                                   seqnames.field=c("chr"),
                                   start.field=c("start"),
                                   end.field=c("end"),
                                   starts.in.df.are.0based=TRUE)

peaks2 <- makeGRangesFromDataFrame(peaks2.d,
                                   keep.extra.columns=TRUE,
                                   ignore.strand=TRUE,
                                   seqinfo=NULL,
                                   seqnames.field=c("chr"),
                                   start.field=c("start"),
                                   end.field=c("end"),
                                   starts.in.df.are.0based=TRUE)

peaks3 <- makeGRangesFromDataFrame(peaks3.d,
                                   keep.extra.columns=TRUE,
                                   ignore.strand=TRUE,
                                   seqinfo=NULL,
                                   seqnames.field=c("chr"),
                                   start.field=c("start"),
                                   end.field=c("end"),
                                   starts.in.df.are.0based=TRUE)

Then we select only the peaks which occur in at least 2 of the 3 replicates using the GRanges subsetByOverlaps and overlapsAny functions.

# peaks that occur in at least 2/3 replicates
peaks3.1 <- subsetByOverlaps(peaks3,peaks1)
peaks3.2 <- subsetByOverlaps(peaks3,peaks2)
peaks1.2 <- subsetByOverlaps(peaks1,peaks2)

#keep all 3.1 peaks, find 3.2 and 1.2 peaks that do not overlap 3.1 peaks

# peaks which are found in 3 and 2, but not in 3 and 1, and not in 1 and 2
peaks3.2u <- peaks3.2[overlapsAny(peaks3.2,peaks3.1) ==FALSE & overlapsAny(peaks3.2,peaks1.2) == FALSE]
# peaks which are found in 1 and 2, but not in 3 and 1, and not in 3 and 2
peaks1.2u <- peaks1.2[overlapsAny(peaks1.2,peaks3.1) ==FALSE & overlapsAny(peaks1.2,peaks3.2) == FALSE]
# add together all 3.1 peaks as well as unique 3.2 and 1.2 peaks
peaks <- c(peaks3.1,peaks3.2u,peaks1.2u)

We can now generate a peaknames object describing the peaks GRanges object we have just selected (for later use), and look at the peaks GRanges object to make sure it countains seqnames (chromosomes), start, end, strand, summit, and name columns, as required by many functions in the MiniChip package.

peaknames <- "Adnp_peaks"
peaks

Generate a randomly shuffled peaks GRanges object.

Then we use the SimulatePeaks function from the MiniChip package to generate a GRanges object called random.peaks that contains the same number of peaks (of the same lengths) as the original peaks GRanges object, but ech one shuffled to a randomly chosen location in the mouse genome. This is a useful control object to have when comparing the overlap of ChIP peaks with genomic annotations, for example genes or repeat elements.

#  randomize peak locations
nSites <- length(peaks)
peak.widths <- width(peaks)
random.peaks <-  SimulatePeaks(nSites,peak.widths,chromosomeSizes=system.file("extdata", "chrNameLength_mm10_chr11.txt", package = "MiniChip"))

Draw heatmaps of Adnp ChIP-seq read counts across the peak summits (+/- 3025bp).

First, we select the bam files of mapped reads that we want to use from the gbuehler deepSeqRepos....

#select bam files from Adnp experiment
bamFiles <- list.files(system.file("extdata", package = "MiniChip"), full.names=TRUE,pattern="*bam$")
#all.bamFiles <- list.files("/tachyon/scratch/gbuehler/deepSeqData/bam/", full.names=TRUE,pattern="*bam$")
#all.bamFiles <- grep("test",all.bamFiles,value=TRUE,invert=TRUE)
#bamFiles <- grep("1950F",all.bamFiles,value=TRUE)[1:8]
bamFiles

...and extract short but meaningful names for the bam files decribing what has been chipped.

bamNames <- gsub(paste(system.file("extdata", package = "MiniChip"),"/",sep=""),"",bamFiles)
bamNames <- gsub("_chr11.bam","",bamNames)
bamNames

You can select any bamFile that you are interested in, and the number of bamFiles you can choose is unlimited.

Check GC bias.

To check if the enrichments are biased towards GC rich regions, we can use the GCbias function to plot GC content versus the read counts for each alignment file.

GCbias(bamFiles[1:2],bamNames[1:2],pe="none", restrict="chr11",GCprob=TRUE, genome=BSgenome.Mmusculus.UCSC.mm10)

It looks like there is no GC bias in our selected bam files, so we can continue to calculate the heatmaps.

Calculate heatmaps (normalized read counts in windows around peaks).

Now we define the span (distance from peak summit to left and right, in this case 3025bp), and step size for windows across the peaks (in this case 50bp). Furthermore we crate a summits GRanges object that contains the genomic coordinates of the peak summits as s trat and end positions.

Then we use the SummitHeatmap function for the MiniChip package to calculate the counts per million (cpm) reads of each bam file in each window across the summit of each peak in the peaks object.

span <- 3025
step <- 50
summits <- peaks
start(summits) <- start(summits) + summits$summit
end(summits) <- start(summits)
names(summits) <- elementMetadata(summits)[,"name"]

counts <- SummitHeatmap(summits,bamFiles,bamNames,span,step,useCPM=TRUE,readShiftSize=75)

This will return the counts object, a list of length 8 (since we are using 8 bam files), where each element of the list contains a matrix of cpm values across all windows for all peaks. If you want to also save a seperately clustered heatmap for each bam file to a file, set plotHM=TRUE.

To plot heatmaps for each sample (bam file) sorted by the first bam file, you can use the DrawSummitHeatmaps function. By providing shorter names for the samples you can ensure nice heatmap titles. Here we select only the ChIP samples (and not the Input samples).

ht_list <- DrawSummitHeatmaps(counts[c(1,2,3)], bamNames[c(1,2,3)],orderSample=1, use.log=TRUE,
                              bottomCpm=c(0,0,0), medianCpm = c(2,2,2), topCpm = c(4,4,4), orderWindows=2, TargetHeight=500)
draw(ht_list, padding = unit(c(3, 8, 8, 2), "mm"))

Draw cumulative plots of Adnp ChIP-seq and Input read counts across the peak summits (+/- 3025bp).

First we will calculate the mean counts per million (cpm) values for the three replicates of Adnp Chip and Input using the SummarizeHeatmaps function.

inx.ChIP <- grep("ChIP",names(counts))
Adnp <- grep("Adnp",names(counts[inx.ChIP]),value=TRUE)
Input <- grep("Input",names(counts),value=TRUE)[1:3]

sampleList <- list(Adnp=Adnp,Input=Input)
meanCounts <- SummarizeHeatmaps(counts,sampleList)

Then we can split the epaks into those that overlap a promoter and those that do not, and generate the cumululative plots using the CumulativePlots function.

#select peaks that overlap a TSS (+/- 1kb)
#txdb=loadDb("/tungstenfs/scratch/gbuehler/michi/Annotations/GENCODE/Mouse/release_M23/gencode.vM23.annotation.txdb.sqlite")
txdb=TxDb.Mmusculus.UCSC.mm10.ensGene
TSSs <- promoters(txdb,upstream=1000,downstream=1000)
summit_TSS_overlap <- overlapsAny(summits,TSSs)
summit_TSS_overlap_names <- names(summits[summit_TSS_overlap == TRUE])

#cumulative plots
CumulativePlots(meanCounts,bamNames = names(meanCounts),
                                   span=3025,step=50,summarizing = "mean",overlapNames = summit_TSS_overlap_names)

This suggests that some of the Adnp peaks overlap promoters, and that they have as much enrichment in Adnp as the ones which do not overlap promoters. To see which ones overlap promoters, we can add the promoter annotation to the heatmaps, using the AnnotationHeatmap function.

Draw heatmaps of Adnp ChIP-seq and Input read counts and promoter annotation around the peak summits (+/- 3025bp).

promoters <- promoters(txdb,upstream=150,downstream=150)
annoname <- "promoters"
promoters.overlap <- AnnotationHeatmap(summits,annotation=promoters,annoname,span,step)
annos <- list(promoters.overlap)
names(annos) <- c("promoters")
allCounts <- c(meanCounts,annos)

cols <- c("darkblue","darkred","darkgreen")
upper.cpm <- c(rep(2,2),rep(1,1))
splitHM <- ifelse(summit_TSS_overlap==TRUE,"TSS","no TSS")

heatmap_list <- DrawSummitHeatmaps(allCounts, names(allCounts), plotcol= cols, medianCpm = upper.cpm, orderSample = 1, use.log=TRUE, summarizing = "mean", orderWindows=2,MetaScale=c("all","all","individual"), TargetHeight=500,
                                   splitHM=splitHM)
draw(heatmap_list, padding = unit(c(3, 8, 8, 2), "mm"),show_heatmap_legend=FALSE)

generate the data frames needed to plot tracks for a selected genomic region

#bigwig files
bigwigFiles <- list.files(system.file("extdata", package = "MiniChip"), full.names=TRUE,pattern="*bw$")
bigwigNames <- c("Adnp_r1","Adnp_r2")
bigwigGroupNames <- c("Adnp","Adnp")

#region for plotting
ROI <- peaks[peaks$score==max(peaks$score)]
ROI <- resize(ROI,1000L, fix="center")

#gene annotations
genes <- genes(txdb)
gene_anno <- genes(EnsDb.Mmusculus.v79)
mcols(genes) <- dplyr::left_join(data.frame(mcols(genes)),data.frame(mcols(gene_anno)),by=c("gene_id"="gene_id"))
exons <- exons(txdb)

#repeat annotations
data(repeat_annotation, package = "MiniChip")
reps

#calculate data frames for track plotting
trackData <- calcTracks(bigwigFiles, bigwigNames, bigwigGroupNames, ROI, CoverageSmoothing=TRUE, smoothing_window=20,reps=reps,genes=genes,exons=exons)

plot the tracks

# Define the hex codes for each group
  color_map <- c(
    "Adnp" = "#3aa3db"
  )
plotTracks(trackData, bigwigNames, color_map)


fmi-basel/gbuehler-MiniChip documentation built on June 13, 2025, 6:15 a.m.