suppressPackageStartupMessages({
  library(ChIPpeakAnno)
  library(biomaRt)
  library(AnnotationHub)
  library(EnsDb.Hsapiens.v75)
  library(TxDb.Hsapiens.UCSC.hg19.knownGene)
  library(BSgenome.Hsapiens.UCSC.hg19)
  library(org.Hs.eg.db)
  library(UpSetR)
  library(seqinr)
  library(motifStack)
})
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
library(BiocManager)
install("jianhong/workshop2020", build_vignettes = TRUE)

Four steps for peak annotation

The functions, toGRanges, annotatePeakInBatch, and addGeneIDs in the ChIPpeakAnno, make the annotation of ChIP-Seq peaks streamlined into four major steps:

  1. Read peak data with toGRanges

  2. Generate annotation data with toGRanges

  3. Annotate peaks with annotatePeakInBatch

  4. Add additional informations with addGeneIDs

## First, load the ChIPpeakAnno package
library(ChIPpeakAnno)

Step 1: convert the peak data from bed/broadPeak/narrowPeak etc. to GRanges with the toGRanges method

Here we use broadPeak file format as example.

## the sample file is included in ChIPpeakAnno package. 
## chage the file path into your own file path to handle your data
path <- system.file("extdata", "Tead4.broadPeak", package="ChIPpeakAnno")
## toGRanges is overlaoded method,
## by define the correct file format to import the file in correct coordinates
peaks <- toGRanges(path, format="broadPeak")
## see top 2 lines of the imported peaks.
## the imported peaks will be packaged into GRanges object
head(peaks, n=2)

Step 2: prepare annotation data with the toGRanges method

ChIP-seq is a kind of DNA-seq. In the mapping step, no transcriptome is used. In the annotation step, both Ensembl or UCSC annotations can be used. Depend on your research, less complex genome annotations, such as UCSC annotations, are prefearable for reproducible and robust gene/transcripts annotation. To discover and explain unknown biological mechanisms, more comprehensive and complex genome annotations are necessary, such as Ensembl.

Here we show how to prepare annotations based on TxDb package with the toGRanges method.

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoDataTxDb <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene)
head(annoDataTxDb, n=2)

We can also prepare annotations based on EnsDb package with the toGRanges method.

library(EnsDb.Hsapiens.v75) ##GRCh37.p13
annoDataEnsDb <- toGRanges(EnsDb.Hsapiens.v75)
head(annoDataEnsDb, n=2)

The shortage of offline annotation packages is that it may not contain up-to-date data. We can also use online annotation data via the biomaRt or AnnotationHub packages. However, to use the biomaRt package, you must be very clear that the default mart service is based the newrest genome assembly. For example, if you are annotating the data mapped to hg19/GRCh37 or mm9/NCBIM37 assembly, you must change the default server into achive server. For more details, please refer the documentation of the biomaRt package.

library(biomaRt)
## here we use ensembl GRCh37.p13 (V75) service, 
## use listEnsemblArchives() function to list the available host
## use listDatasets() function to list the available dataset
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl",
                host = "feb2014.archive.ensembl.org")
## use the getAnnotation function to obtain the TSS for ensembl GRCh37.p13.
annoDataMart <- getAnnotation(mart, featureType = "TSS")
annoDataMart[head(names(annoDataEnsDb), n=2)]
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl",
                host = "feb2014.archive.ensembl.org")
annoDataMart <- readRDS(system.file("extdata", "ChIPpeakAnno", "annoDataMart.rds",
                                    package = "workshop2020", mustWork = TRUE))
annoDataMart[head(names(annoDataEnsDb), n=2)]

Here I show some sample code to get annotations from AnnotationHub.

library(AnnotationHub)
ah <- AnnotationHub()
hg19 <- AnnotationHub::query(ah, c("UCSC", "Hsapiens", "hg19", "knownGene"))
hg19
hg19TxDb <- hg19[["AH52258"]]
annoDataHg19<- toGRanges(hg19TxDb)
identical(annoDataHg19, annoDataTxDb)
## retrieve the gencode v32 annotation
GRCh37 <- AnnotationHub::query(ah, c("GRCh37", "GENCODE", "v32", "TxDb"))
annoDataGencode <- toGRanges(GRCh37[["AH75188"]])
head(annoDataGencode, n=2)

Step 3: annotate the peaks with the annotatePeakInBatch function.

Here we will compare the difference among differnt annotation sources.

First we compare the annotations from ensembl GRCh37.p13 between the EnsDb.Hsapiens.v75 package and ensembl archives.

seqlevelsStyle(annoDataEnsDb)
seqlevelsStyle(peaks)
## keep the seqnames in the same style
if(!identical(seqlevelsStyle(peaks), seqlevelsStyle(annoDataEnsDb))){
  seqlevelsStyle(peaks) <- seqlevelsStyle(annoDataEnsDb)[1]
}
## do annotation by nearest TSS of Ensembl GRCh37.p13
annoEnsDb <- annotatePeakInBatch(peaks, AnnotationData=annoDataEnsDb)
head(annoEnsDb, n=2)
# A pie chart can be used to demonstrate the overlap features of the peaks.
pie1(table(annoEnsDb$insideFeature))
## keep the seqnames in the same style
if(!identical(seqlevelsStyle(peaks), seqlevelsStyle(annoDataMart))){
  seqlevelsStyle(peaks) <- seqlevelsStyle(annoDataMart)[1]
}
## do annotation by nearest TSS of Ensembl GRCh37.p13
annoMart <- annotatePeakInBatch(peaks, AnnotationData=annoDataMart)
head(annoMart, n=2)
# A pie chart can be used to demonstrate the overlap features of the peaks.
pie1(table(annoMart$insideFeature))

We can see the minor difference between them. Lets check the reasons.

## check how many annotations are different from each other
table(start(annoDataMart[names(annoDataEnsDb)])==start(annoDataEnsDb))
table(end(annoDataMart[names(annoDataEnsDb)])==end(annoDataEnsDb))
## get the samples
head(annoDataMart[names(annoDataEnsDb)][
  start(annoDataMart[names(annoDataEnsDb)])!=start(annoDataEnsDb)], n=2)
head(annoDataEnsDb[start(annoDataMart[names(annoDataEnsDb)])!=
                     start(annoDataEnsDb)], n=2)
## check the biomart output
getBM(attributes = c("ensembl_gene_id", 'chromosome_name', "description"),
      filters = "ensembl_gene_id", values = c("LRG_198", "LRG_262"), 
      mart = mart)

Here we can see that the EnsDb.Hsapiens.v75 package already fixed the issue of chromosome_name issue, where the biomart online still output the wrong chromosome_name.

Now we annotate the peaks by UCSC hg19 annotations.

## keep the seqnames in the same style
if(!identical(seqlevelsStyle(peaks), seqlevelsStyle(annoDataTxDb))){
  seqlevelsStyle(peaks) <- seqlevelsStyle(annoDataTxDb)[1]
}
## do annotation by nearest TSS of UCSC hg19 annotations
annoTxDb <- annotatePeakInBatch(peaks, AnnotationData=annoDataTxDb)
head(annoTxDb, n=2)
# A pie chart can be used to demonstrate the overlap features of the peaks.
pie1(table(annoTxDb$insideFeature))

Now we try to use gencode v32 annotations to annotate the peaks.

## keep the seqnames in the same style
if(!identical(seqlevelsStyle(peaks), seqlevelsStyle(annoDataGencode))){
  seqlevelsStyle(peaks) <- seqlevelsStyle(annoDataGencode)[1]
}
## do annotation by nearest TSS of Gencode
annoGencode <- annotatePeakInBatch(peaks, AnnotationData=annoDataGencode)
head(annoGencode, n=2)
# A pie chart can be used to demonstrate the overlap features of the peaks.
pie1(table(annoGencode$insideFeature))

We can see the difference between the annotations among Ensembl, UCSC and Gencode in annotation. Let's check the overlaps of annotations.

Ensembl <- unique(annoDataEnsDb)
UCSC <- unique(annoDataTxDb)
Gencode <- unique(annoDataGencode)
## make the sequence info to same, otherwise can not compare.
genome(seqinfo(Gencode)) <- 
  genome(seqinfo(UCSC)) <- 
  genome(seqinfo(Ensembl)) <- factor("hg19")
isCircular(seqinfo(UCSC)) <- rep(FALSE, length(seqlevels(UCSC)))
isCircular(seqinfo(Gencode)) <- rep(FALSE, length(seqlevels(Gencode)))
seqlevels(Ensembl) <- sub("chrMT", "chrM", seqlevels(Ensembl))
seqlengths(Ensembl)["chrM"] <- seqlengths(UCSC)["chrM"]
## find the overlaps by max gap = 0 bp.
ol <- findOverlapsOfPeaks(Ensembl, 
                          UCSC, 
                          Gencode,
                          ignore.strand = FALSE,
                          connectedPeaks="keepAll")
## venn diagram to show the overlaps
makeVennDiagram(ol, connectedPeaks = "keepAll")

Step 4: add additional annotation with the addGeneIDs function

library(org.Hs.eg.db)
annoEnsDb <- addGeneIDs(annoEnsDb, orgAnn="org.Hs.eg.db", 
                        feature_id_type="ensembl_gene_id",
                        IDs2Add=c("symbol"))
head(annoEnsDb, n=2)
annoTxDb <- addGeneIDs(annoTxDb, orgAnn="org.Hs.eg.db", 
                        feature_id_type="entrez_id",
                        IDs2Add=c("symbol"))
head(annoTxDb, n=2)
annoGencode$entrez_id <- xget(sub("(ENSG\\d{11}).*$", "\\1", 
                                  as.character(annoGencode$feature)), 
                              org.Hs.egENSEMBL2EG, 
                              output = "last")
annoGencode$symbol[!is.na(annoGencode$entrez_id)] <- 
  xget(annoGencode$entrez_id[!is.na(annoGencode$entrez_id)], org.Hs.egSYMBOL)
head(annoGencode)
library(UpSetR)
upset(fromList(list(Ensembl=unique(annoEnsDb$symbol),
                    UCSC=unique(annoTxDb$symbol),
                    Gencode=unique(annoGencode$symbol))),
      order.by = "freq")

As a conclusion, annotate with different annotation resources, even the other parameter keep same, the annotations will be different from each other. To improve the reproducibility, accuracy of an annotation source should be provided. Some people were advertizing their tools by playing this trick by using different annotations to mislead users.

Find overlaps for replicates

The function findOverlapsOfPeaks returns an object of overlappingPeaks, which contains there elements: venn_cnt, peaklist (a list of
overlapping peaks or unique peaks), and overlappingPeaks (a list of data frame consists of the annotation of all the overlapping peaks).

The following examples illustrate the usage of this method to convert BED and GFF file to GRanges, add metadata from orignal peaks to the overlap GRanges using function addMetadata, and visualize the overlapping using function makeVennDiagram.

library(ChIPpeakAnno)
bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno")
gr1 <- toGRanges(bed, format="BED", header=FALSE) 
## one can also try import from rtracklayer
gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno")
gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3)
## must keep the class exactly same as gr1$score, i.e., numeric.
gr2$score <- as.numeric(gr2$score) 
ol <- findOverlapsOfPeaks(gr1, gr2, connectedPeaks = "keepAll")
## add metadata (mean of score) to the overlapping peaks
ol <- addMetadata(ol, colNames="score", FUN=mean) 
head(ol$peaklist[["gr1///gr2"]], n=2)
makeVennDiagram(ol, fill=c("#009E73", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2"), #circle border color
                cat.col=c("#D55E00", "#0072B2")) # label color, keep same as circle border color

But if we check the sum of the overlaps, the total number for each sample are not identical to inputed peak number. This can be fixed by set 'connectedPeaks' parameter into 'keepAll'.

length(gr1)
length(gr2)
62+166
61+166
makeVennDiagram(ol, fill=c("#009E73", "#F0E442"),
                col=c("#D55E00", "#0072B2"),
                cat.col=c("#D55E00", "#0072B2"),
                connectedPeaks = "keepAll")

Select annotation method

Visualize binding site distribution relative to features

The assignChromosomeRegion function can be used to summarize the distribution of peaks over different type of features such as exon, intron, enhancer, proximal promoter, 5' UTR and 3' UTR. This distribution can be summarized in peak centric or nucleotide centric view using the function assignChromosomeRegion. Please note that one peak might span multiple type of features, leading to the number of annotated features greater than the total number of input peaks. At the peak centric view, precedence will dictate the annotation order when peaks span multiple type of features.

The sample code here plots the distribution of peaks are enriched around the promoters.

overlaps <- ol$peaklist[["gr1///gr2"]] ## get the overlapping peaks
## load TxDb to assign genomic elements
library(TxDb.Hsapiens.UCSC.hg19.knownGene) 
aCR<-assignChromosomeRegion(overlaps, nucleotideLevel=FALSE, 
                           precedence=c("Promoters", "immediateDownstream", 
                                         "fiveUTRs", "threeUTRs", 
                                         "Exons", "Introns"), 
                           TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
barplot(aCR$percentage, las=3)

In addition, the distribution of the distance of overlapped peaks to the nearest feature such as the transcription start sites (TSS) can be plotted by the binOverFeature function. The sample code here plots the distribution of peaks around the TSS.

binOverFeature(overlaps, annotationData=annoDataTxDb,
               radius=5000, nbins=20, FUN=length, errFun=0,
               ylab="count", 
               main="Distribution of aggregated peak numbers around TSS")

Annotate peaks

The annotatePeakInBatch function provide multiple methods to annotate the peaks and those methods can be set by combination of multiple parameters. The 'output' is the key parameter to determine the annotation method. The default method is search the nearest features calculated as 'PeakLoc - FeatureLocForDistance'. For more information, please refer the documentation of the annotatePeakInBatch function.

As shown from the distribution of aggregated peak numbers around TSS and the distribution of peaks in different of chromosome regions, most of the peaks locate around TSS. Therefore, it is reasonable to use the annotatePeakInBatch or annoPeaks to annotate the peaks to the promoter regions of Hg19/GRCh37 genes. Promoters can be specified with 'bindingRegion'. For the following example, promoter region is defined as upstream 2000 and downstream 500 from TSS (bindingRegion=c(-2000, 500)).

overlaps.anno <- annotatePeakInBatch(overlaps, 
                                     AnnotationData=annoDataTxDb, 
                                     output="nearestBiDirectionalPromoters",
                                     bindingRegion=c(-2000, 500))
library(org.Hs.eg.db)
overlaps.anno <- addGeneIDs(overlaps.anno,
                            "org.Hs.eg.db",
                            feature_id_type="entrez_id",
                            IDs2Add = "symbol")
head(overlaps.anno, n=2)
library(WriteXLS)
WriteXLS(as.data.frame(unname(overlaps.anno)), "anno.xls")

The distribution of the common peaks around features can be visualized using a pie chart.

pie1(table(overlaps.anno$insideFeature))

Obtain enriched GO terms and Pathways

The following example shows how to use getEnrichedGO to obtain a list of enriched GO terms with annotated peaks. For pathway analysis, please use function getEnrichedPATH with reactome or KEGG database. Please note that by default feature_id_type is set as "ensembl_gene_id". If you are using TxDb as annotation data, please set it to "entrez_id".

over <- getEnrichedGO(overlaps.anno, orgAnn="org.Hs.eg.db", 
                      feature_id_type="entrez_id",
                      maxP=.05, minGOterm=10, 
                      multiAdjMethod="BH", condense=TRUE)
head(over[["bp"]][, -c(3, 10)], n=2)
library(reactome.db)
path <- getEnrichedPATH(overlaps.anno, "org.Hs.eg.db", "reactome.db", 
                        feature_id_type="entrez_id", maxP=.05)
head(path, n=2)

Output a summary of consensus in the peaks

There are multiple methods to get the consensus in the peaks:

  1. output the fastq file by the getAllPeakSequence function and search the motif by the 3rd program such as homer, MEME, and so on.

  2. test the pre-defined consensus patterns to see if target consensus are enriched or not by the summarizePatternInPeaks function.

  3. calculate the z-scores of all combinations of oligonucleotide in a given length by Markove chain by the oligoSummary function.

Here is an example to get the Z-scores for short oligos.

library(seqinr)
library(BSgenome.Hsapiens.UCSC.hg19)
seq <- getAllPeakSequence(overlaps, 
                          upstream=20, downstream=20, 
                          genome=Hsapiens)
## output the fasta file for the 3nd program
write2FASTA(seq, "test.fa")
## summary of the short oligos
os <- oligoSummary(seq, oligoLength=6, MarkovOrder=3, 
                   quickMotif=TRUE)
## plot the results
zscore <- sort(os$zscore)
h <- hist(zscore, breaks=100, main="Histogram of Z-score")
text(zscore[length(zscore)], max(h$counts)/10, 
     labels=names(zscore[length(zscore)]), srt=90)
## generate the motifs
library(motifStack)
pfms <- mapply(function(.ele, id)
    new("pfm", mat=.ele, name=paste("SAMPLE motif", id)), 
    os$motifs, 1:length(os$motifs))
motifStack(pfms)

Find peaks with bi-directional promoters

Bidirectional promoters are the DNA regions located between TSS of two adjacent genes that are transcribed on opposite directions and often co-regulated by this shared promoter region. Here is an example to find peaks near bi-directional promoters.

bdp <- peaksNearBDP(overlaps, annoDataTxDb, maxgap=5000)
c(bdp$percentPeaksWithBDP, 
  bdp$n.peaks, 
  bdp$n.peaksWithBDP)
head(bdp$peaksWithBDP, n=2)

Determine if there is a significant overlap among multiple sets of peaks

Given two or more peak lists from different TFs, one may be interested in finding whether DNA binding profile of those TFs are correlated, and if correlated, what is the common binding pattern.

Here we will show how to compare binding profiles from multiple transcription factors (TFs) by ChIP-seq sample data of TAF, YY1 and Tead4 from mouse.

path <- system.file("extdata", package="ChIPpeakAnno")
files <- dir(path, "broadPeak")
data <- sapply(file.path(path, files), toGRanges, format="broadPeak")
(names(data) <- gsub(".broadPeak", "", files))

Hypergeometric test

When we test the association between two sets of data based on hypergeometric distribution, the number of all potential binding sites is required. The parameter totalTest in the makeVennDiagram function indicates how many potential peaks in total will be used in the hypergeometric test. It should be larger than the largest number of peaks in the peak list. The smaller it is set, the more stringent the test is. The time used to calculate p-value does not depend on the value of the totalTest. For practical guidance on how to choose totalTest, please refer to the post. The following example makes an assumption that there are 3% of coding region plus promoter region. Because the sample data is only a subset of chromosome 2, we estimate that the total binding sites is 1/24 of possible binding region in the genome.

ol <- findOverlapsOfPeaks(data, connectedPeaks="keepAll")
averagePeakWidth <- mean(width(unlist(GRangesList(ol$peaklist))))
tot <- ceiling(3.3e+9 * .03 / averagePeakWidth / 24)
makeVennDiagram(ol, totalTest=tot, connectedPeaks="keepAll", 
                fill=c("#CC79A7", "#56B4E9", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2", "#E69F00"), #circle border color
                cat.col=c("#D55E00", "#0072B2", "#E69F00"))
## see the difference if we set connectedPeaks to "keepFirstListConsistent"
## set connectedPeaks to keepFirstListConsistent will show consistent total 
## number of peaks for the first peak list.
makeVennDiagram(ol, totalTest=tot, connectedPeaks="keepFirstListConsistent", 
                fill=c("#CC79A7", "#56B4E9", "#F0E442"),
                col=c("#D55E00", "#0072B2", "#E69F00"),
                cat.col=c("#D55E00", "#0072B2", "#E69F00"))

Permutation test

The above hypergeometric test requires users to input an estimate of the total potential binding sites for a given TF. To circumvent this requirement, we implemented a permutation test called peakPermTest. Before performing a permutation test, users need to generate random peaks using the distribution discovered from the input peaks for a given feature type (transcripts or exons), to make sure the binding positions relative to features, such as TSS and geneEnd, and the width of the random peaks follow the distribution of that of the input peaks.

Alternatively, a peak pool representing all potential binding sites can be created with associated binding probabilities for random peak sampling using the preparePool function. Here is an example to build a peak pool for human genome using the transcription factor binding site clusters (V3) (see ?wgEncodeTfbsV3) downloaded from ENCODE with the HOT spots (?HOT.spots) removed. HOT spots are the genomic regions with high probability of being bound by many TFs in ChIP-seq experiments. We suggest remove those HOT spots from the peak lists before performing permutation test to avoid the overestimation of the association between the two input peak lists. Users can also choose to remove ENCODE blacklist for a given species. The blacklists were constructed by identifying consistently problematic regions over independent cell lines and types of experiments for each species in the ENCODE and modENCODE datasets. Please note that some of the blacklists may need to be converted to the correct genome assembly using liftover utility.

Following are the sample codes to do the permutation test using the permTest function:

    data(HOT.spots)
    data(wgEncodeTfbsV3)
    hotGR <- reduce(unlist(HOT.spots))
    removeOl <- function(.ele){
        ol <- findOverlaps(.ele, hotGR)
        if(length(ol)>0) .ele <- .ele[-unique(queryHits(ol))]
        .ele
    }
    TAF <- removeOl(data[["TAF"]])
    TEAD4 <- removeOl(data[["Tead4"]])
    YY1 <- removeOl(data[["YY1"]])
    # we subset the pool to save demo time
    set.seed(1)
    wgEncodeTfbsV3.subset <- 
        wgEncodeTfbsV3[sample.int(length(wgEncodeTfbsV3), 2000)]
    pool <- new("permPool", grs=GRangesList(wgEncodeTfbsV3.subset), N=length(YY1))
    pt1 <- peakPermTest(YY1, TEAD4, pool=pool, seed=1, force.parallel=FALSE)
    plot(pt1)
    pt2 <- peakPermTest(YY1, TAF, pool=pool, seed=1, force.parallel=FALSE)
    plot(pt2)

Metagene analysis for given feature/peak ranges

You can easily visualize and compare the binding patterns of raw signals of multiple ChIP-Seq experiments using function featureAlignedHeatmap and featureAlignedDistribution.

features <- ol$peaklist[[length(ol$peaklist)]]
feature.recentered <- reCenterPeaks(features, width=4000)
## here we also suggest importData function in bioconductor trackViewer package 
## to import the coverage.
## compare rtracklayer, it will save you time when handle huge dataset.
library(rtracklayer)
files <- dir(path, "bigWig")
if(.Platform$OS.type != "windows"){
    cvglists <- sapply(file.path(path, files), import, 
                       format="BigWig", 
                       which=feature.recentered, 
                       as="RleList")
}else{## rtracklayer can not import bigWig files on Windows
    load(file.path(path, "cvglist.rds"))
}
names(cvglists) <- gsub(".bigWig", "", files)
feature.center <- reCenterPeaks(features, width=1)
sig <- featureAlignedSignal(cvglists, feature.center, 
                            upstream=2000, downstream=2000)
##Because the bw file is only a subset of the original file,
##the signals are not exists for every peak.
keep <- rowSums(sig[[2]]) > 0
sig <- sapply(sig, function(.ele) .ele[keep, ], simplify = FALSE)
feature.center <- feature.center[keep]
heatmap <- featureAlignedHeatmap(sig, feature.center, 
                                 upstream=2000, downstream=2000,
                                 upper.extreme=c(3,.5,4))
featureAlignedDistribution(sig, feature.center, 
                           upstream=2000, downstream=2000,
                           type="l")


jianhong/workshop2020 documentation built on April 20, 2021, 9:54 a.m.