#' sample background sequences based on mathed GC content distribution from peaks
#' @param features sets of features (peaks) for comparison (Grange object)
#' @param genome genome sequences
sampleBackgroundSeq <- function(obj, features, genome="BSgenome.Hsapiens.NCBI.GRCh38", nsample=NULL){
require(Biostrings)
peaks <- obj[["peaks"]]
if(is.null(peaks)){
stop("please provide peak when run addAtacObj function")
}
if(is.null(nsample)){
nsample <- length(features)
}
if(genome == "BSgenome.Hsapiens.NCBI.GRCh38"){
require(BSgenome.Hsapiens.NCBI.GRCh38)
genomeSeq <- BSgenome.Hsapiens.NCBI.GRCh38
seqnames(genomeSeq) <- paste("chr", seqnames(genomeSeq), sep="")
#species code
spcode <- 9606
# remove uncharaterized chromosomes
chrlist <- paste("chr", seq(1:22), sep="")
chrlist <- c(chrlist, "chrMT", "chrX", "chrY")
peaks <- peaks[as.character(peaks@seqnames) %in% chrlist]
}
# remove uncharaterized chromosomes
chrlist <- paste("chr", seq(1:22), sep="")
chrlist <- c(chrlist, "chrMT", "chrX", "chrY")
features <- features[as.character(features@seqnames) %in% chrlist]
# get background regions
backgroundRegion <- GenomicRanges::setdiff(peaks, features)
peakSeq <- Biostrings::getSeq(genomeSeq, backgroundRegion)
peakSeqCount <- Biostrings::alphabetFrequency(peakSeq)
# GC distribution for all peaks
GCbackground <- apply(peakSeqCount, 1, function(r){
round((r[2] + r[3])/(r[1]+r[2]+r[3]+r[4]), 2)
})
featureSeq <- Biostrings::getSeq(genomeSeq, features)
featureSeqCount <- Biostrings::alphabetFrequency(featureSeq)
GCfeatures <- apply(featureSeqCount, 1, function(r){
round((r[2] + r[3])/(r[1]+r[2]+r[3]+r[4]), 2)
})
# fit distribution
featureWeights <- rep(0, length(backgroundRegion))
densityEst <- density(GCfeatures, kernel = "gaussian", bw = 1)
weights <- approx(densityEst$x, densityEst$y, xout=GCbackground,
yright = 0.00001,
yleft = 0.00001)$y
featureWeights <- weights
sampledRegion <- backgroundRegion[sample(seq(length(backgroundRegion)), size = nsample,
prob = featureWeights, replace=FALSE)]
return(sampledRegion)
}
#' identify enriched motifs for a feature set (peaks) based on query regions
#' @param targetRegion genomic region sets for do enrichment analysis (e.g. chr17:44069893-44071328)
#' @param queryRegion genomic regions for all features (peaks)
enrichMotifs <- function(obj, targetRegion, sampleMethod="match.GC", nsample=10000){
options("scipen"=0.0001, "digits"=4)
motifMatrix <- obj[["motifMatrix"]]$matrix
motifname <- obj[["motifMatrix"]]$motifname
if(is.null(motifMatrix)){
stop("please calculate motif matrix first using addMotifMatrix")
}
targetMotifs <- motifMatrix[rownames(motifMatrix) %in% targetRegion, ]
# transform to Grange object
targetChrs <- sapply(strsplit(targetRegion, ":"), `[`, 1)
targetGenomicRegion <- sapply(strsplit(targetRegion, ":"), `[`, 2)
targetRegionGrange <- GRanges(targetChrs, IRanges(as.numeric(sapply(strsplit(targetGenomicRegion, "-"), `[`, 1)),
as.numeric(sapply(strsplit(targetGenomicRegion, "-"), `[`, 2))))
# sample regions
backgroundSeq <- sampleBackgroundSeq(obj, targetRegionGrange, nsample=nsample)
# transform back to characters
backgroundSeq <- paste0(as.character(seqnames(backgroundSeq)), ":",
start(backgroundSeq), "-", end(backgroundSeq))
motifMatrix.bg <- motifMatrix[backgroundSeq, ]
targetMotifCounts <- Matrix::colSums(targetMotifs)
bgMotifCounts <- Matrix::colSums(motifMatrix.bg)
observed <- targetMotifCounts / length(targetRegion) * 100
sampled <- bgMotifCounts / length(backgroundSeq) * 100
fold.enrichment <- round(observed / sampled, 2)
# hypergeometric test
pVals <- unlist(lapply(seq(targetMotifCounts), function(r){
phyper(targetMotifCounts[r]-1, bgMotifCounts[r],
n=nrow(motifMatrix.bg) - bgMotifCounts[r],
k=length(targetRegion), lower.tail=FALSE)
}))
enrichedMotifs <- data.frame(motif=colnames(targetMotifs),
observedCount=targetMotifCounts,
backgroundCount=bgMotifCounts,
observedPercent=observed,
backgroundPercent=sampled,
fold.enrichment=fold.enrichment,
pvalue = pVals, name=motifname)
enrichedMotifs <- enrichedMotifs[order(enrichedMotifs$pvalue), ]
return(enrichedMotifs)
}
#' identify overrepresentative motifs in DAR regions for each cluster
#' @param obj pagoda2 object
#' @param PvalueCutoff pvalue cutoff for significant DAR regions
#' @export
enrichMotifsDAR <- function(obj, sampleMethod="match.GC", PvalueCutoff=0.05, peakCutoff=500,
nsample=10000, genome="BSgenome.Hsapiens.NCBI.GRCh38", use.pvalues=FALSE){
if(is.null(obj[["DAR"]])){
stop("please run idenDAR to identify differatial accessible regions first")
}
dar <- obj[["DAR"]]
clusters <- names(dar)
enrichedMotifCluster <- parallel::mclapply(seq(length(clusters)), function(r){
darRegion <- dar[[r]]
if(use.pvalues){
sigDARregion <- rownames(darRegion[darRegion$PValue < PvalueCutoff, ])
sigDARregion <- sigDARregion[1:min(length(sigDARregion), peakCutoff)]
} else{
sigDARregion <- rownames(darRegion[darRegion$adjPValue<PvalueCutoff, ])
sigDARregion <- sigDARregion[1:min(length(sigDARregion), peakCutoff)]
}
if(length(sigDARregion)>5){
enrichedMotifCluter <- enrichMotifs(obj, sigDARregion, sampleMethod="match.GC", nsample=nsample)
} else{
enrichedMotifCluter <- NULL
}
enrichedMotifCluter
}, mc.cores=5)
names(enrichedMotifCluster) <- clusters
obj[["enrichedMotifs"]] <- enrichedMotifCluster
return(obj)
}
#' plot top motifs for each cluster
#' @export
plotTopMotifs <- function(obj, ntop=5, genome="human"){
require(ggseqlogo)
require(ggplot2)
require(JASPAR2018)
require(TFBSTools)
enrichedMotifs <- obj[["enrichedMotifs"]]
if(is.null(enrichedMotifs)){
stop("please run enrichMotifsDAR first")
}
if(genome=="human"){
pwms <- TFBSTools::getMatrixSet(JASPAR2018, opts = list(species = 9606, all_versions = FALSE))
}
motifPlot <- lapply(seq(as.numeric(names(enrichedMotifs))), function(r){
clMotifs <- enrichedMotifs[[r]]
if(!is.null(clMotifs) & !is.null(nrow(clMotifs[2]))){
motifs <- rownames(clMotifs)[1:ntop]
motifs.pwm <- pwms[names(pwms) %in% motifs]
motif.profiles <- lapply(motifs.pwm, function(m){
m@profileMatrix})
motif.names <- unlist(lapply(motifs.pwm, function(m){
m@name}))
names(motif.profiles) <- motif.names
ggseqlogo(motif.profiles, ncol=1) + ggplot2::ggtitle(paste("cluster", r, sep=" "))
}
})
# remove clusters that have null element
motifPlot <- plyr::compact(motifPlot)
cowplot::plot_grid(plotlist=motifPlot, nrow=3)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.