# Copyright © 2015-2017 The Bratwurst package contributors
# This file is part of the Bratwurst package. The Bratwurst package is licenced
# under GPL-3
#==============================================================================#
# Author: Sebastian Steinhauser - s.steinhauser@gmx.net
# Date: 12.04.2016
# Comments: Runs NMF-GPU for a given matrix.
#==============================================================================#
# Load Libraries
library(GenomicRanges)
library(ggplot2)
library(gridExtra)
library(data.table)
library(grid)
library(scales)
library(cowplot)
library(RColorBrewer)
library(ComplexHeatmap)
library(rtracklayer)
library(limma)
library(reshape2)
library(cluster)
library(circlize)
# Source required functions.
#source('/home/steinhau/NMF-GPU_toolbox/src/nmfGPU_functions.R')
source('/home/thymin/Projects/NMF-GPU_toolbox/src/nmfGPU_functions.R')
#==============================================================================#
# MAIN #
#==============================================================================#
samples <- 'mENCODE-SE'
# Define result path for plots etc.
prefix <- 'heart' #HSC'
#result.path <- file.path('/home/steinhau/NMF-GPU_toolbox/results/', sprintf('%s_meanSESsubtract_matrix_raw', prefix))
result.path <- file.path('/home/thymin/Projects/NMF-GPU_toolbox/results/', sprintf('%s_meanSESsubtract_matrix_raw', prefix))
result.path <- file.path('/home/steinhau/NMF-GPU_toolbox/results/', sprintf('%s_mergedRPKM_matrix', prefix))
if(!file.exists(result.path)){dir.create(result.path)}
# Read matrix file.
matrix.file <- file.path('/home/steinhau/NMF-GPU_toolbox/data/mENCODE', prefix,
'H3K27ac_SESsubtract_mean_mergedConPeaks_matrix.txt')
matrix.file <- '/home/steinhau/NMF-GPU_toolbox/data/HSC/H3K27Ac_RPKM_mean_mergedConPeaks_matrix.txt'
raw.matrix <- fread(matrix.file, sep = '\t')
# Compute mean peak score threshold.
max.peakScore <- apply(raw.matrix, 1, max)
peakScore.thres <- 30 #quantile(max.peakScore, 0.25)
svg(file.path(result.path, 'hist_maxConPeakScore_thres.svg'))
hist(max.peakScore, xlim = c(0, 100), breaks = 10^4)
abline(v = peakScore.thres, col = 'red')
dev.off()
# Filter regions with low coverage ( <= 10 norm read counts on average)
i.covKeep <- which(max.peakScore >= peakScore.thres)
raw.matrix <- raw.matrix[i.covKeep,]
# Load Homer annotated peak file.
annoPeaks.file <- file.path('/home/steinhau/NMF-GPU_toolbox/data/mENCODE', prefix,
'H3K27ac_mergedConPeaks_anno.bed')
annoPeaks.file <- file.path('/home/steinhau/NMF-GPU_toolbox/data/HSC/H3K27Ac_mergedConPeaks_anno.bed')
anno.peaks <- fread(sprintf('cat %s | cut -f1,2-4,10', annoPeaks.file))
anno.peaks <- anno.peaks[i.covKeep,]
## Filter matrix for enhancers (TSS distance)
# i.keep <- which(abs(anno.peaks$V5) > 1500)
# raw.matrix <- raw.matrix[i.keep,]
# filtered.annoPeaks <- anno.peaks[i.keep,]
## Normalisation & Standardisation
#norm.matrix <- quantileTransform(raw.matrix)
#m.fac <- nrow(norm.matrix)%%10 - 1
#norm.matrix <- norm.matrix*10^2
# #boxplot(raw.matrix)
# norm.matrix <- normalizeQuantiles(raw.matrix)
# #norm.matrix <- normalizeUpperQuartile(raw.matrix)
# #boxplot(norm.matrix)
# Use raw data.
# norm.matrix <- raw.matrix
# norm.matrix[norm.matrix < 0] <- 0
# Apply Sigmoid Transformation to get scores between 0 -> 1.
norm.matrix <- raw.matrix
norm.matrix[norm.matrix < 0] <- 0
norm.matrix <- norm.matrix + 10^-6
# norm.matrix <- apply(norm.matrix, 2, sigmoidTransform)
# norm.matrix <- norm.matrix + 10^-6
# Feature selection by Median Absolute Deviation.
# row.mad <- apply(norm.matrix, 1, mad)
# norm.matrix <- norm.matrix[which(row.mad > 0.2),]
# Center row on mean.
# norm.matrix <- apply(norm.matrix, 1, function(matrix.row) {
# matrix.row/mean(matrix.row)
# })
# norm.matrix <- t(norm.matrix)
# norm.matrix[norm.matrix < 0] <- 0
### RUN NMF GPU
tmpMatrix.path <- writeTmpMatrix(norm.matrix)
# Define NMF-GPU parameters.
#out.dir <- '/home/steinhau/NB_NMF/data/TC_en_NMF_quanNorm_std_rmvOut/'
#dir.create(out.dir)
# RUN NMF.
k.max <- 12
outer.iter <- 100
inner.iter <- 10^4
dec.matrix <- runNmfGpu(tmpMatrix.path = tmpMatrix.path,
k.max = k.max,
outer.iter = outer.iter,
inner.iter = inner.iter)
#save(dec.matrix, file = file.path(result.path, sprintf('%s_deconvMatrix.RData', samples)))
#==============================================================================#
# Criteria for optimal factorization rank #
#==============================================================================#
# Get frob error from list of deconv. matrix and compute error statistics.
frobError.matrix <- getFrobError(dec.matrix)
frobError.data <- computeFrobErrorStats(frobError.matrix)
# Generate ranked frob errors plot & save as svg/png.
gg.rankedFrobError <- plotRankedFrobErrors(frobError.matrix)
rankedFrobError.svg <- sprintf('%s_k%s_iter%s_rankedFrobError.%s',
samples, k.max, outer.iter, c('svg', 'png'))
sapply(1:length(rankedFrobError.svg), function(i) {
save_plot(plot = gg.rankedFrobError,
filename = file.path(result.path, rankedFrobError.svg[i]))
})
# Generate mean frob error and coef. of var. plot
gg.frobError <- plotFrobError(frobError.matrix)
# Generate Alexandrov Criterion plot
sil.vec <- computeSilhoutteWidth(dec.matrix)
gg.silhoutteStats <- plotSilhoutteStas(sil.vec)
# Cophenetic correlation coefficient plot
coph.coeff <- computeCopheneticCoeff(dec.matrix)
gg.copheneticCoeff <- plotCopheneticCoeff(coph.coeff, keep.x = T)
# Combine all plots for estmation of optimal k
gg.optKplot <- plot_grid(gg.frobError,
gg.silhoutteStats,
gg.copheneticCoeff, align = 'V',
rel_heights = c(1, 1, 1.05), ncol = 1)
# Save plots for optimal k estimation.
optK.svg <- sprintf('%s_k%s_iter%s_optKPlot.%s', samples, k.max,
outer.iter, c('svg', 'png'))
sapply(1:length(optK.svg), function(i) {
save_plot(gg.optKplot, filename = file.path(result.path, optK.svg[i]))
})
#==============================================================================#
# H-MATRIX ANALYSIS/VISUALIZATION #
#==============================================================================#
load('/home/thymin/NMF-GPU_toolbox/heart_nmf.RData')
# Plot Heatmaps for H over all k
H.list <- getHmatrixList(dec.matrix)
H.list <- lapply(H.list, function(x) {
colnames(x) <- colnames(raw.matrix)
return(x)
})
# HMatrixHeat.svg <- sprintf('%s_k%s_iter%s_HmatrixHeatmap.svg',
# samples, k.max, outer.iter)
# svg(file.path(result.path, HMatrixHeat.svg), width = 15)
# plotHeatmap4MatrixList(H.list, trans = T)
# dev.off()
#meta.file <- '/home/steinhau/NMF-GPU_toolbox/data/mENCODE/H3K27ac_sampleSheet.txt'
meta.file <- '/home/thymin/Projects/NMF-GPU_toolbox/data/mENCODE/H3K27ac_sampleSheet.txt'
meta.data <- readLines(meta.file)
meta.data <- lapply(strsplit(meta.data, '\t'), function(r) r[c(1,3,4,6)])
meta.data <- do.call(rbind, meta.data)
i.sort <- match(colnames(H.list[[1]]), meta.data[,1])
meta.data <- meta.data[i.sort,]
colnames(meta.data) <- c('ID', 'Tissue/Cell', 'Stage', 'Timepoint')
#anno.file <- '/home/steinhau/NMF-GPU_toolbox/data/mENCODE/sampleAnnotation.csv'
anno.file <- '/home/thymin/Projects//NMF-GPU_toolbox/data/mENCODE/sampleAnnotation.csv'
anno <- read.table(anno.file, sep = '\t')
meta.data <- merge(meta.data, anno, by.x = 'Tissue/Cell', by.y = 'V1')
colnames(meta.data) <- c('Cell', 'ID', 'Stage', 'Timepoint', 'System', 'Type')
# Define colors for annotation.
library(RColorBrewer)
n.time <- length(unique(meta.data$Timepoint))
n.cell <- length(unique(meta.data$Cell))
qual.colPals <- brewer.pal.info[brewer.pal.info$category == 'qual',]
# Timepoint
time.col <- unlist(mapply(brewer.pal, n.time, 'Dark2'))[,1]
names(time.col) <- sort(unique(meta.data$Timepoint))
# Stage
stages <- unique(meta.data$Stage)
stage.col <- c('darkgreen', 'darkblue', 'darkred', 'orange')[1:length(stages)]
names(stage.col) <- stages
# Cell
cell.col <- unlist(mapply(brewer.pal, n.cell, 'Paired'))[,1]
names(cell.col) <- sort(unique(meta.data$Cell))
# Prepare heatmap annotation
heat.anno <- HeatmapAnnotation(df = meta.data[,c(1,3,4)],
col = list(Cell = cell.col,
Timepoint = time.col,
Stage = stage.col))
getColorMap <- function(matrix.list) {
matrix.max <- quantile(unlist(matrix.list), 0.95)
matrix.min <- quantile(unlist(matrix.list), 0.05)
if(matrix.min > 0) {
col.map <- colorRamp2(c(0, matrix.max), c("white", "red"))
} else {
col.map <- colorRamp2(c(matrix.min, 0, matrix.max), c("blue", "white", "red"))
}
return(col.map)
}
col.map <- getColorMap(H.list)
# Create ouput dir structure.
hmatrix.resultPath <- file.path(result.path, 'H-matrix')
dir.create(hmatrix.resultPath)
sapply(c('png', 'svg'), function(suffix) {
dir.create(file.path(hmatrix.resultPath, suffix))
})
# Print H-Matrix for all K's in png & svg.
sapply(1:length(H.list), function(i) {
h.heatmap <- Heatmap(H.list[[i]], col = col.map,
clustering_distance_columns = 'pearson',
heatmap_legend_param = list(color_bar = "continuous"),
show_column_names = F, cluster_rows = F,
top_annotation = heat.anno)
png(file.path(hmatrix.resultPath, 'png',
sprintf('heart_Hmatrix_%sk.png', i+1)))
draw(h.heatmap)
dev.off()
svg(file.path(hmatrix.resultPath, 'svg',
sprintf('heart_Hmatrix_%sk.svg', i+1)))
draw(h.heatmap)
dev.off()
})
#save(H.list, W.list, anno.peaks, regionDB, file = '/home/thymin/NMF-GPU_toolbox/heart_nmf.RData') #file.path(result.path, 'heart_nmf.RData'))
#==============================================================================#
# W-MATRIX ANALYSIS/VISUALIZATION #
# FIND REGIONS ASSOCIATED TO SIGNATURES #
#==============================================================================#
# Plot Heatmaps for W over all k
W.list <- getWmatrixList(dec.matrix)
### Find representative regions.
# Get W for best K
k.opt <- 5
W <- W.list[[as.character(k.opt)]]
H <- H.list[[as.character(k.opt)]]
# Compute unbaised signature names by applying a row k-means
# and classify signature according to cluster and highest cluster mean.
getSignatureNames <- function(H, meta.data) {
sig.names <- lapply(1:nrow(H), function(i) {
k.cluster <- kmeans(as.numeric(H[i,]), 2)
n.kCluster <- which(k.cluster$centers == max(k.cluster$centers))
samples.kCluster <- colnames(H)[k.cluster$cluster == n.kCluster]
m <- meta.data[meta.data$ID%in%samples.kCluster,]
sig.name <- paste(sort(unique(m$Timepoint)), collapse = '/')
sig.name <- paste(unique(m$Stage), sig.name, sep = '_')
return(sig.name)
})
return(unlist(sig.names))
}
signature.names <- getSignatureNames(H, meta.data)
# Manuell chronological re-ordering.
i.sigOrder <- c(2:4,1,5)
signature.names <- signature.names[i.sigOrder]
W <- W[,i.sigOrder]
colnames(W) <- signature.names
H <- H[i.sigOrder,]
rownames(H) <- signature.names
# Compute 'Delta Signature contribution' and select regions for signature of interest.
W.delta <- computeSigAbsDelta(W)
# Compute Entropy per row
W.entropy <- computeEntropy(W)
# Perform Kmeans on W rows to extract all possible signature combinations.
k.row <- apply(W, 1, function(x) {
x.trans <- sigmoidTransform(x)
k.cluster <- kmeans(x.trans, 2)
d <- dist(as.data.frame(x.trans))
sil.mean <- mean(silhouette(k.cluster$cluster, d)[,3])
return(list(centers = t(k.cluster$centers),
silmean = sil.mean,
explainedVar = k.cluster$betweenss/k.cluster$totss,
oddsVar = sum(k.cluster$withinss)/k.cluster$betweenss,
attribution = k.cluster$cluster))
})
# Compute different variance vars. to determine 11111 signature.
k.explainedVar <- unlist(lapply(k.row, function(r) r$explainedVar))
k.oddsVar <- unlist(lapply(k.row, function(r) r$oddsVar))
k.varCoef <- apply(W, 1, function(r) sd(r)/mean(r))
var.thres <- 0.25
all.signature <- which(k.varCoef < var.thres)
# Extract Signature combinations generated by k-means
k.attribution <- lapply(k.row, function(r) abs(r$attribution))
k.attribution <- do.call(rbind, k.attribution)
k.ids <- apply(k.attribution, 1, function(r) paste(r, collapse = ''))
k.ids[all.signature] <- gsub('2', '1', k.ids[all.signature])
# Extract Centroids for each row.
k.scores <- lapply(k.row, function(r) r$centers)
k.scores <- do.call(rbind, k.scores)
# Extract mean silhouette width for each row.
k.silmean <- lapply(k.row, function(r) r$silmean)
k.silmean <- unlist(k.silmean)
# Get for all signature combinations corresponding regions.
ids <- sort(unique(k.ids))[-1]
n.ids <- length(ids)+1
i.regions <- lapply(1:(n.ids/2), function(i) {
sub.id <- ids[c(i, n.ids - i)]
i.region <- which(k.ids%in%sub.id)
return(i.region)
})
names(i.regions) <- ids[1:(n.ids/2)]
# Get regions for all signature id.
allSig.id <- sort(unique(k.ids))[1]
i.regions[[allSig.id]] <- which(k.ids%in%allSig.id)
# Compute regions scores (delta centroid values) to distinguish between
# different signature combinations within a 'row cluster'
# region.scores <- lapply(1:(n.ids/2), function(i) {
# k.score1 <- k.scores[which(k.ids == ids[i]),]
# k.score2 <- k.scores[which(k.ids == ids[n.ids-i]), 2:1]
# kscore.df <- rbind(k.score1, k.score2)
# k.sil <- c(k.silmean[which(k.ids == ids[i])],
# k.silmean[which(k.ids == ids[n.ids-i])])
# kscore.df <- cbind(kscore.df, k.sil, as.numeric(ids[i]))
# return(kscore.df)
# })
# region.scores <- do.call(rbind, region.scores)
# colnames(region.scores) <- c('x', 'y', 'SilWidth', 'Sig')
# region.scores <- as.data.frame(region.scores)
# MA like score plotting useful?
# region.scores$d <- region.scores[,1] - region.scores[,2]
# region.scores$M <- log2(region.scores[,1]/region.scores[,2])
# region.scores$A <- 1/2*(region.scores[,1] + region.scores[,2])
# Compute Row mean diff between Cluster 1 and 2.
rowMean <- function(m) {apply(as.matrix(m), 1, mean)}
w.diffs <- lapply(1:length(i.regions), function(i) {
w <- W[i.regions[[i]],]
if (!grepl(names(i.regions)[i], pattern = '2')) {
w.diff <- rowMean(w)
} else {
signature.comb <- as.numeric(unlist(strsplit(names(i.regions)[[i]], split = '')))
w.mean1 <- rowMean(w[,which(signature.comb == 1)])
w.mean2 <- rowMean(w[,which(signature.comb == 2)])
w.diff <- w.mean1 - w.mean2
}
return(w.diff)
})
names(w.diffs) <- names(i.regions)
# gg.scatter <- ggplot(as.data.frame(region.scores), aes(x = x, y = y)) + geom_point()
# gg.scatter <- gg.scatter + stat_function(fun = function(x) {x}, col = 'red',
# linetype = 'dashed', size = 1.25)
# gg.scatter <- gg.scatter + facet_wrap(~Sig, scales = 'free')
# gg.scatter <- gg.scatter + xlab('1') + ylab('2')
# gg.scatter <- gg.scatter + theme_bw() + science_theme
# gg.scatter + theme(strip.background = element_rect(fill = 'white'))
# Plot W for given signature combination.
w <- W[i.regions[[1]],]
w <- w[region.scores[i.regions[[1]], 'd'] < 0,]
#w <- w[order(w[,5], decreasing = T),]
png(file.path(result.path, sprintf('heaeRrt_Wmatrix_%sk.png', 5)))
Heatmap(w, cluster_rows = F, cluster_columns = F, show_row_names = F,
heatmap_legend_param = list(color_bar = "continuous"))
dev.off()
# Extract regions for a given signature combination
j <- 6# length(i.regions) - 1
p <- anno.peaks[i.regions[[j]],]
p[order(w.diffs[[j]], decreasing = T),]
#==============================================================================#
# Analysis of Signature combination for regulatory element enrichment. #
#==============================================================================#
### Get number of peaks per signature combination and create a barplot.
n.peaks <- lapply(names(i.regions), function(region.n) {
n.peak1 <- sum(w.diffs[[region.n]] > 0)
n.peak2 <- sum(w.diffs[[region.n]] < 0)
n.peak <- c(n.peak1, n.peak2)
n.peak <- melt(n.peak)
n.peak$clusterId <- c('1', '2')
n.peak$sigCombId <- region.n
return(n.peak)
})
n.peaks <- do.call(rbind, n.peaks)
gg.bar <- ggplot(n.peaks, aes(x = sigCombId, y = value, fill = clusterId))
gg.bar <- gg.bar + geom_bar(colour='black', stat='identity',
position=position_dodge())
gg.bar <- gg.bar + xlab('Signature combinations') + ylab('#Regions')
gg.bar <- gg.bar + scale_fill_manual(values = c('red', 'blue'),
name = 'Cluster')
gg.bar <- gg.bar + theme_bw() + science_theme
gg.bar <- gg.bar + theme(axis.text.x = element_text(angle = 90, hjust = 1))
save_plot(gg.bar, filename = file.path(result.path, 'clusterCombination_bar.svg'))
#### Promoter/Enhancer/Superenhancer enrichment in different signature combinations
## FUNCTIONS.
spliteRegionsByScore <- function(regions) {
up.regions <- regions[mcols(regions)[,1] > 0,]
up.regions <- up.regions[order(mcols(up.regions)[,1], decreasing = T),]
down.regions <- regions[mcols(regions)[,1] < 0,]
down.regions <- down.regions[order(mcols(down.regions)[,1]),]
return(list('1' = up.regions, '2' = down.regions))
}
computeRegionFC <- function(peaks, subregions, universe) {
prop.universe <- sum(countOverlaps(universe, peaks))/length(universe)
prop.subregions <- sum(countOverlaps(subregions, peaks))/length(subregions)
fc <- prop.subregions/prop.universe
return(fc)
}
computeFisher4Regions <- function(peaks, subregions, universe) {
x1 <- sum(countOverlaps(subregions, peaks) > 0)
x2 <- length(subregions) - x1
x3 <- sum(countOverlaps(universe, peaks) > 0) - x1
x4 <- length(universe) - x3 - x2 - x1
m <- matrix(c(x1, x3, x2, x4), nrow = 2)
p <- fisher.test(m)
return(p$p.value)
}
### NES Functions
computeRecoveryAUC <- function(ranks, n) {
ranks <- unique(c(1, sort(ranks[ranks < n]), n))
rank.diff <- ranks[-1] - ranks[-length(ranks)]
#recov <- 0:c(length(rank.diff)-1)
recov <- c(0, cumsum(rep(1/n, length(rank.diff)-1)))
auc <- sum(recov*rank.diff)/n
return(auc)
}
computeRmdAUCs <- function(ranks, n, iter = 10^3, threads = 3) {
n.hits <- sum(ranks <= n)
rmd.aucs <- mclapply(1:iter, function(i) {
rmd.ranks <- sort(sample(1:n, size = n.hits, replace = F))
rmd.auc <- computeRecoveryAUC(rmd.ranks, n)
return(rmd.auc)
}, mc.cores = threads)
return(unlist(rmd.aucs))
}
computeNES <- function(ranks, n, iter = 10^3, threads = 3) {
auc <- computeRecoveryAUC(ranks, n)
rmd.aucs <- computeRmdAUCs(ranks, n, iter = iter, threads = threads)
nes <- (auc - mean(rmd.aucs))/sd(rmd.aucs)
r <- sum(auc < rmd.aucs) + 1
result <- data.frame('Rank' = r, 'NES' = nes, 'AUC' = auc)
return(result)
}
computePeakStats4Regions <- function(peaks, subregions, universe) {
n.regions <- lapply(subregions, length)
n.ov <- lapply(subregions, function(r) sum(countOverlaps(r, peaks) > 0))
# Compute Fold-enrichment
fcs <- lapply(subregions, function(subregion){
if(length(subregion) == 0) {return(NA)}
computeRegionFC(peaks, subregion, universe)
})
# Compute Fisher-exact p-value
pvalues <- lapply(subregions, function(subregion){
if(length(subregion) == 0) {return(NA)}
computeFisher4Regions(peaks, subregion, universe)
})
# Compute Normalized enrichment score
nes <- lapply(subregions, function(subregion) {
if(length(subregion) <= 1) {return(NA)}
ov <- findOverlaps(subregion, peaks)
promoter.rank <- unique(queryHits(ov))
nes <- computeNES(promoter.rank, length(subregion))
return(nes)
})
nes <- do.call(rbind, nes)
result <- data.frame('NumberRegions' = unlist(n.regions), 'OV' = unlist(n.ov),
'FC' = unlist(fcs), 'FisherPvalue' = unlist(pvalues))
result <- cbind(result, nes)
return(result)
}
computeRunningPeakStats4Regions <- function(peaks, subregions, universe) {
result.rankProp <- lapply(seq(0.1, 1, by = 0.1), function(region.prop) {
# Get only top x% of region.
filtered.subregions <- lapply(subregions, function(r) {
n <- floor(length(r)*region.prop)
return(r[0:n,])
})
result <- computePeakStats4Regions(peaks, filtered.subregions, universe)
result$ClusterID <- 1:2
result$RankProp <- region.prop
return(result)
})
result.rankProp <- do.call(rbind, result.rankProp)
return(result.rankProp)
}
## Analysis
library(BSgenome.Mmusculus.UCSC.mm10)
library(TxDb.Mmusculus.UCSC.mm10.ensGene)
#gencode.gff <- '/home/thymin/genomes/mm10/'
promoter.regions <- promoters(TxDb.Mmusculus.UCSC.mm10.ensGene,
upstream = 1500, downstream = 500)
universe <- GRanges(anno.peaks$V2, IRanges(anno.peaks$V3, anno.peaks$V4))
# Compute promoter overlap/enrichment.
promoter.enrichment <- mclapply(1:length(i.regions), function(i) {
subregions <- universe[i.regions[[i]],]
subregions$score <- w.diffs[[i]]
subregions <- spliteRegionsByScore(subregions)
peak.stats <- computeRunningPeakStats4Regions(peaks = promoter.regions,
subregions = subregions,
universe = universe)
peak.stats$SignatureComb <- names(i.regions)[i]
return(peak.stats)
}, mc.cores = 1)
promoter.enrichment <- do.call(rbind, promoter.enrichment)
promoter.enrichment <- promoter.enrichment[order(promoter.enrichment$ClusterID, promoter.enrichment$SignatureComb),]
promoter.enrichment <- promoter.enrichment[!is.na(promoter.enrichment$FC),]
promoter.data <- promoter.enrichment[promoter.enrichment$RankProp == 1,]
promoter.data$ClusterID <- factor(promoter.data$ClusterID)
promoter.data[order(promoter.data$NES)]
gg.enrichmentBar <- ggplot(promoter.data, aes(x = SignatureComb, y = FC, fill = ClusterID))
gg.enrichmentBar <- gg.enrichmentBar + geom_bar(colour='black', stat='identity',
position=position_dodge())
gg.enrichmentBar <- gg.enrichmentBar + scale_fill_manual(values = c('red', 'blue'),
name = 'Cluster')
gg.enrichmentBar <- gg.enrichmentBar+ theme_bw() + science_theme
gg.enrichmentBar <- gg.enrichmentBar + theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Compute superenhancer overlap/enrichment.
superenhancer.bed <- '/home/thymin/Projects/bloodEnhancer_project/heart_superenhancers.bed'
superenhancers <- import.bed(superenhancer.bed)
# Compute promoter overlap/enrichment.
superenhancer.enrichment <- mclapply(1:length(i.regions), function(i) {
print(i)
subregions <- universe[i.regions[[i]],]
subregions$score <- w.diffs[[i]]
subregions <- spliteRegionsByScore(subregions)
peak.stats <- computeRunningPeakStats4Regions(peaks = superenhancers,
subregions = subregions,
universe = universe)
peak.stats$SignatureComb <- names(i.regions)[i]
peak.stats <- peak.stats[!is.na(peak.stats$FC),]
return(peak.stats)
}, mc.cores = 1)
superenhancer.enrichment <- do.call(rbind, superenhancer.enrichment)
p <- promoter.enrichment[promoter.enrichment$RankProp == 1,]
p$anno <-
plot.matrix <-
#barplot(p$FC)
signature.comb <- p[,c('ClusterID', 'SignatureComb')]
sortSignatureCombs <- function(signature.comb) {
}
# Compute enrichment of mouse encode chromHMM model.
chromstat.bed <- '/home/thymin/mouseEncode_chromHMM/heart_chromatinStates_mm10.bed'
chrom.stats <- import.bed(chromstat.bed)
chromStat.enrichment <- mclapply(1:length(i.regions), function(i) {
subregions <- universe[i.regions[[i]],]
subregions$score <- w.diffs[[i]]
subregions <- spliteRegionsByScore(subregions)
chromStat.peakStats <- lapply(sort(unique(chrom.stats$name)), function(n) {
chromStat.n <- chrom.stats[chrom.stats$name == n,]
peak.stats <- computePeakStats4Regions(peaks = chromStat.n,
subregions = subregions,
universe = universe)
peak.stats$chromStat <- n
return(peak.stats)
})
chromStat.peakStats <- do.call(rbind, chromStat.peakStats)
return(chromStat.peakStats)
}, mc.cores = 4)
#####
# Compute promoter overlap/enrichment.
chip.peaks <- regionDB$regionGRL
fc <- computeRegionFC4GRlist(peak = chip.peaks, subregion = subregions[[1]], universe = promoter.universe)
library('LOLA')
promoter.universe <- subsetByOverlaps(universe, promoter.regions)
promoter.chipEnrichment <- lapply(1:length(i.regions), function(i) {
print(i)
subregions <- universe[i.regions[[i]],]
subregions$score <- w.diffs[[i]]
subregions <- subsetByOverlaps(subregions, promoter.regions)
subregions <- spliteRegionsByScore(subregions)
lola.results <- lapply(1:length(subregions), function(i.subregion){
lola.result <- runLOLA(userSets = subregions[[i.subregion]],
userUniverse = promoter.universe,
regionDB, cores = 5)
lola.result$adjPLog <- -log10(p.adjust(p = 10^(-lola.result$pValueLog),
method = 'fdr'))
lola.result$clusterId <- i.subregion
return(lola.result)
})
lola.results <- do.call(rbind, lola.results)
lola.results$SigCombId <- names(i.regions)[i]
return(lola.results)
})
library('rGREAT')
test <- submitGreatJob(gr = subregions[[1]], species = 'mm10', bg = promoter.universe, request_interval = 30)
availableCategories(test)
tb <- getEnrichmentTables(test, category = "Gene Expression")
==============================================================================#
# LOLA [PMID:26508757]- #
# Perform enrichment analysis with signature associated regions #
# with FISHER's EXACT TEST (--> Peak OV) #
#==============================================================================#
dbPath <- '/home/steinhau/NMF-GPU_toolbox/LOLACore/mm10/'
dbPath <- '/home/thymin/Projects//NMF-GPU_toolbox/LOLACore/mm10/'
regionDB <- loadRegionDB(dbPath, useCache = T)
# Define the universe
universe <- GRanges(anno.peaks$V2,
IRanges(anno.peaks$V3, anno.peaks$V4))
lolaEnrichment.results <- lapply(names(i.regions), function(n.region) {
print(n.region)
regions <- universe[i.regions[[n.region]]]
cluster.1 <- which(w.diffs[[n.region]] > 0)
cluster.2 <- which(w.diffs[[n.region]] < 0)
lola.results <- lapply(list(cluster.1, cluster.2), function(j.cluster) {
lola.result <- runLOLA(regions[j.cluster,], universe, regionDB, cores=6)
#lola.result$fdr <- p.adjust(10^-lola.result$pValueLog, method = 'fdr')
return(lola.result)
})
return(lola.results)
})
### Prepare feature matrix for plotting.
x <- lapply(lolaEnrichment.results, function(lolaEnrichment.result) {
lapply(lolaEnrichment.result, function(l) {
l$fdr <- p.adjust(10^-l$pValueLog, method = 'fdr')
return(l)
})
})
names(x) <- names(i.regions)
fdr.thres <- 0.01
term.dict <- lapply(x, function(sublist) {
lapply(sublist, function(l) {
l <- l[l$fdr <= fdr.thres,]
l[1:5,]$filename
})
})
term.dict <- unique(unlist(term.dict))
term.dict <- term.dict[!is.na(term.dict)]
x <- lapply(names(i.regions), function(n.regions) {
n.cluster <- 1
r <- lapply(x[[n.regions]], function(r) {
r$clusterID <- n.cluster
n.cluster <<- n.cluster + 1
return(r)
})
r <- do.call(rbind, r)
r$signatureID <- n.regions
return(r)
})
x <- do.call(rbind, x)
filtered.x <- x[x$filename%in%term.dict,]
filtered.x$id <- paste(filtered.x$signatureID, filtered.x$clusterID)
signature.comb <- do.call(rbind, strsplit(filtered.x$signatureID, ''))
enrichment.matrix <- dcast(filtered.x, formula = filename ~ id, value.var = 'logOddsRatio') #fdr")
rownames(enrichment.matrix) <- enrichment.matrix$filename
enrichment.matrix <- enrichment.matrix[,-1]
#enrichment.matrix <- -log10(enrichment.matrix + 10^-120)
### Get row annotation data.
rowAnno.df <- unique(filtered.x[,c('cellType', 'antibody', 'filename'),with=F])
i.rmv <- grep(rowAnno.df$filename, pattern = 'Input')
rowAnno.df <- rowAnno.df[-i.rmv,]
enrichment.matrix <- enrichment.matrix[-i.rmv,]
i.na <- which(is.na(rowAnno.df$cellType))
rowAnno.df[i.na]$antibody <- gsub('.bed', '', rowAnno.df[i.na]$filename)
# Replace embryonic stem cell
i.stem <- grep(rowAnno.df$cellType, pattern = 'Embryonic', ignore.case = T)
rowAnno.df$cellType[i.stem] <- 'Embryonic Stem Cells'
rowAnno.df$anno <- paste(rowAnno.df$cellType, rowAnno.df$antibody)
rowAnno.df$anno <- duplicated(rowAnno.df[,1:2,with=F]) + 1
### Column Annotation.
sig.comb <- strsplit(colnames(enrichment.matrix), split = '')
sig.comb <- do.call(rbind, sig.comb)
sig.comb <- apply(sig.comb, 1, function(x) x[1:5] == x[7])
sig.comb <- apply(as.data.frame(sig.comb), 1, as.character)
rownames(sig.comb) <- 1:nrow(sig.comb)
colnames(sig.comb) <- paste(df = 'Factor', 1:5)
col <- c('TRUE' = 'black', 'FALSE' = 'white')
anno.colmap <- lapply(1:ncol(sig.comb), function(i) col)
names(anno.colmap) <- colnames(sig.comb)
# Col annotation order.
i.order <- apply(sig.comb, 1, function(c) sum(as.logical(c)))
i.order <- lapply(unique(sort(i.order)), function(n) {
j <- which(i.order == n)
j <- j[order(as.logical(sig.comb[j,5]), as.logical(sig.comb[j,4]),
as.logical(sig.comb[j,3]), as.logical(sig.comb[j,2]),
decreasing = T)]
return(j)
})
i.order <- unlist(i.order)
# Prepare annotation heatmap.
sigComb.anno <- HeatmapAnnotation(as.data.frame(sig.comb)[i.order,],
col = anno.colmap, show_legend = F,
gp = gpar(col = 'black', lty = 1, lwd = 1))
Heatmap(enrichment.matrix[,i.order], cluster_rows = T, cluster_columns = F,
show_row_names = F, show_column_names = F,
heatmap_legend_param = list(color_bar = "continuous"),
bottom_annotation = sigComb.anno)
filtered.x$fdr
table(x[x$fdr < 0.01,]$signatureID)
enriched.terms <- lapply(lola.results, function(r) r$filename[r$qValue < 0.01])
fdr.thres <- 0.1
#==============================================================================#
# Feature recovery analysis - #
# Perform enrichment analysis with signature rankings by computing #
# normalized enrichment scores (NES) (--> Peak OV with x% top regions) #
#==============================================================================#
### Compute NES
feature.peaks <- regionDB$regionGRL
j.regions <- lapply(i.regions, function(i) {
i[!is.promoter[i]]
})
j.regions <- i.regions
n.ranks <- 500
featureNes.list <- lapply(1:length(i.regions), function(i) {
print(names(i.regions)[i])
# Get subregions and compute ranking score.
j <- j.regions[[i]]
subregions <- anno.peaks[j,]
subregions.gr <- GRanges(subregions$V2, IRanges(subregions$V3, subregions$V4))
i.keep <- names(w.diffs[[i]])%in%as.character(j)
subregions.gr$score <- w.diffs[[i]][i.keep]
feature.nes <- lapply(c(T, F), function(b) {
print(b)
if(b & sum(subregions.gr$score > 0) < n.ranks) { return(NA) }
if(!b & sum(subregions.gr$score < 0) < n.ranks) { return(NA) }
# Compute feature overlap.
sorted.regions <- subregions.gr[order(subregions.gr$score, decreasing = b)]
ov <- findOverlaps(feature.peaks, sorted.regions)
# Compute Normalized Enrichment Scores.
feature.nes <- mclapply(1:length(feature.peaks), FUN = function(j) {
ranks <- subjectHits(ov)[queryHits(ov) == j]
ranks <- sort(unique(ranks))
computeNES(ranks, n.ranks, threads = 1)
}, mc.cores = 6)
feature.nes <- unlist(feature.nes)
return(feature.nes)
})
names(feature.nes) <- c('1', '2')
return(feature.nes)
})
names(featureNes.list) <- names(i.regions)
# Extract features with nes thres.
nes.thres <- 2.5
feature.dict <- lapply(featureNes.list, function(feature.nes) {
lapply(feature.nes, function(nes) {
i.feature <- which(nes >= nes.thres)
i.feature <- i.feature[!i.feature%in%is.na(nes)]
i.feature <- i.feature[order(nes[i.feature], decreasing = T)]
return(i.feature[1:3])
})
})
feature.dict <- sort(unique(unlist(feature.dict)))
enriched.features <- regionDB$regionAnno[feature.dict,]
enriched.features <- enriched.features[enriched.features$collection != 'ucsc_features',]
# Create large data.table containing NES scores.
features.dt <- lapply(names(featureNes.list), function(n) {
features <- enriched.features
features.dt <- lapply(names(featureNes.list[[n]]), function(id) {
features$clusterId <- id
features$combClusterId <- n
features$nes <- featureNes.list[[n]][[id]][feature.dict]
return(features)
})
features.dt <- do.call(rbind, features.dt)
return(features.dt)
})
features.dt <- do.call(rbind, features.dt)
features.dt$id <- paste(features.dt$combClusterId, features.dt$clusterId)
features.dt <- features.dt[!is.na(features.dt$nes)]
features.dt <- features.dt[features.dt$collection != 'ucsc_features',]
### Compute enrichment matrix.
enrichment.matrix <- dcast(features.dt, formula = filename ~ id, value.var = 'nes')
rownames(enrichment.matrix) <- enrichment.matrix$filename
enrichment.matrix <- enrichment.matrix[,-1]
### Get row annotation data.
rowAnno.df <- unique(enriched.features[,c('cellType', 'antibody', 'filename'),with=F])
i.rmv <- grep(rowAnno.df$filename, pattern = 'Input')
rowAnno.df <- rowAnno.df[-i.rmv,]
enrichment.matrix <- enrichment.matrix[-i.rmv,]
i.na <- which(is.na(rowAnno.df$cellType))
rowAnno.df[i.na]$antibody <- gsub('.bed', '', rowAnno.df[i.na]$filename)
# Replace embryonic stem cell
i.stem <- grep(rowAnno.df$cellType, pattern = 'Embryonic', ignore.case = T)
rowAnno.df$cellType[i.stem] <- 'ESC'
rowAnno.df$anno <- paste(rowAnno.df$cellType, rowAnno.df$antibody)
i.rep <- lapply(unique(rowAnno.df$anno), function(a) {
j <- which(rowAnno.df$anno%in%a)
names(j) <- 1:length(j)
return(j)
})
i.rep <- sort(unlist(i.rep))
rowAnno.df$anno <- names(i.rep)
rowAnno.df$anno <- paste(rowAnno.df$antibody, rowAnno.df$cellType,
rowAnno.df$anno)
rownames(enrichment.matrix) <- rowAnno.df$anno
### Column Annotation.
sig.comb <- strsplit(colnames(enrichment.matrix), split = '')
sig.comb <- do.call(rbind, sig.comb)
sig.comb <- apply(sig.comb, 1, function(x) x[1:5] == x[7])
sig.comb <- apply(as.data.frame(sig.comb), 1, as.character)
rownames(sig.comb) <- 1:nrow(sig.comb)
colnames(sig.comb) <- paste(df = 'Factor', 1:5)
col <- c('TRUE' = 'black', 'FALSE' = 'white')
anno.colmap <- lapply(1:ncol(sig.comb), function(i) col)
names(anno.colmap) <- colnames(sig.comb)
# Col annotation order.
i.order <- apply(sig.comb, 1, function(c) sum(as.logical(c)))
i.order <- lapply(unique(sort(i.order)), function(n) {
j <- which(i.order == n)
j <- j[order(as.logical(sig.comb[j,5]), as.logical(sig.comb[j,4]),
as.logical(sig.comb[j,3]), as.logical(sig.comb[j,2]),
decreasing = T)]
return(j)
})
i.order <- unlist(i.order)
# Prepare annotation heatmap.
sigComb.anno <- HeatmapAnnotation(as.data.frame(sig.comb)[i.order,],
col = anno.colmap, show_legend = F,
gp = gpar(col = 'black', lty = 1, lwd = 1))
enrichment.matrix[is.na(enrichment.matrix)] <- 0
# enrichment.matrix[enrichment.matrix > 5] <- 5
h <- Heatmap(enrichment.matrix[,i.order], cluster_rows = T, cluster_columns = T,
show_row_names = T, show_column_names = F,
heatmap_legend_param = list(color_bar = "continuous", legend_direction = "horizontal",
legend_width = unit(5, "cm"), title_position = "lefttop"),
bottom_annotation = sigComb.anno)
png(file.path(result.path, 'heart_enhancer_lolaPeaksNES_1000.png'))
draw(h, heatmap_legend_side = "bottom")
dev.off()
svg(file.path(result.path, 'heart_enhancer_lolaPeaksNES_1000.svg'), width = 12, height = 10)
draw(h, heatmap_legend_side = "bottom")
dev.off()
################################################################################
motif.bed <- '/home/steinhau/NMF-GPU_toolbox/data/mENCODE/heart/H3K27ac_fimoMotifs.bed'
motifs <- fread(motif.bed, sep = '\t')
motifs.gr <- GRanges(motifs$V2, IRanges(motifs$V3, motifs$V4), 'motif' = motifs$V10)
motif.names <- unique(motifs$V10)
motif.nes <- lapply(motif.names, function(motif.n) {
print(motif.n)
motif.gr <- motifs.gr[motifs.gr$motif == motif.n]
nes <- sapply(1:ncol(W.zscore), function(j) {
j.sorted <- order(W.zscore[,j], decreasing = T)[1:n.top]
nes <- computeNES(motif.gr, universe[j.sorted,])
return(nes)
})
return(nes)
})
motif.nes <- do.call(rbind, motif.nes)
rownames(motif.nes) <- motif.names
max.nes <- apply(motif.nes, 1, function(x) max(x, na.rm = T))
motif.nes[max.nes > 3,]
library(JASPAR2014)
library(TFBSTools)
PFMatrixList <- getMatrixSet(JASPAR2014, opts = list())
pwmList <- toPWM(PFMatrixList)
peak.seq <- getSeq(BSgenome.Mmusculus.UCSC.mm10, universe)
sitesetList <- searchSeq(pwmList, peak.seq, min.score="60%", strand="*")
#==============================================================================#
# Feature enrichment analysis inspired by GREAT [20436461] - #
# Perform enrichment analysis with binominal test #
# (--> Peak Annotation OV against 'genome background') #
#==============================================================================#
library(BSgenome.Mmusculus.UCSC.mm10)
chrs <- unique(as.character(seqnames(regionDB$regionGRL[[1]])))
genome.size <- seqinfo(BSgenome.Mmusculus.UCSC.mm10)[chrs]
genome.size <- sum(as.numeric(seqlengths(genome.size)))
binom.pvalues <- lapply(1:length(lola.regions), function(i.region) {
print(i.region)
regions <- lola.regions[[i.region]]
anno.size <- sum(width(regions))/genome.size
pvalues <- lapply(i.specific, function(i.spec){
n.regions <- length(i.spec)
ov <- findOverlaps(regions, universe[i.spec,])
anno.hits <- length(unique(subjectHits(ov)))
pvalue <- binom.test(x = anno.hits, n = n.regions, p = anno.size)
return(pvalue$p.value)
})
pvalues <- unlist(pvalues)
return(pvalues)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.