###############################################################
# correlation between two sets of genomic regions
# or enrichment of one type of genomic region in the other
###############################################################
# == title
# Correlation between two sets of genomic regions
#
# == param
# -gr_list_1 a list of `GenomicRanges::GRanges` objects, should be a named list, e.g. low methylated regions in different samples.
# -gr_list_2 a list of `GenomicRanges::GRanges` objects, should be a named list, e.g. a list of genomic features.
# -background a `GenomicRanges::GRanges` object. The correlation is only looked in the background regions.
# -chromosome a vector of chromosome names
# -species species, used for random shuffling genomic regions
# -nperm number of random shufflings. If it is set to 0 or 1, no random shuffling will be performed.
# -mc.cores number of cores for parallel calculation
# -stat_fun method to calculate correlations. There are some pre-defined functions:
# `genomic_corr_reldist`, `genomic_corr_absdist` measure how two sets of genomic regions are close; `genomic_corr_jaccard`,
# `genomic_corr_intersect` measures how two sets of genomic regions are overlapped.
# The self-defined function should accept at least two arguments which are two GRanges object.
# The third argument is ``...`` which is passed from the main function. The function
# should only return a numeric value.
# -... pass to ``stat_fun``
# -bedtools_binary random shuffling is perfomed by ``bedtools``. If ``bedtools`` is not in ``PATH``, the path of ``bedtools`` can be set here.
# -tmpdir dir for temporary files
#
# == details
# The correlation between two sets of genomic regions basically means how much the first type of genomic regions
# are overlapped or close to the second type of genomic regions.
#
# The significance of the correlation is calculated by random shuffling the regions.
# In random shuffling, regions in ``gr_list_1`` will be shuffled. So if you want to shuffle ``gr_list_2``,
# just switch the first two arguments.
#
# Pleast note random shuffling is done by "bedtools", so "bedtools" should be installed and exists in ``PATH``
# and should support ``-i -g -incl`` options.
#
# This function is very time-consuming.
#
# == value
# A list containing following elements:
#
# -stat statistic value
# -fold_change stat/E(stat), stat divided by expected value which is generated from random shuffling
# -p.value p-value for over correlated. So, 1 - p.value is the significance for being less correlated
# -stat_random_mean mean value of stat in random shuffling
# -stat_random_sd standard deviation in random shuffling
#
# If ``perm`` is set to 0 or 1, ``fold_change``, ``p.value``, ``stat_random_mean`` and ``stat_random_sd`` are all ``NULL``.
#
# == seealso
# `genomic_corr_reldist`, `genomic_corr_jaccard`, `genomic_corr_absdist`, `genomic_corr_intersect`,
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# gr1 = GRanges(seqname = "chr1", ranges = IRanges(start = c(4, 10), end = c(6, 16)))
# gr2 = GRanges(seqname = "chr1", ranges = IRanges(start = c(7, 13), end = c(8, 20)))
# genomic_regions_correlation(gr1, gr2, nperm = 0)
# genomic_regions_correlation(list(gr1 = gr1), list(gr2 = gr2), nperm = 0)
#
genomic_regions_correlation = function(gr_list_1, gr_list_2, background = NULL,
chromosome = paste0("chr", c(1:22, "X", "Y")), species = "hg19",
nperm = 0, mc.cores = 1, stat_fun = genomic_corr_jaccard, ...,
bedtools_binary = Sys.which("bedtools"), tmpdir = tempdir()) {
## check input
if(inherits(gr_list_1, "GRanges")) {
gr_name_1 = deparse(substitute(gr_list_1))
gr_list_1 = list(gr_list_1)
names(gr_list_1) = gr_name_1
}
if(inherits(gr_list_2, "GRanges")) {
gr_name_2 = deparse(substitute(gr_list_2))
gr_list_2 = list(gr_list_2)
names(gr_list_2) = gr_name_2
}
if(is.null(names(gr_list_1))) {
stop("`gr_list_1` should have names.\n")
}
if(is.null(names(gr_list_2))) {
stop("`gr_list_2` should have names.\n")
}
if(nperm > 1) {
if(!file.exists(bedtools_binary) && nperm > 1) {
stop("Cannot find binary file for bedtools.")
}
}
message("set strand to * and merge potential overlapped regions in gr_list_1.")
gr_list_1 = lapply(gr_list_1, function(gr) {
strand(gr) = "*"
reduce(sort(gr))
})
message("set strand to * and merge potential overlapped regions in gr_list_2.")
gr_list_2 = lapply(gr_list_2, function(gr) {
strand(gr) = "*"
reduce(sort(gr))
})
# limit in chromosomes
message("subset regions in selected chromosomes.")
gr_list_1 = lapply(gr_list_1, function(gr) gr[ seqnames(gr) %in% chromosome])
gr_list_2 = lapply(gr_list_2, function(gr) gr[ seqnames(gr) %in% chromosome])
gr_name_1 = names(gr_list_1)
gr_name_2 = names(gr_list_2)
# limit in background
if(!is.null(background)) {
strand(background) = "*"
background = background[ seqnames(background) %in% chromosome ]
background = reduce(background)
message("overlaping `gr_list_1` to background")
gr_list_1 = lapply(gr_list_1, function(gr) {
intersect(gr, background)
})
message("overlaping `gr_list_2` to background")
gr_list_2 = lapply(gr_list_2, function(gr) {
intersect(gr, background)
})
# since `background` will be send to bedtoos, here we need a data frame
background_df = as.data.frame(background)[1:3]
}
# prepare values that will be returned
foldChange = matrix(0, nrow = length(gr_list_2), ncol = length(gr_list_1))
rownames(foldChange) = names(gr_list_2)
colnames(foldChange) = names(gr_list_1)
p = foldChange
stat = foldChange
stat_random_mean = stat
stat_random_sd = stat
if(nperm > 1) {
chr_len_df = getChromInfoFromUCSC(species)
chr_len_df = chr_len_df[chr_len_df[[1]] %in% chromosome, , drop = FALSE] # needed for bedtools shuffle
# cache chr_len_df and background files
chr_len_df_tmp = tempfile(tmpdir = tmpdir)
write.table(chr_len_df, file = chr_len_df_tmp, sep = "\t", row.names = FALSE, col.names= FALSE, quote = FALSE)
}
if(!is.null(background)) {
background_df_tmp = tempfile(tmpdir = tmpdir)
write.table(background_df, file = background_df_tmp, sep = "\t", row.names = FALSE, col.names= FALSE, quote = FALSE)
}
# loop in gr_list_1
for(i in seq_along(gr_list_1)) {
stat_random = matrix(0, nrow = length(gr_list_2), ncol = nperm)
# stat
for(j in seq_along(gr_list_2)) {
message(qq("calculating correlation between @{gr_name_1[i]} and @{gr_name_2[[j]]}"))
stat[j, i] = suppressWarnings(do.call("stat_fun", list(gr_list_1[[i]], gr_list_2[[j]], ...)))
}
if(nperm > 1) {
# random shuffle gr_list_1
# cache gr_list_1
gr_list_1_df = as.data.frame(gr_list_1[[i]])[1:3]
gr_list_1_df_tmp = tempfile()
write.table(gr_list_1_df, file = gr_list_1_df_tmp, sep = "\t", row.names = FALSE, col.names= FALSE, quote = FALSE)
res = mclapply(seq_len(nperm), mc.cores = mc.cores, function(k) {
if(is.null(background)) {
gr_random = systemdf(qq("'@{bedtools_binary}' shuffle -i '@{gr_list_1_df_tmp}' -g '@{chr_len_df_tmp}'"))
} else {
gr_random = systemdf(qq("'@{bedtools_binary}' shuffle -i '@{gr_list_1_df_tmp}' -g '@{chr_len_df_tmp}' -incl '@{background_df_tmp}'"))
}
gr_random = GRanges(seqnames = gr_random[[1]], ranges = IRanges(gr_random[[2]], gr_random[[3]]))
# x contains stat for every gr_list_2
x = numeric(length(gr_list_2))
for(j in seq_along(gr_list_2)) {
message(qq("calculating correlation between random_@{gr_name_1[i]} and @{gr_name_2[[j]]}, @{k}/@{nperm}"))
x[j] = suppressWarnings(do.call("stat_fun", list(gr_random, gr_list_2[[j]], ...)))
}
return(x)
})
# `res` is a list, convert to a matrix
for(k in seq_along(res)) {
stat_random[, k] = res[[k]]
}
file.remove(gr_list_1_df_tmp)
stat_random_mean[, i] = rowMeans(stat_random)
stat_random_sd[, i] = rowSds(stat_random)
foldChange[, i] = stat[, i]/stat_random_mean[, i]
p[, i] = sapply(seq_len(length(gr_list_2)), function(k) sum(stat_random[k, ] > stat[k, i])/nperm)
} else {
stat_random_mean[, i] = NA
stat_random_sd[, i] = NA
foldChange[, i] = NA
p[, i] = NA
}
}
if(nperm > 1) {
file.remove(chr_len_df_tmp)
}
if(!is.null(background)) {
file.remove(background_df_tmp)
}
if(nperm <= 1) {
foldChange = NULL
p = NULL
stat_random_mean = NULL
stat_random_sd = NULL
z = NULL
} else {
z = (stat - stat_random_mean)/stat_random_sd
}
res = list(stat = stat,
fold_change = foldChange,
p.value = p,
stat_random_mean = stat_random_mean,
stat_random_sd = stat_random_sd,
z = z)
return(res)
}
# == title
# Relative distance between two sets of genomic regions
#
# == param
# -gr1 genomic region 1, a `GenomicRanges::GRanges` object
# -gr2 genomic region 2, a `GenomicRanges::GRanges` object
#
# == details
# For regions in ``gr1`` and ``gr2``, they are all degenerated as single points
# which are the middle points of regions. For each middle point in ``gr1``, it looks
# for two nearest points in ``gr2`` on its left and right. The statistic is defined as the ratio of the distance
# to the nearest neighbour point to the distance of two neighbour points. If ``gr1`` and ``gr2`` are not correlated at all,
# It is expected that the ratio follows a uniform distribution. So final statisitic are the KS-statistic
# between the real distribution of rations to the uniform distribution.
#
# == reference
# Favoriv A, et al. Exploring massive, genome scale datasets with the GenometriCorr package. PLoS Comput Biol. 2012 May; 8(5):e1002529
#
# == value
# A single correlation value.
#
# == seealso
# `genomic_regions_correlation`
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# gr1 = GRanges(seqnames = "chr1", ranges = IRanges(c(1, 5), c(3, 8)))
# gr2 = GRanges(seqnames = "chr1", ranges = IRanges(c(2, 6), c(4, 8)))
# genomic_corr_reldist(gr1, gr2)
#
genomic_corr_reldist = function(gr1, gr2) {
# GRanges for mid-points
gr1_mid = ceiling( (start(gr1) + end(gr1))/2 )
gr2_mid = ceiling( (start(gr2) + end(gr2))/2 )
# we don't need strand information here
gr1_mid_gr = GRanges(seqnames = seqnames(gr1),
ranges = IRanges(start = gr1_mid,
end = gr1_mid))
gr2_mid_gr = GRanges(seqnames = seqnames(gr2),
ranges = IRanges(start = gr2_mid,
end = gr2_mid))
gr2_mid_gr = sort(gr2_mid_gr)
# look for nearest position
mtch = nearest(gr1_mid_gr, gr2_mid_gr) # index in gr2
l = !is.na(mtch) # if there is no site in gr2 on some chromosome, the value would be NA
gr1_mid_gr = gr1_mid_gr[l] # remove these regions in gr1
m2 = gr2_mid_gr[ mtch[l] ]
mtch = mtch[l]
# look for another point in gr2
ind = ifelse(start(gr1_mid_gr) > start(m2), mtch + 1, mtch - 1) # ind contains index in gr2
l = ind >= 1 & ind <= length(gr2)
m3 = gr2_mid_gr[ ind[l] ]
gr1_mid_gr = gr1_mid_gr[l]
m2 = m2[l]
suppressWarnings(d <- distance(gr1_mid_gr, m2) / distance(m2, m3))
d = d[!is.na(d) & !is.infinite(d)] # this is not necessary, just double check
stat = 0
try(stat <- (integrate(ecdf(d), lower = 0, upper = 0.5)$value - 0.25) / 0.25, silent = TRUE)
return(stat)
}
# == title
# Jaccard coefficient between two sets of genomic regions
#
# == param
# -gr1 genomic region 1, a `GenomicRanges::GRanges` object
# -gr2 genomic region 2, a `GenomicRanges::GRanges` object
# -background background regions that should be only looked in, a `GenomicRanges::GRanges` object
#
# == details
# Jaccard coefficient is defined as the total length of intersection divided by total
# length of union of two sets of genomic regions.
#
# You can set the background when calculating Jaccard coefficient. For example,
# if the interest is the Jaccard coefficient between CpG sites in ``gr1`` and in ``gr2``
# ``background`` can be set with a `GenomicRanges::GRanges` object which contains positions of CpG sites.
#
# Be careful with the ``strand`` in your `GenomicRanges::GRanges` object!
#
# == value
# A single correlation value.
#
# == seealso
# `genomic_regions_correlation`
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# gr1 = GRanges(seqnames = "chr1", ranges = IRanges(c(1, 5), c(3, 8)))
# gr2 = GRanges(seqnames = "chr1", ranges = IRanges(c(2, 6), c(4, 8)))
# genomic_corr_jaccard(gr1, gr2)
#
genomic_corr_jaccard = function(gr1, gr2, background = NULL) {
if(is.null(background)) {
res = sum(as.numeric(width(intersect(gr1, gr2)))) / sum(as.numeric(width(union(gr1, gr2))))
} else {
gr1 = intersect(gr1, gr2)
gr1 = intersect(gr1, background)
gr2 = union(gr1, gr2)
gr2 = intersect(gr2, background)
res = sum(as.numeric(width(gr1))) / sum(as.numeric(width(gr2)))
}
return(res)
}
# == title
# Absolute distance between two sets of genomic regions
#
# == param
# -gr1 genomic region 1, a `GenomicRanges::GRanges` object
# -gr2 genomic region 2, a `GenomicRanges::GRanges` object
# -method function in which input is a vector of distance and output is a scalar
# -... pass to ``method``
#
# == details
# For regions in ``gr1`` and ``gr2``, they are all degenerated as single points
# which are the middle points of corresponding regions. For each middle point in ``gr1``, it looks
# for the nearest point in ``gr2``. Assuming the distance vector is ``d``, the final statistic is ``method(d)``.
#
# == reference
# Favoriv A, et al. Exploring massive, genome scale datasets with the GenometriCorr package. PLoS Comput Biol. 2012 May; 8(5):e1002529
#
# == value
# A single correlation value.
#
# == seealso
# `genomic_regions_correlation`
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# gr1 = GRanges(seqnames = "chr1", ranges = IRanges(c(1, 5), c(3, 8)))
# gr2 = GRanges(seqnames = "chr1", ranges = IRanges(c(2, 6), c(4, 8)))
# genomic_corr_absdist(gr1, gr2)
#
genomic_corr_absdist = function(gr1, gr2, method = mean, ...) {
# GRanges for mid-points
gr1_mid = ceiling( (start(gr1) + end(gr1))/2 )
gr2_mid = ceiling( (start(gr2) + end(gr2))/2 )
gr1_mid_gr = GRanges(seqnames = seqnames(gr1),
ranges = IRanges(start = gr1_mid,
end = gr1_mid))
gr2_mid_gr = GRanges(seqnames = seqnames(gr2),
ranges = IRanges(start = gr2_mid,
end = gr2_mid))
# look for nearest position
suppressWarnings(mtch <- distanceToNearest(gr1_mid_gr, gr2_mid_gr))
dst = mtch@elementMetadata[, 1]
dst = dst[!is.na(dst)]
stat = method(as.numeric(dst), ...)
return(stat)
}
# == title
# Intersections between two sets of genomic regions
#
# == param
# -gr1 genomic region 1, a `GenomicRanges::GRanges` object
# -gr2 genomic region 2, a `GenomicRanges::GRanges` object
# -method how to calculate the intersection statistic, see "details"
# -... pass to `GenomicRanges::countOverlaps` or `percentOverlaps`
#
# == details
# There are three metrics for the intersection statistic:
#
# -number It calculates number of regions in ``gr1`` that overlap with ``gr2``.
# Please note this value is not equal to the number of intersections betweenn two sets of regions,
# because one region in ``gr1`` may overlap with more than one
# regions in ``gr2``.
# -percent It calculates for each region in ``gr1``, how much it is covered by regions in ``gr2``.
# -length sum of length of the intersection of the two sets of regions.
#
# With methods of"number" and "percent", ``genomic_corr_intersect(gr1, gr2)`` is always not identical
# to ``genomic_corr_intersect(gr2, gr1)``.
#
# == value
# A single correlation value.
#
# == seealso
# `genomic_regions_correlation`
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# gr1 = GRanges(seqnames = "chr1", ranges = IRanges(c(1, 5), c(3, 8)))
# gr2 = GRanges(seqnames = "chr1", ranges = IRanges(c(2, 6), c(4, 8)))
# genomic_corr_intersect(gr1, gr2, method = "number")
# genomic_corr_intersect(gr1, gr2, method = "percent")
# genomic_corr_intersect(gr1, gr2, method = "length")
genomic_corr_intersect = function(gr1, gr2, method = c("number", "percent", "length"), ...) {
method = match.arg(method)[1]
if(method == "number") {
x = countOverlaps(gr1, gr2, ...)
res = sum(x > 0)
} else if(method == "percent") {
x = percentOverlaps(gr1, gr2, ...)
w = width(gr1)
res = sum(x*w)/sum(as.numeric(w))
} else if(method == "length") {
x = intersect(gr1, gr2)
res = sum(as.numeric(width(x)))
}
return(res)
}
make_ecdf = function(x) {
x = sort(x)
env = new.env()
env$fun = ecdf(x)
env$range = c(x[1], x[length(x)])
env
}
max_reduce_of_significance = function(x0, cdf, alpha = 0.05, plot = FALSE) {
x = seq(cdf$range[1], cdf$range[2], length = 1000)
p = cdf$fun(x)
ind = 2:1000
null_mean = sum((x[ind] + x[ind-1])/2 * (p[ind] - p[ind-1]))
fc = x0/null_mean
if(x0 <= null_mean) {
return(data.frame(p = 1 - cdf$fun(x0),
fold_change = fc,
null_mean = null_mean,
max_offset = NA,
rel_change = NA,
alpha = NA))
}
s = seq(0, x0 - null_mean, length = 1000)
p = sapply(s, function(x) {
1 - cdf$fun(x0 - x)
})
x = seq(cdf$range[1], cdf$range[2], length = 1000)
if(plot) plot(x, cdf$fun(x), type = "l", ylab = "P(X >= x)")
if(plot) abline(v = x0)
ind = which(p < alpha)
if(length(ind) == 0) {
return(data.frame(p = 1 - cdf$fun(x0),
fold_change = fc,
null_mean = null_mean,
max_offset = NA,
rel_change = NA,
alpha = NA))
} else {
if(plot) abline(v = x0 - s[length(ind)], col = "red", lty = 2)
max_offset = s[length(ind)]
}
return(data.frame(p = 1 - cdf$fun(x0),
fold_change = fc,
null_mean = null_mean,
max_offset = max_offset,
rel_change = max_offset/x0,
alpha = 1 - cdf$fun(x0 - max_offset)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.