#' @title Density matrix builder
#'
#' @description A function (completely in R) that generates a matrix given a list of regions (.bed files) and signals (.bigWig files) alternative (even though more time consuming) to \link{computeMatrix.deeptools}. The output can be passed as it is to the functions \link{plot.density.profile}, \link{plot.density.summary} and, \link{plot.density.differences}.
#'
#' @param mode A string indicating the method for the matrix computation:
#' \itemize{
#' \item \code{scale-regions} all regions in the BED file are stretched or shrunken to the length (in bases) indicated by the user (\code{body.length});
#' \item \code{reference-point} the matrix will be performed on the range -upstream+downstream from the indicated reference point (center, TSS, TES).
#' }
#' @param regions.list A string vector with a list of full paths to bed files or list of data.frames in at least BED3 format (eg. generated by \link{build.bed}).
#' @param samples.list A string vector with a list of full paths to bigWig files.
#' @param region.names A string vector with the names of the regions. If \code{NULL} or of length lower than the number of regions the names will be assigned using the basename of the file if a path is provided otherwise "region_<order number>". By default \code{NULL}.
#' @param sample.names A string vector with the names of the samples. If \code{NULL} or of length lower than the number of samples the names will be assigned using the basename of the file. By default \code{NULL}.
#' @param sort.regions.coordinates Logical value to define whether the output matrix should contain the regions sorted by genomic location for each region group (sorted by \link{sort.bed}). By default \code{FALSE}.
#' @param reference.point The reference point for the matrix generation could be either the region start (\code{"TSS"}), the region end (\code{"TES"}) or the \code{"center"} of the region. By default \code{"center"}.
#' @param reference.point.label A single string with the label for the reference point that could be used for the plots.
#' @param upstream Distance, in bases (bp), upstream of the reference-point, in "reference-point" mode, or the region start, in "scale-regions" mode. By default \code{500}.
#' @param downstream Distance, in bases (bp), downstream of the reference-point, in "reference-point" mode, or the region start, in "scale-regions" mode. By default \code{500}.
#' @param body.length Distance, in bases (bp), to which all regions will be fit. By default: \code{1000}.
#' @param missing.data.as.zero A logical value to define whether missing data (NAs) should be treated as zeros. By default \code{FALSE}.
#' @param bin.size Length, in bases (bp), of the non-overlapping bins for averaging the score over the regions length. By default \code{10}.
#' @param binning.operation A single string to define the type of statistic that should be used over the bin size range. The options are: "mean", "median", "sum". By default \code{"mean"}.
#' @param stranded Logical value to indicate whether the strand of the region should be taken into account. When \code{TRUE}, the order of the bigWig score for the given region will be reversed. Default \code{FALSE}.
#'
#' @return The function returns a named list containing:
#' \itemize{
#' \item \code{metadata} data.frame with the parameters used to build the matrix;
#' \item \code{matrix.data} data.frame with the computed scores;
#' \item \code{original.file.path} with the string: "Matrix generated by Rseb::density.matrix()".
#' }
#' This list can be passed as it is to the functions \link{plot.density.profile}, \link{plot.density.summary} and, \link{plot.density.differences}.
#'
#' @export density.matrix
density.matrix = function(mode,
regions.list,
samples.list,
region.names = NULL,
sample.names = NULL,
sort.regions.coordinates = FALSE,
reference.point = "center",
reference.point.label = NULL,
upstream = 500,
downstream = 500,
body.length = 1000,
missing.data.as.zero = FALSE,
bin.size = 10,
binning.operation = "mean",
stranded = FALSE) { # BEGIN FUNCTION
##########################################################################################
# Check if Rseb is up-to-date #
Rseb::actualize(update = F, verbose = F)
##########################################################################################
# CHECK PARAMETERS
##########################################################################################
# Regions names
if ((length(region.names) < length(regions.list)) | is.null(region.names)) {
message("The number of region.names is NULL or lower than the number of provided regions files. The region.names will automatically redifined.")
if (class(regions.list) == "character") {
region.names =
sapply(regions.list, function(x){
if (class(x) == "character") {return(gsub(x = basename(x), pattern = "[.].[A-z]*$", replacement = ""))} else {
return(paste("region", which(sapply(regions.list, function(y) {y == x}), sapply(regions.list, function(y) {y == x}) == TRUE), sep="_"))
}
})
} else {
if ("data.frame" %in% class(regions.list)) {regions.list = list(regions.list)}
region.names = paste0("region_", c(1:length(regions.list)))
}
}
# Sample names
if (length(sample.names) < length(samples.list) | is.null(sample.names)) {
message("The number of sample.names is NULL or lower than the number of provided regions files. The sample.names will automatically redifined.")
sample.names = sapply(samples.list, function(x){return(gsub(x = basename(x), pattern = "[.].[A-z]*$", replacement = ""))})
}
# Reference point
if (!(tolower(mode) %in% c("reference-point", "scale-regions"))) {
return(warning("The mode must be one among 'reference-point' and 'scale-regions'."))
}
# Check bin.size compatibility
if (mode == "reference-point") {
if ((upstream+downstream) %% bin.size != 0) {return(warning("The region size (upstream + downstream) must be multiple of the bin.size"))}
} else {
if (upstream %% bin.size != 0) {return(warning("The upstream region size must be multiple of the bin.size"))}
if (downstream %% bin.size != 0) {return(warning("The downstream region size must be multiple of the bin.size"))}
if (body.length %% bin.size != 0) {return(warning("The body.length region size must be multiple of the bin.size"))}
}
# Reference-point
if (!(tolower(reference.point) %in% c("tss", "tes", "center")) | length(reference.point) > 1) {
return(warning("The reference.point must be one among 'TSS', 'TES', 'center'. Default: 'center'."))
}
# Reference-point-labels
if (is.null(reference.point.label)) {
reference.point.label = reference.point
} else if (length(reference.point.label) > 1) {
return(warning("The reference point label must be a single string."))
}
# Set ranges as positive values
upstream = abs(upstream)
downstream = abs(downstream)
bin.size = abs(bin.size)
body.length = ifelse(test = mode == "reference-point",
yes = 0,
no = abs(body.length))
# Binning.operation check
if (!(tolower(binning.operation) %in% c("mean", "median", "sum"))) {
return(warning("The reference.point must be one among 'mean', 'median', 'sum'. Default: 'mean'."))
}
##########################################################################################
# INTERNAL FUNCTIONS DEFINITION
##########################################################################################
###############
import_bed = function(bed, mode, upstream, downstream, reference.point, sort.regions.coordinates) {
# Required libraries
require(GenomicRanges)
require(dplyr)
# importing bed file/regions
if (class(bed) == "character") {regions = read.delim(file = bed, sep = "\t", header = F)
} else if (class(bed) %in% c("data.frame", "tibble")) {regions = as.data.frame(bed)}
if (sort.regions.coordinates == TRUE) {
bed = Rseb::sort.bed(bed = bed, return.bed = T)
}
# Normalize bed file to a BED6
if (class(regions) == "data.frame") {
if (ncol(regions) >= 6) {regions = regions[,1:6]
} else if (ncol(regions) >= 3 & ncol(regions) < 6) {
if (ncol(regions) == 3) {
regions =
regions %>%
dplyr::mutate(name = paste("region", 1:nrow(regions), sep="_"),
score = 0,
strand = ".")
} else if (length(regions) == 4) {
regions =
regions %>%
dplyr::mutate(score = 0,
strand = ".")
} else if (length(regions) == 5) {
regions = regions %>% dplyr::mutate(strand = ".")
}
}
}
colnames(regions) = c("chr", "start", "end", "name", "score", "strand")
regions$strand = sapply(regions$strand, function(x){ifelse(test = x %in% c("+", "-", "*"),
yes = x,
no = "*")})
# Redefine regions depending on mode
if (mode == "reference-point") {
if (tolower(reference.point) == "center") {
reference.position = round((regions$start + regions$end) / 2)
} else if (toupper(reference.point) == "TSS") {
reference.position = ifelse(test = regions$strand == "-",
yes = regions$end,
no = regions$start)
} else if (toupper(reference.point) == "TES") {
reference.position = ifelse(test = regions$strand == "-",
yes = regions$start,
no = regions$end)
}
start = ifelse(test = regions$strand == "-",
yes = reference.position - downstream + 1,
no = reference.position - upstream)
end = ifelse(test = regions$strand == "-",
yes = reference.position + upstream,
no = reference.position + downstream - 1)
} else if (mode == "scale-regions") {
start = ifelse(test = regions$strand == "-",
yes = regions$start - downstream,
no = regions$start - upstream)
end = ifelse(test = regions$strand == "-",
yes = regions$end + upstream,
no = regions$end + downstream)
}
# Convert regions to GRanges object
return(unique(GRanges(seqnames = regions$chr,
ranges = IRanges(start = start,
end = end,
names = regions$name),
strand = regions$strand,
score = regions$score)))
} # end import.bed
###########
get.single.base.scores = function(bigWig, regions, missing.data.as.zero) {
require(rtracklayer)
score_list = list()
for (i in 1:length(regions)) {
# Chek if the bigWig chrom names are the same than in the bed regions
if (suppressWarnings(inherits(try(import(BigWigFile(bigWig), selection = regions[i], as = 'NumericList')[[1]],
silent = TRUE),
"try-error"))) { # it is TRUE when it does not work due to chromosome names in the bigWig != bed
regions[i] = diffloop::rmchr(regions[i])
}
score_list[[i]] = import(BigWigFile(bigWig), selection = regions[i], as = 'NumericList')[[1]]
if ((strand(regions[i])@values[1] == "-") & stranded == TRUE) {score_list[[i]] = rev(score_list[[i]])}
}
if (missing.data.as.zero == TRUE) {purrr::pmap(.l = score_list,
.f = function(score_list){
score_list[is.na(score_list)] = 0
return(score_list)})}
if (missing.data.as.zero == TRUE) {
lapply(score_list,
function(x){
x[is.na(x)] = 0
return(x)})
}
return(score_list)
} # end get.single.base.scores
###########
compute.bins = function(score_list,
mode,
bin.size,
binning.operation,
upstream,
downstream,
body.length) {
### reference point
if (mode == "reference-point") {
# Build a matrix
matrix = do.call(rbind, score_list)
# Compute binning (when bin.size > 1)
if (binning.operation == "mean" & bin.size != 1) {
bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+downstream), by = bin.size),
bin.right = seq(bin.size, (upstream+downstream), by = bin.size)),
.f = function(bin.left, bin.right) {rowMeans(matrix[,bin.left:bin.right], na.rm = F)})
return(do.call(cbind, bin.score.list))
} else if (binning.operation == "sum" & bin.size != 1) {
bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+downstream), by = bin.size),
bin.right = seq(bin.size, (upstream+downstream), by = bin.size)),
.f = function(bin.left, bin.right) {rowSums(matrix[,bin.left:bin.right], na.rm = F)})
return(do.call(cbind, bin.score.list))
} else if (binning.operation == "median" & bin.size != 1) {
bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+downstream), by = bin.size),
bin.right = seq(bin.size, (upstream+downstream), by = bin.size)),
.f = function(bin.left, bin.right) {robustbase::rowMedians(matrix[,bin.left:bin.right], na.rm = F)})
return(do.call(cbind, bin.score.list))
} else {return(matrix)}
### scale-regions
} else if (mode == "scale-regions") {
### Re-scaling of the body regions
# Extract different parts values
upstream.scores = lapply(score_list, function(x){x[1:upstream]})
body.scores = lapply(score_list, function(x){x[-c(1:upstream, (length(x) - downstream + 1):length(x))]})
downstream.scores = lapply(score_list, function(x){x[c((length(x) - downstream + 1):length(x))]})
# Get at least..body.length-size vectors
resized.body.scores = lapply(body.scores, function(x){if (length(x) < body.length){return(rep(x, each = ceiling(body.length/length(x))))} else {return(x)}})
# Split in body.length chunks and calculate the mean for each chunk
rescaled.scores = lapply(resized.body.scores, function(x){
scaled = sapply(split(x, ceiling(seq_along(x)/(length(x)/body.length))), mean, USE.NAMES = F)
if (length(scaled) > body.length) {scaled = scaled[body.length] = mean(scaled[c(body.length:length(scaled))])}
return(scaled[1:body.length])
})
# Merge (upstream + body.length + downstream) matrix
matrix = do.call(rbind, purrr::pmap(.l = list(up = upstream.scores, body = rescaled.scores, down = downstream.scores),
.f = function(up, body, down) {return(c(up, body, down))}))
# Compute binning
if (binning.operation == "mean" & bin.size != 1) {
bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+body.length+downstream), by = bin.size),
bin.right = seq(bin.size, (upstream+body.length+downstream), by = bin.size)),
.f = function(bin.left, bin.right) {rowMeans(matrix[,bin.left:bin.right], na.rm = F)})
return(do.call(cbind, bin.score.list))
} else if (binning.operation == "sum" & bin.size != 1) {
bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+body.length+downstream), by = bin.size),
bin.right = seq(bin.size, (upstream+body.length+downstream), by = bin.size)),
.f = function(bin.left, bin.right) {rowSums(matrix[,bin.left:bin.right], na.rm = F)})
return(do.call(cbind, bin.score.list))
} else if (binning.operation == "median" & bin.size != 1) {
bin.score.list = purrr::pmap(.l = list(bin.left = seq(1, (upstream+body.length+downstream), by = bin.size),
bin.right = seq(bin.size, (upstream+body.length+downstream), by = bin.size)),
.f = function(bin.left, bin.right) {robustbase::rowMedians(matrix[,bin.left:bin.right], na.rm = F)})
return(do.call(cbind, bin.score.list))
} else {return(matrix)}
} # end scale-regions
} # end compute.bins
##########################################################################################
# COMPUTE WHOLE MATRIX
##########################################################################################
regions.boundaries = 0
group.matrices = list()
for (r in 1:length(regions.list)) {
current.bed = import_bed(bed = regions.list[[r]],
mode = mode,
upstream = upstream,
downstream = downstream,
reference.point = reference.point,
sort.regions.coordinates = sort.regions.coordinates)
regions.boundaries = c(regions.boundaries, length(current.bed) + regions.boundaries[length(regions.boundaries)])
all.samples.matrces = list()
for (s in 1:length(samples.list)) {
all.samples.matrces[[s]] = compute.bins(score_list = get.single.base.scores(bigWig = samples.list[s],
regions = current.bed,
missing.data.as.zero = missing.data.as.zero),
mode = mode,
bin.size = bin.size,
binning.operation = binning.operation,
upstream = upstream,
downstream = downstream,
body.length = body.length)
}
group.matrices[[r]] = cbind(as.data.frame(current.bed, row.names = c()),
as.data.frame(do.call(cbind, all.samples.matrces)))
} # end matrices list generation
whole.matrix = do.call(rbind, group.matrices)
# Define sample boundaries
samples.boundaries = 0
length_samples = ifelse(test = mode == "reference-point",
yes = (upstream + downstream)/bin.size,
no = (upstream + body.length + downstream)/bin.size)
for (i in 1:length(samples.list)) {samples.boundaries = c(samples.boundaries, length_samples + samples.boundaries[length(samples.boundaries)])}
##########################################################################################
# METADATA TABLE
##########################################################################################
metadata = data.frame(upstream = paste(rep(upstream, length(samples.list)), collapse=","),
downstream = paste(rep(upstream, length(samples.list)), collapse=","),
body = ifelse(test = mode == "scale-regions",
yes = paste(rep(body.length, length(samples.list)), collapse=","),
no = paste(rep(0, length(samples.list)), collapse=",")),
bin_size = paste(rep(bin.size, length(samples.list)), collapse=","),
ref_point = paste(rep(reference.point.label, length(samples.list)), collapse=","),
verbose = "false",
bin_avg_type = binning.operation,
missing_data_as_zero = tolower(as.character(missing.data.as.zero)),
min_threshold = "null",
max_threshold = "null",
scale = 1,
skip_zeros = "false",
nan_after_end = "false",
proc_number = parallel::detectCores(),
sort_regions = ifelse(test = sort.regions.coordinates == TRUE, yes = "sorted", no = "keep"),
sort_using = ifelse(test = sort.regions.coordinates == TRUE, yes = "coordinates", no = "null"),
unscaled_5_prime = "null",
unscaled_3_prime = "null",
group_labels = paste(region.names, collapse = ","),
group_boundaries = paste(regions.boundaries, collapse = ","),
sample_labels = paste(sample.names, collapse = ","),
sample_boundaries = paste(samples.boundaries, collapse = ",")
)
##########################################################################################
# OUTPUT
##########################################################################################
return(list(metadata = data.frame(parameters = colnames(metadata),
values = t(metadata)[,1],
row.names = c()),
matrix.data = whole.matrix,
original.file.path = "Matrix generated by Rseb::density.matrix()"))
} # END FUNCTION
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.