# == title
# Normalize Associations between Genomic Signals and Target Regions into a Matrix
#
# == param
# -signal A `GenomicRanges::GRanges-class` object.
# -target A `GenomicRanges::GRanges-class` object.
# -extend Extended base pairs to the upstream and/or downstream of ``target``. It can be a vector of length one or two.
# Length one means same extension to the upstream and downstream.
# -w Window size for splitting upstream and downstream, measured in base pairs
# -value_column Column index in ``signal`` that is mapped to colors. If it is not set, it assumes values for all signal regions are 1.
# -mapping_column Mapping column to restrict overlapping between ``signal`` and ``target``. By default it tries to look for
# all regions in ``signal`` that overlap with every target.
# -background Values for windows that don't overlap with ``signal``.
# -empty_value Deprecated, please use ``background`` instead.
# -mean_mode When a window is not perfectly overlapped to ``signal``, how to summarize
# values to the window. See 'Details' section for a detailed explanation.
# -include_target Whether include ``target`` in the heatmap? If the width of all regions in ``target`` is 1, ``include_target``
# is enforced to ``FALSE``.
# -target_ratio The ratio of ``target`` columns in the normalized matrix. If the value is 1, ``extend`` will be reset to 0.
# -k Number of windows only when ``target_ratio = 1`` or ``extend == 0``, otherwise ignored.
# -smooth Whether apply smoothing on rows in the matrix?
# -smooth_fun The smoothing function that is applied to each row in the matrix. This self-defined function accepts a numeric
# vector (may contain ``NA`` values) and returns a vector with same length. If the smoothing is failed, the function
# should call `base::stop` to throw errors so that `normalizeToMatrix` can catch how many rows are failed in smoothing.
# See the default `default_smooth_fun` for example.
# -keep Percentiles in the normalized matrix to keep. The value is a vector of two percent values. Values less than the first
# percentile is replaces with the first pencentile and values larger than the second percentile is replaced with the
# second percentile.
# -limit Similar as ``keep``, but it provides boundary for absolute values. The value should be a vector of length two.
# -trim Deprecated, please use ``keep`` instead.
# -flip_upstream Sometimes whether the signals are on the upstream or the downstream of the targets
# are not important and users only want to show the relative distance to targets. If the value is set
# to ``TRUE``, the upstream part in the normalized matrix is flipped and added to the downstream part
# The flipping is only allowed when the targets are single-point targets or the targets are excluded
# in the normalized matrix (by setting ``include_target = FALSE``). If the extension for the upstream
# and downstream is not equal, the smaller extension is used for the final matrix.
# -verbose Whether to print help messages.
#
# == details
# In order to visualize associations between ``signal`` and ``target``, the data is transformed into a matrix
# and visualized as a heatmap by `EnrichedHeatmap` afterwards.
#
# Upstream and downstream also with the target body are splitted into a list of small windows and overlap
# to ``signal``. Since regions in ``signal`` and small windows do not always 100 percent overlap, there are four different averaging modes:
#
# Following illustrates different settings for ``mean_mode`` (note there is one signal region overlapping with other signals):
#
# 40 50 20 values in signal regions
# ++++++ +++ +++++ signal regions
# 30 values in signal region
# ++++++ signal region
# ================= a window (17bp), there are 4bp not overlapping to any signal regions.
# 4 6 3 3 overlap
#
# absolute: (40 + 30 + 50 + 20)/4
# weighted: (40*4 + 30*6 + 50*3 + 20*3)/(4 + 6 + 3 + 3)
# w0: (40*4 + 30*6 + 50*3 + 20*3)/(4 + 6 + 3 + 3 + 4)
# coverage: (40*4 + 30*6 + 50*3 + 20*3)/17
#
# == value
# A matrix with following additional attributes:
#
# -``upstream_index`` column index corresponding to upstream of ``target``
# -``target_index`` column index corresponding to ``target``
# -``downstream_index`` column index corresponding to downstream of ``target``
# -``extend`` extension on upstream and downstream
# -``smooth`` whether smoothing was applied on the matrix
# -``failed_rows`` index of rows which are failed after smoothing
#
# The matrix is wrapped into a simple ``normalizeToMatrix`` class.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# signal = GRanges(seqnames = "chr1",
# ranges = IRanges(start = c(1, 4, 7, 11, 14, 17, 21, 24, 27),
# end = c(2, 5, 8, 12, 15, 18, 22, 25, 28)),
# score = c(1, 2, 3, 1, 2, 3, 1, 2, 3))
# target = GRanges(seqnames = "chr1", ranges = IRanges(start = 10, end = 20))
# normalizeToMatrix(signal, target, extend = 10, w = 2)
# normalizeToMatrix(signal, target, extend = 10, w = 2, include_target = TRUE)
# normalizeToMatrix(signal, target, extend = 10, w = 2, value_column = "score")
#
normalizeToMatrix = function(signal, target, extend = 5000, w = max(extend)/100,
value_column = NULL, mapping_column = NULL, background = ifelse(smooth, NA, 0),
empty_value = NULL, mean_mode = c("absolute", "weighted", "w0", "coverage"),
include_target = any(width(target) > 1),
target_ratio = min(c(0.4, mean(width(target))/(sum(extend) + mean(width(target))))),
k = min(c(20, min(width(target)))), smooth = FALSE, smooth_fun = default_smooth_fun,
keep = c(0, 1), limit = NULL, trim = NULL, flip_upstream = FALSE, verbose = TRUE) {
signal_name = deparse(substitute(signal))
target_name = deparse(substitute(target))
if(length(extend) == 1) extend = c(extend, extend)
## if value is categorical data
if(!is.null(value_column)) {
x = mcols(signal)[, value_column]
if(is.factor(x) || is.character(x)) {
if(flip_upstream) {
stop_wrap("The categorical signal is not allowed to flip the upstream signals.")
}
mat = normalize_with_category_variable(signal, target, x, extend = extend, w = w, mapping_column = mapping_column, background = 0,
empty_value = 0, target_ratio = target_ratio, k = k, smooth = smooth, smooth_fun = smooth_fun, keep = keep, limit = limit, trim = trim)
attributes(mat)$signal_name = signal_name
attributes(mat)$target_name = target_name
return(mat)
}
}
if(abs(target_ratio - 1) < 1e-6 || abs(target_ratio) >= 1) {
if(!all(extend == 0)) warning_wrap("Rest `extend` to 0 when `target_ratio` is larger than or euqal to 1.")
extend = c(0, 0)
} else if(all(extend == 0)) {
warning_wrap("Reset `target_ratio` to 1 when `extend` is 0.")
target_ratio = 1
}
if(abs(target_ratio) > 1) target_ratio = 1
target_is_single_point = all(width(target) <= 1)
if(target_is_single_point) {
if(include_target) {
warning_wrap("Width of `target` are all 1, `include_target` is set to `FALSE`.")
}
include_target = FALSE
}
if(extend[1] > 0) {
if(extend[1] %% w > 0) {
warning_wrap("Length of upstream extension is not completely divisible by `w`.")
extend[1] = extend[1] - extend[1] %% w
}
}
if(extend[2] > 0) {
if(extend[2] %% w > 0) {
warning_wrap("Length of downstream extension is not completely divisible by `w`.")
extend[2] = extend[2] - extend[2] %% w
}
}
if(!missing(empty_value) && missing(background)) {
if(!is.null(empty_value)) {
background = empty_value
}
}
if(!missing(trim) && missing(keep)) {
if(!is.null(trim)) {
if(length(trim) == 1) trim = rep(trim, 2)
keep = c(trim[1], 1 - trim[2])
}
}
.seq = function(start, end, by = 1) {
if(end < start) {
return(integer(0))
} else {
seq(start, end, by = by)
}
}
if(target_is_single_point) {
# do not need to separate upstream and downstream
# and it makes the boundary between upstream and downstream smoothing
suppressWarnings(both <- promoters(target, upstream = extend[1], downstream = extend[2] + 1))
mat_both = makeMatrix(signal, both, w = w, value_column = value_column, mapping_column = mapping_column,
background = background, mean_mode = mean_mode)
i = round(extend[1]/(extend[1] + extend[2]) * ncol(mat_both)) # assume
# if(i < 2 | ncol(mat_both) - i < 2) {
# stop("Maybe `w` is too large or one of `extend` is too small.")
# }
mat_upstream = mat_both[, .seq(1, i), drop = FALSE]
mat_downstream = mat_both[, .seq(i+1, ncol(mat_both)), drop = FALSE]
} else {
# extend and normalize in upstream
if(extend[1] <= 0) {
mat_upstream = matrix(0, nrow = length(target), ncol = 0)
} else {
suppressWarnings(upstream <- promoters(target, upstream = extend[1], downstream = 0))
mat_upstream = makeMatrix(signal, upstream, w = w, value_column = value_column, mapping_column = mapping_column,
background = background, mean_mode = mean_mode)
}
# extend and normalize in downstream
e = ifelse(strand(target) == "-", start(target) - 1, end(target) + 1)
end_target = GRanges(seqnames = seqnames(target),
ranges = IRanges(start = e, end = e),
strand = strand(target))
if(extend[2] <= 0) {
mat_downstream = matrix(0, nrow = length(target), ncol = 0)
} else {
suppressWarnings(downstream <- promoters(end_target, upstream = 0, downstream = extend[2] ))
names(downstream) = names(target)
mat_downstream = makeMatrix(signal, downstream, w = w, value_column = value_column, mapping_column = mapping_column,
background = background, mean_mode = mean_mode)
}
}
if(include_target) {
if(!all(extend == 0)) {
k = round((ncol(mat_upstream) + ncol(mat_downstream)) * target_ratio/(1-target_ratio))
if(k < 1) k = 1
}
mat_target = makeMatrix(signal, target, k = k, value_column = value_column, mapping_column = mapping_column, background = background, mean_mode = mean_mode)
} else {
mat_target = matrix(0, nrow = length(target), ncol = 0)
}
mat = cbind(mat_upstream, mat_target, mat_downstream)
# guess whether the signal is methylation
if(is.null(limit) && smooth) {
xx = mat[!is.na(mat)]
if(all(xx >= 0 & xx <= 1)) {
if(verbose) {
message_wrap("All signal values are within [0, 1], so we assume it is methylation signal. Automatically set limit [0, 1] to the smoothed values. If this is not the case, set argument `limit = NA` in the function to remove the limits. Set `verbose = FALSE` to turn off this message.")
}
limit = c(0, 1)
}
}
if(identical(limit, NA)) {
limit = NULL
}
# apply smoothing on rows in mat
failed_rows = NULL
all_positive = all(mat >= 0, na.rm = TRUE)
if(smooth) {
i_row = 0
ow = options("warn")[[1]]
mat = t(apply(mat, 1, function(x) {
i_row <<- i_row + 1
oe = try(x <- suppressWarnings(smooth_fun(x)), silent = TRUE)
if(inherits(oe, "try-error")) {
failed_rows <<- c(failed_rows, i_row)
}
if(all_positive) x[x < 0] = 0
return(x)
}))
options(warn = ow)
if(!is.null(failed_rows)) {
if(length(failed_rows) == 1) {
msg = paste(strwrap(paste0("Smoothing is failed for one row because there are very few signals overlapped to it. Please use `failed_rows(mat)` to get the index of the failed row and consider to remove it.\n")), collapse = "\n")
} else {
msg = paste(strwrap(paste0("Smoothing are failed for ", length(failed_rows), " rows because there are very few signals overlapped to them. Please use `failed_rows(mat)` to get the index of failed rows and consider to remove them.\n")), collapse = "\n")
}
msg = paste0("\n", msg, "\n")
warning_wrap(msg)
}
}
upstream_index = seq_len(ncol(mat_upstream))
target_index = seq_len(ncol(mat_target)) + ncol(mat_upstream)
downstream_index = seq_len(ncol(mat_downstream)) + ncol(mat_upstream) + ncol(mat_target)
attr(mat, "upstream_index") = upstream_index
attr(mat, "target_index") = target_index
attr(mat, "downstream_index") = downstream_index
attr(mat, "extend") = extend
attr(mat, "smooth") = smooth
attr(mat, "signal_name") = signal_name
attr(mat, "target_name") = target_name
attr(mat, "target_is_single_point") = target_is_single_point
attr(mat, "failed_rows") = failed_rows
attr(mat, "background") = background
attr(mat, "signal_is_categorical") = FALSE
.paste0 = function(a, b) {
if(length(a) == 0 || length(b) == 0) {
return(NULL)
} else {
paste0(a, b)
}
}
# dimension names are mainly for debugging
rownames(mat) = names(target)
if(ncol(mat_target)) {
colnames(mat) = c(.paste0("u", seq_along(upstream_index)), .paste0("t", seq_along(target_index)), .paste0("d", seq_along(downstream_index)))
} else {
colnames(mat) = c(.paste0("u", seq_along(upstream_index)), .paste0("d", seq_along(downstream_index)))
}
q1 = quantile(mat, keep[1], na.rm = TRUE)
q2 = quantile(mat, keep[2], na.rm = TRUE)
mat[mat <= q1] = q1
mat[mat >= q2] = q2
if(!is.null(limit)) {
mat[mat < limit[1]] = limit[1]
mat[mat > limit[2]] = limit[2]
}
class(mat) = c("normalizedMatrix", "matrix")
if(flip_upstream) {
mat = flip_upstream(mat)
}
return(mat)
}
normalize_with_category_variable = function(signal, target, category, ...) {
if(!is.factor(category)) {
category = factor(category)
}
level = levels(category)
n_level = length(level)
mat_list = lapply(as.list(split(signal, category)), function(gr) {
normalizeToMatrix(gr, target, value_column = NULL, mean_mode = "coverage", ...)
})[level]
arr = array(, dim = c(dim(mat_list[[1]]), n_level))
for(i in 1:n_level) {
arr[, , i] = mat_list[[i]]
}
mat = apply(arr, c(1, 2), function(x) {
if(all(x == 0)) {
return(0)
} else {
which.max(x)
}
})
mat = copyAttr(mat_list[[1]], mat)
attributes(mat)$signal_is_categorical = TRUE
attributes(mat)$signal_level = level
return(mat)
}
flip_upstream = function(mat) {
if(attributes(mat)$signal_is_categorical) {
stop_wrap("A normalized matrix with categorical signals is not allowed to flip the upstream signals.")
}
upstream_index = attr(mat, "upstream_index")
target_index = attr(mat, "target_index")
downstream_index = attr(mat, "downstream_index")
extend = attr(mat, "extend")
if(length(target_index) != 0) {
stop_wrap("Upstream can only be flipped when the targets are single points or excluded.")
}
k = min(length(upstream_index), length(downstream_index))
m = mat[, upstream_index[seq(length(upstream_index), length(upstream_index) - k + 1)]] +
mat[, downstream_index[1:k]]
attr(m, "upstream_index") = integer(0)
attr(m, "downstream_index") = 1:k
attr(m, "extend") = c(0, min(extend))
attr(m, "target_is_single_point") = attr(mat, "target_is_single_point")
attr(m, "target_index") = integer(0)
attr(m, "smooth") = attr(mat, "smooth")
attr(m, "signal_name") = attr(mat, "signal_name")
attr(m, "target_name") = attr(mat, "target_name")
attr(m, "failed_rows") = attr(mat, "failed_rows")
attr(m, "background") = attr(mat, "background")
colnames(m) = paste0("d", 1:k)
attr(m, "upstream_flipped") = TRUE
class(m) = c("normalizedMatrix", "matrix")
return(m)
}
#
# -gr input regions
# -target the upstream part or body part
# -window absolute size (100) or relative size(0.1)
# -value_column
# -mean_mode how to calculate mean value in a window
#
# == example
# gr = GRanges(seqnames = "chr1", ranges = IRanges(start = c(1, 4, 7), end = c(2, 5, 8)))
# target = GRanges(seqnames = "chr1", ranges =IRanges(start = 1, end = 10))
# makeMatrix(gr, target, w = 2)
#
makeMatrix = function(gr, target, w = NULL, k = NULL, value_column = NULL, mapping_column = NULL, background = 0,
mean_mode = c("absolute", "weighted", "w0", "coverage"), direction = c("normal", "reverse")) {
if(is.null(value_column)) {
gr$..value = rep(1, length(gr))
value_column = "..value"
}
# split `target` into small windows
target_windows = makeWindows(target, w = w, k = k, direction = direction)
strand(target_windows) = "*"
strand(gr) = "*"
# overlap `gr` to `target_windows`
mtch = findOverlaps(gr, target_windows)
mtch = as.matrix(mtch)
# add a `value` column in `target_window` which is the mean value for intersected gr
# in `target_window`
m_gr = gr[ mtch[, 1] ]
m_target_windows = target_windows[ mtch[, 2] ]
# subset `m_gr` and `m_target_windows` based on `mapping_column`
if(!is.null(mapping_column)) {
mapping = mcols(m_gr)[[mapping_column]]
if(is.numeric(mapping)) {
l = mapping == m_target_windows$.i_query
} else {
if(is.null(names(target))) {
stop("`mapping_column` in `gr` is mapped to the names of `target`, which means `target` should have names.")
} else {
l = mapping == names(target)[m_target_windows$.i_query]
}
}
m_gr = m_gr[l]
m_target_windows = m_target_windows[l]
mtch = mtch[l , , drop = FALSE]
}
# the value associated with `gr`
v = mcols(m_gr)[[value_column]]
mean_mode = match.arg(mean_mode)[1]
if(length(mtch)) {
if(mean_mode == "w0") {
mintersect = pintersect(m_gr, m_target_windows)
w = width(mintersect)
target_windows_list = split(ranges(m_gr), mtch[, 2])
target_windows2 = target_windows[as.numeric(names(target_windows_list))]
cov = coverage(target_windows_list, shift = -start(target_windows2), width = width(target_windows2))
#non_intersect_width = sapply(cov, function(x) sum(x == 0))
non_intersect_width = sapply(cov@listData, function(x) {ind = x@values == 0;sum(x@lengths[ind])})
x = tapply(w*v, mtch[, 2], sum, na.rm = TRUE) / (tapply(w, mtch[, 2], sum, na.rm = TRUE) + non_intersect_width)
} else if(mean_mode == "coverage") {
mintersect = pintersect(m_gr, m_target_windows)
p = width(mintersect)/width(m_target_windows)
x = tapply(p*v, mtch[, 2], sum, na.rm = TRUE)
} else if(mean_mode == "absolute") {
x = tapply(v, mtch[, 2], mean, na.rm = TRUE)
} else {
mintersect = pintersect(m_gr, m_target_windows)
w = width(mintersect)
x = tapply(w*v, mtch[, 2], sum, na.rm = TRUE) / tapply(w, mtch[, 2], sum, na.rm = TRUE)
}
v2 = rep(background, length(target_windows))
v2[ as.numeric(names(x)) ] = x
} else {
v2 = rep(background, length(target_windows))
}
target_windows$..value = v2
# transform into a matrix
tb = table(target_windows$.i_query)
target_strand = strand(target)
column_index = mapply(as.numeric(names(tb)), tb, FUN = function(i, n) {
if(as.vector(target_strand[i] == "-")) {
rev(seq_len(n))
} else {
seq_len(n)
}
}, SIMPLIFY = FALSE)
column_index = do.call("cbind", column_index)
# is column_index has the same length for all regions in target?
# if extension of upstream are the same or split body into k pieces,
# then all column index has the same length
# if it is not the same, throw error!
if(!is.matrix(column_index)) {
stop("numbers of columns are not the same.")
}
mat = matrix(background, nrow = length(target), ncol = dim(column_index)[1])
mat[ target_windows$.i_query + (as.vector(column_index) - 1)* nrow(mat) ] = target_windows$..value
# findOverlaps may use a lot of memory
rm(list = setdiff(ls(), "mat"))
gc(verbose = FALSE)
return(mat)
}
# == title
# Split Regions into Windows
#
# == param
# -query A `GenomicRanges::GRanges-class` object.
# -w Window size. A value larger than 1 means the number of base pairs and a value between 0 and 1
# is the percent to the current region.
# -k Number of partitions for each region. If it is set, all other arguments are ignored.
# -direction Where to start the splitting? See 'Details' section.
# -short.keep If the the region can not be split equally under the window size,
# the argument controls whether to keep the windows that are smaller than the window size. See 'Details' section.
#
# == details
# Following illustrates the meaning of ``direction`` and ``short.keep``:
#
# -->-->-->- one region, split by 3bp window (">" represents the direction of the sequence)
# aaabbbccc direction = "normal", short.keep = FALSE
# aaabbbcccd direction = "normal", short.keep = TRUE
# aaabbbccc direction = "reverse", short.keep = FALSE
# abbbcccddd direction = "reverse", short.keep = TRUE
#
#
# == value
# A `GenomicRanges::GRanges-class` object with two additional columns attached:
#
# - ``.i_query`` which contains the correspondance between small windows and original regions in ``query``
# - ``.i_window`` which contains the index of the small window on the current region.
#
# == author
# Zuguang gu <z.gu@dkfz.de>
#
# == example
# query = GRanges(seqnames = "chr1", ranges = IRanges(start = c(1, 11, 21), end = c(10, 20, 30)))
# makeWindows(query, w = 2)
# makeWindows(query, w = 0.5)
# makeWindows(query, w = 3)
# makeWindows(query, w = 3, direction = "reverse")
# makeWindows(query, w = 3, short.keep = TRUE)
# makeWindows(query, w = 3, direction = "reverse", short.keep = TRUE)
# makeWindows(query, w = 12)
# makeWindows(query, w = 12, short.keep = TRUE)
# makeWindows(query, k = 2)
# makeWindows(query, k = 3)
# query = GRanges(seqnames = "chr1", ranges = IRanges(start = c(1, 11, 31), end = c(10, 30, 70)))
# makeWindows(query, w = 2)
# makeWindows(query, w = 0.2)
#
makeWindows = function(query, w = NULL, k = NULL, direction = c("normal", "reverse"),
short.keep = FALSE) {
direction = match.arg(direction)[1]
if(is.null(w) & is.null(k)) {
stop("You should define either `w` or `k`.")
}
ostart = start(query)
oend = end(query)
if(!is.null(k)) {
pos = mapply(ostart, oend, FUN = function(s, e) {
x = seq(s, e, length = k+1)
y = round(x[-1])
x = round(x[-length(x)])
y[-k] = ifelse(y[-k] > x[-k], y[-k] - 1, y[-k])
return(list(start = x, end = y))
})
} else {
if(direction == "normal") {
if(w >= 1) {
w = as.integer(w)
pos = mapply(ostart, oend, FUN = function(s, e) {
x = seq(s, e, by = w)
y = x + w - 1
y = ifelse(y > e, e, y)
if(!short.keep) {
l = (y-x+1) == w
x = x[l]
y = y[l]
}
return(list(start = x, end = y))
})
} else if(w > 0 & w < 1) {
pos = mapply(ostart, oend, FUN = function(s, e) {
w = as.integer(round(e - s + 1)*w)
x = seq(s, e, by = w)
y = x + w - 1
y = ifelse(y > e, e, y)
if(!short.keep) {
l = (y-x+1) == w
x = x[l]
y = y[l]
}
return(list(start = x, end = y))
})
} else {
stop("`w` is wrong.")
}
} else {
if(w >= 1) {
w = as.integer(w)
pos = mapply(ostart, oend, FUN = function(s, e) {
y = seq(e, s, by = -w)
x = y - w + 1
x = ifelse(x < s, s, x)
if(!short.keep) {
l = (y-x+1) == w
x = x[l]
y = y[l]
}
return(list(start = rev(x), end = rev(y)))
})
} else if(w > 0 & w < 1) {
pos = mapply(ostart, oend, FUN = function(s, e) {
w = as.integer(round(e - s + 1)*w)
y = seq(e, s, by = -w)
x = y - w + 1
x = ifelse(x < s, s, x)
if(!short.keep) {
l = (y-x+1) == w
x = x[l]
y = y[l]
}
return(list(start = rev(x), end = rev(y)))
})
} else {
stop("`w` is wrong.")
}
}
}
# check start and end
start = unlist(pos[1, ])
end = unlist(pos[2, ])
i_query = rep(seq_len(ncol(pos)), times = sapply(pos[1, ], length))
i_window = unlist(lapply(pos[1, ], seq_along)) # which window from left to right
chr = seqnames(query)[i_query]
strand = strand(query)[i_query]
gr = GRanges(seqnames = chr,
ranges = IRanges(start = start,
end = end),
strand = strand,
.i_query = i_query,
.i_window = i_window)
return(gr)
}
# == title
# Subset normalized matrix by rows
#
# == param
# -x the normalized matrix returned by `normalizeToMatrix`
# -i row index
# -j column index
# -drop whether drop the dimension
#
# == value
# A ``normalizedMatrix`` class object.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
"[.normalizedMatrix" = function(x, i, j, drop = FALSE) {
attr = attributes(x)
attributes(x) = NULL
for(bb in intersect(names(attr), c("dim", "dimnames"))) {
attr(x, bb) = attr[[bb]]
}
if(!missing(i) && !missing(j)) {
return(x[i, j, drop = FALSE])
}
if(nargs() == 2) {
return(x[i])
}
if(nargs() == 3 && missing(i)) {
return(x[, j])
}
if(missing(i)) {
return(x[i, j, drop = drop])
}
x = x[i, , drop = FALSE]
for(bb in setdiff(names(attr), c("dim", "dimnames"))) {
attr(x, bb) = attr[[bb]]
}
return(x)
}
# == title
# Bind Matrix by Rows
#
# == param
# -... Matrices
# -deparse.level Not used.
#
# == value
# A ``normalizedMatrix`` class object.
#
# == author
# z.gu@dkfz.de
#
rbind.normalizedMatrix = function(..., deparse.level = 1) {
mat_list = list(...)
mat_list = mat_list[sapply(mat_list, function(x) !is.null(x))]
mat_list2 = lapply(mat_list, function(x) {
attributes(x) = attributes(x)["dim"]
x
})
rbind_matrix = selectMethod("rbind", signature = "matrix")
mat = do.call("rbind_matrix", mat_list2)
mat = copyAttr(mat_list[[1]], mat)
return(mat)
}
# == title
# Print the Normalized Matrix
#
# == param
# -x The normalized matrix returned by `normalizeToMatrix`.
# -... Other arguments.
#
# == value
# No value is returned.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
print.normalizedMatrix = function(x, ...) {
upstream_index = attr(x, "upstream_index")
target_index = attr(x, "target_index")
downstream_index = attr(x, "downstream_index")
extend = attr(x, "extend")
smooth = attr(x, "smooth")
signal_name = attr(x, "signal_name")
target_name = attr(x, "target_name")
target_is_single_point = attr(x, "target_is_single_point")
if(is.null(target_is_single_point)) {
target_is_single_point = FALSE
}
upstream_flipped = attr(x, "upstream_flipped")
if(is.null(upstream_flipped)) upstream_flipped = FALSE
op = qq.options(READ.ONLY = FALSE)
on.exit(qq.options(op))
qq.options(code.pattern = "@\\{CODE\\}")
qqcat("Normalize @{signal_name} to @{target_name}:\n")
if(upstream_flipped) {
qqcat(" Extension @{extend[2]} bp (@{length(downstream_index)} window@{ifelse(length(upstream_index) > 1, 's', '')})\n")
qqcat(" upstream is flipped to downstream.\n")
} else {
qqcat(" Upstream @{extend[1]} bp (@{length(upstream_index)} window@{ifelse(length(upstream_index) > 1, 's', '')})\n")
qqcat(" Downstream @{extend[2]} bp (@{length(downstream_index)} window@{ifelse(length(upstream_index) > 1, 's', '')})\n")
}
if(target_is_single_point) {
qqcat(" Include target regions (width = 1)\n")
} else {
if(length(target_index) == 0) {
qqcat(" Not include target regions\n")
} else {
qqcat(" Include target regions (@{length(target_index)} window@{ifelse(length(target_index) > 1, 's', '')})\n")
}
}
qqcat(" @{nrow(x)} target region@{ifelse(nrow(x) > 1, 's', '')}\n")
signal_is_categorical = attr(x, "signal_is_categorical")
if(is.null(signal_is_categorical)) signal_is_categorical = FALSE
if(signal_is_categorical) {
qqcat(" signal is categorical (@{length(attr(x, 'signal_level'))} levels)\n")
}
}
# == title
# Copy Attributes to Another Object
#
# == param
# -x Object 1.
# -y Object 2.
#
# == details
# The `normalizeToMatrix` object is actually a matrix but with more additional attributes attached.
# When manipulating such matrix, there are some circumstances that the attributes are lost.
# This function is used to copy these specific attributes when dealing with the matrix.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# gr = GRanges(seqnames = c("chr5", "chr5"),
# ranges = IRanges(start = c(98, 98),
# end = c(104, 104)))
# target = GRanges(seqnames = "chr5",
# ranges = IRanges(start = 100,
# end = 100))
# mat1 = normalizeToMatrix(gr, target, extend = 6, w = 1)
# # attributes removed and you cannot use it for EnrichedHeatmap()
# mat2 = mat1[]
# # copy attributes to mat2 and now mat3 can be used for EnrichedHeatmap()
# mat3 = copyAttr(mat1, mat2)
copyAttr = function(x, y) {
if(!identical(ncol(x), ncol(y))) {
stop("x and y should have same number of columns.\n")
}
attr = attributes(x)
for(bb in setdiff(names(attr), c("dim"))) {
if(bb == "dimnames") {
attr(y, bb)[[2]] = attr[[bb]][[2]] # set same column names
} else {
attr(y, bb) = attr[[bb]]
}
}
attr(y, "signal_name") = "\b"
return(y)
}
# == title
# Get Signals from a List
#
# == param
# -lt A list of normalized matrices which are returned by `normalizeToMatrix`. Matrices in the list should be generated with same settings (e.g. they
# should use same target regions, same extension to targets and same number of windows).
# -fun A user-defined function to summarize signals.
#
# == details
# Let's assume you have a list of histone modification signals for different samples and you want
# to visualize the mean pattern across samples. You can first normalize histone mark signals for each sample and then
# calculate means values across all samples. In following example code, ``hm_gr_list`` is a list of ``GRanges`` objects
# which contain positions of histone modifications, ``tss`` is a ``GRanges`` object containing positions of gene TSS.
#
# mat_list = NULL
# for(i in seq_along(hm_gr_list)) {
# mat_list[[i]] = normalizeToMatrix(hm_gr_list[[i]], tss, value_column = "density")
# }
#
# If we compress the list of matrices as a three-dimension array where the first dimension corresponds to genes,
# the second dimension corresponds to windows and the third dimension corresponds to samples, the mean signal
# across all sample can be calculated on the third dimension. Here `getSignalsFromList` simplifies this job.
#
# Applying ``getSignalsFromList()`` to ``mat_list``, it gives a new normalized matrix which contains mean signals across all samples and can
# be directly used in ``EnrichedHeatmap()``.
#
# mat_mean = getSignalsFromList(mat_list)
# EnrichedHeatmap(mat_mean)
#
# The correlation between histone modification and gene expression can
# also be calculated on the third dimension of the array. In the user-defined function ``fun``, ``x`` is the vector for gene i
# and window j in the array, and ``i`` is the index of current gene.
#
# mat_corr = getSignalsFromList(mat_list,
# fun = function(x, i) cor(x, expr[i, ], method = "spearman"))
#
# Then ``mat_corr`` here can be used to visualize how gene expression is correlated to histone modification around TSS.
#
# EnrichedHeatmap(mat_corr)
#
#
# == value
# A `normalizeToMatrix` object which can be directly used for `EnrichedHeatmap`.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# NULL
getSignalsFromList = function(lt, fun = function(x) mean(x, na.rm = TRUE)) {
if(!inherits(lt, "list")) {
stop_wrap("`lt` should be a list of objects which are returned by `normalizeToMatrix()`.")
}
if(!all(sapply(lt, inherits, "normalizedMatrix"))) {
stop_wrap("`lt` should be a list of objects which are returned by `normalizeToMatrix()`.")
}
n = length(lt)
if(n > 1) {
for(i in seq_len(n-1)) {
attr1 = attributes(lt[[i]])[ c("upstream_index", "target_index", "downstream_index", "extend") ]
attr2 = attributes(lt[[i+1]])[ c("upstream_index", "target_index", "downstream_index", "extend") ]
if(!identical(attr1, attr2)) {
stop_wrap("Objects in `lt` should have same settings.")
}
}
}
flag = 0
for(i in seq_len(n)) {
tm = lt[[i]]
if(!flag) {
arr = array(dim = c(dim(tm), length(lt)))
flag = 1
}
arr[, , i] = tm
}
if(identical(fun, mean)) {
fun = function(x) mean(x, na.rm = TRUE)
} else if(identical(fun, median)) {
fun = function(x) median(x, na.rm = TRUE)
} else if(identical(fun, max)) {
fun = function(x) max(x, na.rm = TRUE)
} else if(identical(fun, min)) {
fun = function(x) min(x, na.rm = TRUE)
}
if(length(as.list(fun)) == 2) {
m = apply(arr[, , ,drop = FALSE], c(1, 2), fun)
} else if(length(as.list(fun)) == 3) {
m = matrix(nrow = nrow(lt[[1]]), ncol = ncol(lt[[1]]))
for(i in seq_len(nrow(m))) {
for(j in seq_len(ncol(m))) {
m[i, j] = fun(arr[i, j, ], i)
}
}
} else {
stop_wrap("`fun` can only have one or two arguments.")
}
m = copyAttr(lt[[1]], m)
return(m)
}
# == title
# Default Smoothing function
#
# == param
# -x Input numeric vector.
#
# == details
# The smoothing function is applied to every row in the normalized matrix. For this default smoothing function,
# `locfit::locfit` is first tried on the vector. If there is error, `stats::loess` smoothing is tried afterwards.
# If both smoothing are failed, there will be an error.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
default_smooth_fun = function(x) {
l = !is.na(x)
if(sum(l) >= 2) {
oe1 = try(x <- suppressWarnings(predict(locfit(x[l] ~ lp(seq_along(x)[l], nn = 0.1, h = 0.8)), seq_along(x))), silent = TRUE)
if(inherits(oe1, "try-error")) {
oe2 = try(x <- suppressWarnings(predict(loess(x[l] ~ seq_along(x)[l], control = loess.control(surface = "direct")), seq_along(x))))
if(inherits(oe2, "try-error")) {
stop_wrap("error when doing locfit or loess smoothing")
} else {
return(x)
}
} else {
return(x)
}
} else {
stop_wrap("Too few data points.")
}
return(x)
}
# == title
# Discretize a Continuous Matrix to a Discrete Matrix
#
# == param
# -mat A normalize matrix from `normalizeToMatrix`.
# -rule A list of intervals which provide mapping between continuous values to discrete values.
# Note the order of intervals determines the order of corresponding discrete levels.
# -right_closed Is the interval right closed?
#
# == details
# Assuming we have a normalized matrix with both positive values and negative values, we only
# want to see the enrichment of the windows/regions showing significant positive values and
# negative values and we are only interested in the direction of the values while not the value itself,
# then we can define the rule as:
#
# rule = list(
# "positive" = c(0.5, Inf),
# "negative" = c(-Inf, -0.5)
# )
#
# And we can convert the continuous matrix to a discrete matrix and visualize it:
#
# mat2 = discretize(mat, rule)
# EnrichedHeatmap(mat2, col = c("positive" = "red", "negative" = "green"))
#
# Another example is to discretize the signals to discrete levels according to the intensities:
#
# rule = list(
# "very_high" = c(100, Inf),
# "high" = c(50, 100),
# "intermediate" = c(25, 50),
# "low" = c(1e-6, 25)
# )
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
discretize = function(mat, rule, right_closed = FALSE) {
if(!inherits(mat, "normalizedMatrix")) {
stop_wrap("`mat` should be generated by `normalizeToMatrix().")
}
if(!inherits(rule, "list")) {
stop_wrap("`rule` should be defined as a list of intervals.")
}
if(!all(sapply(rule, length) == 2)) {
stop_wrap("All intervals in `rule` should have length of 2.")
}
mat2 = matrix(0, nrow = nrow(mat), ncol = ncol(mat))
for(i in seq_along(rule)) {
interval = rule[[i]]
if(right_closed) {
l = mat > interval[1] & mat <= interval[2]
} else {
l = mat >= interval[1] & mat < interval[2]
}
mat2[l] = i
}
mat2 = copyAttr(mat, mat2)
attributes(mat2)$signal_is_categorical = TRUE
attributes(mat2)$signal_level = names(rule)
return(mat2)
}
# == title
# Convert a Normal Matrix to a normalizedMatrix Object
#
# == param
# -mat A matrix generated by other software.
# -k_upstream Number of windows in the upstream.
# -k_downstream Number of windows in the downstream.
# -k_target Number of windows in the target.
# -extend Extension to the target. The length should be 1 (if one of ``k_upstream`` or ``k_downstream`` is zero).
# or 2 (if both of ``k_upstream`` and ``k_downstream`` are non-zero).
# -signal_name The name of signal regions. It is only used for printing the object.
# -target_name The name of the target names. It is only used for printing the object.
# -background The background value in the matrix.
# -smooth Whether apply smoothing on rows in the matrix.
# -smooth_fun The smoothing function that is applied to each row in the matrix. This self-defined function accepts a numeric
# vector (may contain ``NA`` values) and returns a vector with same length. If the smoothing is failed, the function
# should call `base::stop` to throw errors so that `normalizeToMatrix` can catch how many rows are failed in smoothing.
# See the default `default_smooth_fun` for example.
# -keep Percentiles in the normalized matrix to keep. The value is a vector of two percent values. Values less than the first
# percentile is replaces with the first pencentile and values larger than the second percentile is replaced with the
# second percentile.
# -trim Deprecated, please use ``keep`` instead.
#
# == details
# If users use the matrix from other software, they can use this function to convert it to the ``normalizedMatrix`` object
# and visualize it afterwards.
#
# == value
# A ``normalizedMatrix`` object.
#
# == author
# z.gu@dkfz.de
#
as.normalizedMatrix = function(mat, k_upstream = 0, k_downstream = 0, k_target = 0,
extend, signal_name = "signals", target_name = "targets",
background = NA, smooth = FALSE, smooth_fun = default_smooth_fun,
keep = c(0, 1), trim = NULL) {
if(k_upstream + k_target + k_downstream != ncol(mat)) {
stop_wrap("sum of `k_upstream`, `k_target` and `k_downstream` should be equal to the col of `mat`.")
}
# apply smoothing on rows in mat
failed_rows = NULL
if(smooth) {
mat[mat == background] = NA
i_row = 0
ow = options("warn")[[1]]
mat = t(apply(mat, 1, function(x) {
i_row <<- i_row + 1
oe = try(x <- suppressWarnings(smooth_fun(x)), silent = TRUE)
if(inherits(oe, "try-error")) {
failed_rows <<- c(failed_rows, i_row)
}
return(x)
}))
options(warn = ow)
if(!is.null(failed_rows)) {
if(length(failed_rows) == 1) {
msg = paste(strwrap(paste0("Smoothing is failed for one row because there are very few signals overlapped to it. Please use `attr(mat, 'failed_rows')` to get the index of the failed row and consider to remove it.\n")), collapse = "\n")
} else {
msg = paste(strwrap(paste0("Smoothing are failed for ", length(failed_rows), " rows because there are very few signals overlapped to them. Please use `attr(mat, 'failed_rows')` to get the index of failed rows and consider to remove them.\n")), collapse = "\n")
}
msg = paste0("\n", msg, "\n")
warning_wrap(msg)
}
background = NA
}
if(k_upstream > 0) {
upstream_index = seq_len(k_upstream)
} else {
upstream_index = integer(0)
}
if(k_target > 0) {
target_index = seq(k_upstream + 1, k_upstream + k_target)
} else {
target_index = integer(0)
}
if(k_downstream > 0) {
downstream_index = seq(k_upstream + k_target + 1, k_upstream + k_target + k_downstream)
} else {
downstream_index = integer(0)
}
if(length(extend) == 1) {
if(length(upstream_index) == 0 && length(downstream_index) == 0) {
extend = c(0, 0)
} else if(length(upstream_index) == 0) {
extend = c(0, extend)
} else if(length(downstream_index) == 0) {
extend = c(extend, 0)
} else {
extend = rep(extend, 2)
}
} else if(length(extend) > 2) {
stop_wrap("length of `extend` should only be 1 or 2.")
}
attr(mat, "upstream_index") = upstream_index
attr(mat, "target_index") = target_index
attr(mat, "downstream_index") = downstream_index
attr(mat, "extend") = extend
attr(mat, "smooth") = smooth
attr(mat, "signal_name") = signal_name
attr(mat, "target_name") = target_name
attr(mat, "target_is_single_point") = FALSE
attr(mat, "failed_rows") = failed_rows
attr(mat, "background") = background
attr(mat, "signal_is_categorical") = FALSE
.paste0 = function(a, b) {
if(length(a) == 0 || length(b) == 0) {
return(NULL)
} else {
paste0(a, b)
}
}
# dimension names are mainly for debugging
if(!attr(mat, "target_is_single_point")) {
colnames(mat) = c(.paste0("u", seq_along(upstream_index)), .paste0("t", seq_along(target_index)), .paste0("d", seq_along(downstream_index)))
} else {
colnames(mat) = c(.paste0("u", seq_along(upstream_index)), .paste0("d", seq_along(downstream_index)))
}
q1 = quantile(mat, keep[1], na.rm = TRUE)
q2 = quantile(mat, keep[2], na.rm = TRUE)
mat[mat <= q1] = q1
mat[mat >= q2] = q2
class(mat) = c("normalizedMatrix", "matrix")
return(mat)
}
# == title
# Indices of Rows Failed from Smoothing
#
# == param
# -m Matrix from `normalizeToMatrix`.
#
# == value
# A numeric vector or ``NULL``.
#
failed_rows = function(m) {
attr(m, "failed_rows")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.