# Function -------------------------------------------------------------------
#' @title Collapse/bind several `hyperSpec` objects into one object
#' @description
#' The spectra from all objects will be put into one object.
#' The resulting object has all wavelengths that occur in any of the input
#' objects, `wl.tolerance` is used to determine which difference in the
#' wavelengths is tolerated as equal: clusters of approximately equal
#' wavelengths will span at most `2 * wl.tolerance`.
#' Differences up to +/- `wl.tolerance` are considered equal.
#'
#' The returned object has wavelengths that are the weighted average
#' (by number of spectra) of the wavelengths within any such cluster of
#' approximately equal wavelengths.
#'
#' Labels will be taken from the first object where they are encountered.
#' However, the order of processing objects is not necessarily the same as
#' the order of objects in the input: `collapse` first processes groups of
#' input objects that share all wavelengths (within `wl.tolerance`).
#'
#' Data points corresponding to wavelengths not in the original spectrum will
#' be set to NA.
#' Extra data is combined in the same manner.
#'
#' If the objects are named, the names will be preserved in extra data column
#' `$.name`.
#' If the wavelengths are names, names are preserved and taken from the first
#' object where they were encountered, the same applies to possible column
#' names of the spectra matrix.
#'
#'
#' @aliases collapse
#' collapse.hyperSpec
#'
#'
#' @param ... `hyperSpec` objects to be collapsed into one object.
#' Instead of giving several arguments, a list with all objects to be
#' collapsed may be given.
#'
#' @param wl.tolerance Tolerance to decide which wavelengths are considered
#' equal.
#'
#' @param collapse.equal Logical indicating whether to try first finding
#' groups of spectra with (approximately) equal wavelength axes. If the
#' data is known to contain few or no such groups, `collapse()` will be
#' faster with this first pass being turned off.
#'
#'
#' @return A `hyperSpec` object
#'
#'
#' @seealso [merge()], [rbind()], and [plyr::rbind.fill()]
#'
#' @author C. Beleites
#'
#' @importFrom dplyr group_by
#' @importFrom dplyr summarise
#'
#'
#' @keywords manip
#' @concept manipulation
#'
#' @export
#'
#' @examples
#' barbiturates[1:3]
#' collapse(barbiturates[1:3])
#'
#' a <- barbiturates[[1]]
#' b <- barbiturates[[2]]
#' c <- barbiturates[[3]]
#'
#' a
#'
#' b
#'
#' c
#'
#' collapse(a, b, c)
#'
#' collapse(barbiturates[1:3], collapse.equal = FALSE)
collapse <- function(...,
wl.tolerance = hy_get_option("wl.tolerance"),
collapse.equal = TRUE) {
wl.tolerance <- .checkpos(wl.tolerance, "wl.tolerance")
dots <- list(...)
## accept also a list of hyperSpec objects
if (length(dots) == 1 && is.list(dots[[1]])) {
dots <- dots[[1]]
}
if (length(dots) == 0) {
stop("No hyperSpec objects were found in the input of collapse().")
} else if (any(sapply(dots, function(x) !inherits(x, "hyperSpec")))) {
stop(
"Not all inputs of collapse() are hyperSpec objects: ",
"either all inputs must be `hyperSpec` objects ",
"or there must be a list with hyperSpec objects only.")
}
## check the arguments
lapply(dots, assert_hyperSpec)
lapply(dots, validObject)
dots <- lapply(dots, wl_sort)
## check: wl.tolerance should be smaller than *half* of the smallest
## wavelength difference within each object
## half because we later check for distance <= wl.tolerance,
## so ± => window size is 2 wl.tolerance
.assert_suitable_tolerance(dots, wl.tolerance)
## make sure there aren't any NAs in wavelength
dots <- .assert_no_na_wl(dots)
## names cause problems with unlisting labels.
## preserve them in column .name
if (!is.null(names(dots))) {
dots <- mapply(function(object, name) {
object$.name <- name
object
}, dots, names(dots))
names(dots) <- NULL
}
## shall we do a first round collapsing objects that have their
## whole wavelength axes approximately equal?
if (collapse.equal) {
dots <- .collapse_equal(dots, wl.tolerance)
if (length(dots) == 1L) {
return(dots[[1]])
}
}
## Now cluster approximately equal wavelengths together
## prepare new labels
labels <- unlist(lapply(dots, slot, "label"))
labels <- labels[unique(names(labels))]
labels <- lapply(labels, function(l) {
if (is.language(l)) l <- as.expression(l) else l
})
## cluster wavelengths into groups of ± wl.tolerance from center
wl.df <- .cluster_wavelengths(dots, wl.tolerance)
## assign cluster number to columns
# wl.df is ordered by wavelength, each object in dots is ordered
# by wavelength, so
for (i in seq_along(dots)) {
colnames(dots[[i]]@data$spc) <- wl.df$wlcluster[wl.df$iobj == i]
}
## now we're ready for the actual work of collapsing the objects
dots <- rbind.fill(lapply(dots, slot, "data"))
## careful with constructing the wavelength vector:
## the columns in $spc are in no particular order,
## but the colnames indicate wavelength rank.
## so reorder $spc accor
dots$spc <- dots$spc[, order(as.numeric(colnames(dots$spc)))]
## we now need summarized wl.df data:
wl.df <- group_by(wl.df, .data$wlcluster)
wl.df <- summarise(wl.df,
wl = sum(.data$wl * .data$nspc) / sum(.data$nspc), # weighted average
old.wlnames = .data$old.wlnames[1L]
)
## prepare wavelength vector & restore old names (as far as possible)
wl <- wl.df$wl
if (any(!is.na(wl.df$old.wlnames))) {
names(wl) <- wl.df$old.wlnames
}
## make a new hyperSpec object
new("hyperSpec", wavelength = wl, data = dots, labels = labels)
}
# Unit tests -----------------------------------------------------------------
hySpc.testthat::test(collapse) <- function() {
context("collapse")
test_that("Empty collapse() returns error", {
# Empty ...
expect_error(collapse(), "No hyperSpec objects were found")
# Empty list
expect_error(collapse(list()), "No hyperSpec objects were found")
})
test_that("wrong collapse() inputs return error", {
# ... contains non-hyperSpecs
expect_error(
collapse(flu, flu, data.frame()),
"Not all inputs of collapse\\(\\) are hyperSpec objects"
)
# input list contains non-hyperSpecs
expect_error(
collapse(list(flu, flu, data.frame())),
"Not all inputs of collapse\\(\\) are hyperSpec objects"
)
})
test_that("basic collapse() functionality is ok", {
expect_silent(collapse(flu, flu))
expect_equal(collapse(flu), flu)
expect_equal(collapse(list(flu)), flu)
expect_equal(collapse(flu, flu), collapse(list(flu, flu)))
})
test_that("correctly assembled", {
new <- do.call(collapse, barbiturates[1:3])
wl <- unlist(lapply(barbiturates[1:3], slot, "wavelength"))
expect_equal(
wl(new),
sort(wl[!duplicated(wl)])
)
for (s in 1:3) {
expect_equal(as.numeric(new[[s, , wl(barbiturates[[s]])]]),
as.numeric(barbiturates[[s]][[]]),
label = paste0("barbiturates[[", s, "]]")
)
}
})
tmp <- collapse(a = flu, b = flu)
test_that("wavelength label is not lost", {
tmp <- collapse(flu, flu)
expect_equal(labels(tmp, ".wavelength"), labels(flu, ".wavelength"))
tmp <- collapse(flu[, , min ~ 410], flu[, , 414 ~ 420])
expect_equal(labels(tmp, ".wavelength"), labels(flu, ".wavelength"))
})
test_that("labels that are expressions stay expressions", {
tmp <- collapse(flu, flu)
expect_true(is.expression(labels(tmp)$.wavelength))
expect_true(is.expression(labels(tmp)$spc))
tmp <- collapse(flu[, , min ~ 405], flu[, , 414 ~ 420])
expect_true(is.expression(labels(tmp)$.wavelength))
expect_true(is.expression(labels(tmp)$spc))
})
test_that("collapse does not mess up labels if a named list is collapsed", {
expect_equal(
labels(tmp)[names(labels(flu))],
labels(flu)
)
})
test_that("named lists should return .name column", {
expect_equal(tmp$.name, rep(c("a", "b"), each = nrow(flu)))
})
test_that("no difference whether list or single arguments are given", {
tmp2 <- list(a = flu, b = flu)
tmp2 <- collapse(a = flu, b = flu)
expect_equal(tmp, tmp2,
check.attributes = TRUE,
check.names = TRUE,
check.column.order = FALSE,
check.label = TRUE
)
})
test_that("wl.tolerance", {
tmp <- flu
wl(tmp) <- wl(tmp) + 0.01
expect_equal(nwl(collapse(tmp, flu)), 2 * nwl(flu))
expect_equal(nwl(collapse(tmp, flu, wl.tolerance = 0.1)), nwl(flu))
})
test_that("check warning occurs for too large tolerance", {
expect_warning(collapse(flu, wl.tolerance = 0.25 + .Machine$double.eps))
})
test_that("bugfix: wl.tolerance generated warning for negative diff (wl (spc))", {
tmp <- flu
wl(tmp) <- rev(wl(tmp))
expect_silent(collapse(tmp, tmp))
})
test_that("result has orded wavelengths", {
tmp <- collapse(barbiturates[1:3])
expect_true(all(diff(wl(tmp)) >= 0))
})
test_that("collapsing objects with equal wavelength axes", {
expect_equivalent(collapse(barbiturates[[1]], barbiturates[[1]]),
barbiturates[[1]][c(1, 1)],
check.label = TRUE
)
})
test_that("new wavelengths are weighted mean of wavelength bin: shortcut for equal wavelength axes", {
tmp <- flu
wl(tmp) <- wl(flu) + 0.03
tmp <- collapse(flu, flu, tmp, wl.tolerance = 0.05)
expect_equal(wl(tmp), wl(flu) + 0.01)
tmp <- flu
wl(tmp) <- wl(flu) + 0.03
tmp <- collapse(flu[rep(1, 12)], tmp, wl.tolerance = 0.05)
expect_equal(wl(tmp), wl(flu) + 0.01)
})
test_that("names of wavelength kept", {
a <- barbiturates[[1]]
b <- barbiturates[[2]]
names(wl(a)) <- paste("Mass A", seq_len(nwl(a)))
names(wl(b)) <- paste("Mass B", seq_len(nwl(b)))
tmp <- collapse(a, b)
expect_true(all(names(wl(a)) %in% names(wl(tmp))))
expect_true(
all(
grep("B", names(wl(tmp)), value = TRUE) %in% names(wl(b))
)
)
expect_true(all(grepl("Mass [AB]", names(wl(tmp)))))
})
test_that("factor behaviour of collapse", {
a <- faux_cell[faux_cell$region == "nucleus"]
a$region <- droplevels(a$region)
b <- faux_cell[faux_cell$region != "nucleus"]
b$region <- droplevels(b$region)
tmp <- collapse(a, b)
tmp$region
expect_equal(
sort(levels(tmp$region)),
sort(levels(faux_cell$region))
)
})
test_that("hyperSpec objects with 1 wavelength", {
expect_equivalent(collapse(flu[, , 450], flu[, , 450]),
flu[rep(1:nrow(flu), 2), , 450],
check.labels = TRUE
)
tmp <- flu[rep(1:nrow(flu), 2)]
tmp[[7:12]] <- NA
tmp[[7:12, , 450]] <- flu[[, , 450]]
expect_equivalent(collapse(flu[, , 450], flu),
tmp,
check.labels = TRUE
)
})
test_that("hyperSpec objects with 0 wavelengths", {
expect_equivalent(collapse(flu[, , FALSE], flu[, , FALSE]),
flu[rep(1:nrow(flu), 2), , FALSE],
check.labels = TRUE
)
tmp <- collapse(flu[, , FALSE], flu[, "spc", 405 ~ 406])
expect_equal(tmp$c, c(flu$c, rep(NA, nrow(flu))))
expect_equal(tmp$spc, rbind(flu[[, , 405 ~ 406]] + NA, flu[[, , 405 ~ 406]]))
expect_equal(labels(tmp), lapply(labels(flu), as.expression))
})
test_that("hyperSpec objects with wavelength being/containing NA", {
expect_warning(collapse(flu[, , 0]))
expect_equal(
suppressWarnings(collapse(flu[, , 0], flu)),
collapse(flu[, , FALSE], flu)
)
expect_equal(
suppressWarnings(collapse(flu[, , c(0, 405)], flu)),
collapse(flu[, , 405], flu)
)
})
}
# Helper function -------------------------------------------------------------
## warn if wl.tolerance is too large, i.e. it would lead to cluster
## multiple wavelengths of the same object together.
.assert_suitable_tolerance <- function(dots, wl.tolerance) {
# wavelengths are ordered => no abs needed
wl.diff <- sapply(dots, function(x) if (nwl(x) < 2L) NA else min(diff(wl(x))))
i.warn <- wl.diff < 2 * wl.tolerance
if (any(isTRUE(i.warn))) {
warning(sprintf(
paste0("object %i: wl.tolerance (%g) too large compared to smallest ",
"wavelength difference within object (%f). Columns will be lost."),
which(i.warn), wl.tolerance, wl.diff[i.warn]
))
}
}
.assert_no_na_wl <- function(dots) {
i.NA <- sapply(dots, function(x) any(is.na(wl(x))))
if (any(i.NA)) {
warning(sprintf(
"object %i: wavelength vector contains NAs: these columns will be dropped",
which(i.NA)
))
dots[i.NA] <- lapply(dots[i.NA], function(x) x[, , !is.na(wl(x))])
}
dots
}
#' Try finding groups of hyperSpec objects with (approximately) equal wavelength axes
#'
#' ... and directly [rbind.fill()] them.
#'
#' @param dots List with hyperSpec object to collapse
#' @param wl.tolerance Wavelength difference tolerance
#'
#' @return Possible shorter list of dots
#' @noRd
.collapse_equal <- function(dots, wl.tolerance) {
## bind groups of objects that have *all* wavelengths equal
## within wl.tolerance from 1st object of potential group
i <- 1
while (i < length(dots)) {
bind_directly <- sapply(tail(dots, -i), function(x) {
(nwl(x) == nwl(dots[[i]])) && all(abs(wl(x) - wl(dots[[i]])) < wl.tolerance)
})
bind_directly <- which(bind_directly)
if (length(bind_directly) > 0L) {
n <- 0
wl <- rep(0, nwl(dots[[i]]))
for (j in c(i, i + bind_directly)) {
wl <- wl + nrow(dots[[j]]) * wl(dots[[j]])
n <- n + nrow(dots[[j]])
# also ensure same column names within spc.
colnames(dots[[j]]@data$spc) <- colnames(dots[[i]]@data$spc)
}
wl <- wl / n
dots[[i]]@data <- rbind.fill(
lapply(dots[c(i, i + bind_directly)], slot, "data"))
.wl(dots[[i]]) <- structure(wl, names = names(wl(dots[[i]])))
labels <- unlist(lapply(dots[c(i, i + bind_directly)], labels))
labels <- lapply(labels, function(l)
if (is.language(l)) l <- as.expression(l) else l)
labels(dots[[i]]) <- labels[!duplicated(names(labels))]
dots <- dots[-(i + bind_directly)]
}
i <- i + 1
}
dots
}
#' Find clusters of approximately equal wavelengths
#'
#' @param dots list of hyperSpec objects to collapse
#' @param wl.tolerance wavelength difference tolerance
#'
#' @concept wavelengths
#'
#' @return `data.frame` with information about suitable wavelength bins
#' @noRd
.cluster_wavelengths <- function(dots, wl.tolerance) {
# set up data.frame to hold relevant information
wl.df <- lapply(seq_along(dots), function(i) {
if (nwl(dots[[i]]) == 0L) {
return(data.frame())
}
data.frame(
wl = wl(dots[[i]]),
iobj = i,
nspc = nrow(dots[[i]]),
wlcol = seq_len(nwl(dots[[i]])),
wlcluster = NA
)
})
wl.df <- do.call("rbind", wl.df)
## save old wavelength names
wl.df$old.wlnames <- NA
for (i in seq_along(dots)) {
wln <- names(dots[[i]]@wavelength)
if (!is.null(wln)) wl.df$old.wlnames[wl.df$iobj == i] <- wln
}
wl.df <- wl.df[order(wl.df$wl), ]
## computational shortcut:
## wavelengths that are > 2 * wl.tolerance apart must be in different clusters,
## so cluster analysis can be split there
wl.diff <- diff(wl.df$wl) > 2 * wl.tolerance
## assign preliminary clusters
wl.df$wlcluster <- c(1, 1 + cumsum(wl.diff))
maxcluster <- tail(wl.df$wlcluster, 1)
## preliminary clusters may need to be split further
for (i in seq_len(tail(wl.df$wlcluster, 1))) {
tmp <- wl.df[wl.df$wlcluster == i, ]
## only 1 wavelength in cluster => nothing to do
if (length(tmp) <= 1L) {
next
}
## all wavelengths within 2 * wl.tolerance => nothing to do
if (tail(tmp$wl, 1) - tmp$wl[1] <= 2 * wl.tolerance) {
next
}
## clustering needs to be done on actually unique wavelengths only
unique.wl <- unique(tmp$wl)
# make clusters that span at most 2 * wl.tolerance
dist <- dist(unique.wl)
dend <- hclust(dist, method = "complete")
u <- data.frame(
wl = unique.wl,
wlcluster = cutree(dend, h = 2 * wl.tolerance) + maxcluster
)
maxcluster <- tail(u$wlcluster, 1)
## "expand" unique wavelengths => wavelengths per object
tmp <- merge(tmp, u, by = "wl", suffixes = c(".prelim", ""))
tmp$wlcluster.prelim <- NULL
wl.df$wlcluster[wl.df$wlcluster == i] <- tmp$wlcluster
}
## cluster numbers so far are in no particular order => rename them so
## they correspond to increasing wavelengths
## this saves one call to wl_sort () later on.
wl.df$wlcluster <- as.numeric(
factor(wl.df$wlcluster, levels = unique(wl.df$wlcluster))
)
wl.df
}
# Unit tests -----------------------------------------------------------------
hySpc.testthat::test(.cluster_wavelengths) <- function() {
context(".cluster_wavelengths")
test_that("clustering with last window being long", {
a <- as.hyperSpec(matrix(1:6, ncol = 3), wl = c(0, 2, 4))
b <- as.hyperSpec(matrix(1:6, ncol = 3), wl = c(0, 2, 5))
expect_equal(wl(collapse(a, b, wl.tolerance = 0.25)), c(0, 2, 4, 5))
expect_equal(wl(collapse(a, b, wl.tolerance = 0.5)), c(0, 2, 4.5))
})
test_that("new wavelengths are weighted mean of wavelength bin", {
a <- barbiturates[[1]][, , min ~ 30]
b <- barbiturates[[2]][, , min ~ 30]
wl(b) <- wl(b) + 0.03 / 2
expect_equal(
wl(collapse(a, b, a, wl.tolerance = 0.03)),
sort(c(
27.0499992370605, 28.1499996185303, 30.0499992370605,
27.1649996185303, 28.0649992370605, 30.1649996185303,
mean(c(29.0499992370605, 29.0499992370605, 29.0649992370605))
))
)
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.