Nothing
## TODO: encode some kind of ID into the dissimilarity matrix for later use
#' Evaluate Generalized Horizon Labels
#'
#' Data-driven evaluation of generalized horizon labels using nMDS and
#' silhouette width.
#'
#' Non-metric multidimensional scaling is performed via [MASS::isoMDS()].
#' The input distance matrix is generated by [cluster::daisy()] using
#' (complete cases of) horizon-level attributes from `obj` as named in
#' `vars`.
#'
#' Silhouette widths are computed via [cluster::silhouette()]. The input
#' distance matrix is generated by [cluster::daisy()] using (complete cases
#' of) horizon-level attributes from `obj` as named in `vars`. Note
#' that observations with genhz labels specified in `non.matching.code`
#' are removed filtered before calculation of the distance matrix.
#'
#' @param obj a `SoilProfileCollection` object
#'
#' @param genhz name of horizon-level attribute containing generalized horizon labels
#'
#' @param vars character vector of horizon-level attributes to include in the evaluation
#'
#' @param non.matching.code code used to represent horizons not assigned a generalized horizon label
#'
#' @param stand standardize variables before computing distance matrix, passed to [cluster::daisy()]
#'
#' @param trace verbose output from passed to [MASS::isoMDS()]
#'
#' @param metric distance metric, passed to [cluster::daisy()]
#'
#' @return a list is returned containing:
#' * horizons: `c('mds.1', mds.2', 'sil.width', 'neighbor')`
#' * stats: mean and standard deviation `vars`, computed by generalized horizon label
#' * dist: the distance matrix as passed to [MASS::isoMDS()]
#'
#' @author D.E. Beaudette
#'
#' @seealso [get.ml.hz()]
#'
#' @keywords manip
#'
#' @export
evalGenHZ <- function(obj, genhz = GHL(obj, required = TRUE), vars, non.matching.code='not-used', stand=TRUE, trace=FALSE, metric='euclidean') {
if(!requireNamespace("MASS", quietly = TRUE))
stop("package `MASS` is required", call.=FALSE)
# hack to make R CMD check happy
value <- summarize <- NULL
# extract site / horizons as DF
h <- as(obj, 'data.frame')
# genhz may have its own levels set, but if not a factor, make one
if (!is.factor(h[[genhz]]))
h[[genhz]] <- factor(h[[genhz]])
# make an index to complete data
no.na.idx <- which(complete.cases(h[, vars, drop = FALSE]))
## TODO: all of vars should be numeric or convertable to numeric
# numeric.test <- sapply(vars, function(i) is.numeric(h[[i]]))
# test for duplicate data
# unique IDs are based on a concatenation of variables used... digest would be safer
h.to.test <- h[no.na.idx, c(idname(obj), vars)]
h.to.test$id <- apply(h.to.test[, vars, drop = FALSE], 1, function(i) paste0(i, collapse = '|'))
dupe.names <- names(which(table(h.to.test$id) > 1))
dupe.rows <- h.to.test[which(h.to.test$id %in% dupe.names), ]
dupe.ids <- paste0(unique(dupe.rows[[idname(obj)]]), collapse = ', ')
if (dupe.ids != "")
warning(paste0('duplicate data associated with pedons: ', dupe.ids), call. = FALSE)
# compute pair-wise dissimilarities using our variables of interest
d <- cluster::daisy(h[no.na.idx, vars, drop = FALSE], stand = stand, metric = metric)
# fudge-factor in case of duplicate data (0s in the dissimilarity matrix)
dupe.idx <- which(d < 1e-8)
if (length(dupe.idx) > 0) {
fudge <- min(d[which(d > 0)]) / 100
d[dupe.idx] <- fudge
}
# perform non-metric MDS of dissimilarity matrix
mds <- MASS::isoMDS(d, trace = trace)
# compute silhouette widths after removing not-used genhz class
sil.idx <- which(complete.cases(h[, vars, drop = FALSE]) & h[[genhz]] != non.matching.code)
d.sil <- cluster::daisy(h[sil.idx, vars, drop = FALSE], stand=stand)
sil <- cluster::silhouette(as.numeric(h[[genhz]])[sil.idx], d.sil)
# add new columns
h$mds.1 <- NA
h$mds.2 <- NA
h$sil.width <- NA
h$neighbor <- NA
# copy values
h$mds.1[no.na.idx] <- mds$points[, 1]
h$mds.2[no.na.idx] <- mds$points[, 2]
h$sil.width[sil.idx] <- sil[, 3]
h$neighbor[sil.idx] <- levels(h[[genhz]])[sil[, 2]]
## TODO: enforce / check above
## important note: all 'vars' should be numeric
# convert wide -> long form
# using data.table::melt
# suppressing warnings related to mixture of int / numeric
m <- suppressWarnings(
data.table::melt(
data.table::as.data.table(h),
id.vars = genhz,
measure.vars = c(vars, 'sil.width')
)
)
# leave as data.table for aggregation
# compute group-wise summaries
m.summary <- m[, list(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE)), by = c((genhz), 'variable')]
# format text
m.summary$stats <- sprintf(
"%s (%s)",
round(m.summary$mean, 2),
round(m.summary$sd, 2)
)
# using data.table::dcast
fm <- paste0(genhz, ' ~ variable')
genhz.stats <- dcast(data = m.summary, formula = fm, value.var = 'stats')
# convert back to data.frame
genhz.stats <- as.data.frame(genhz.stats)
# composite into a list
res <- list(horizons = h[, c('mds.1', 'mds.2', 'sil.width', 'neighbor')],
stats = genhz.stats, dist = d)
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.