R/evalGenHz.R

Defines functions evalGenHZ

Documented in evalGenHZ

## TODO: encode some kind of ID into the dissimilarity matrix for later use
#' Evaluate Generalized Horizon Labels
#'
#' Data-driven evaluation of generalized horizon labels using nMDS and
#' silhouette width.
#'
#' Non-metric multidimensional scaling is performed via \code{\link{isoMDS}}.
#' The input distance matrix is generated by \code{\link{daisy}} using
#' (complete cases of) horizon-level attributes from \code{obj} as named in
#' \code{vars}.
#'
#' Silhouette widths are computed via \code{\link{silhouette}}. The input
#' distance matrix is generated by \code{\link{daisy}} using (complete cases
#' of) horizon-level attributes from \code{obj} as named in \code{vars}. Note
#' that observations with genhz labels specified in \code{non.matching.code}
#' are removed filtered before calculation of the distance matrix.
#'
#' @param obj a \code{SoilProfileCollection} object
#' @param genhz name of horizon-level attribute containing generalized horizon
#' labels
#' @param vars character vector of horizon-level attributes to include in the
#' evaluation
#' @param non.matching.code code used to represent horizons not assigned a
#' generalized horizon label
#' @param stand standardize variables before computing distance matrix (default
#' = TRUE), passed to \code{\link{daisy}}
#' @param trace verbose output from passed to \code{\link{isoMDS}}, (default =
#' FALSE)
#' @param metric distance metric, passed to \code{\link{daisy}}
#' @return a list is returned containing: \describe{ \item{horizons}{c('mds.1',
#' 'mds.2', 'sil.width', 'neighbor')} \item{stats}{mean and standard deviation
#' of \code{vars}, computed by generalized horizon label} \item{dist}{the
#' distance matrix as passed to \code{\link{isoMDS}}} }
#' @author D.E. Beaudette
#' @seealso \code{\link{get.ml.hz}}
#' @keywords manip
#' @export
evalGenHZ <- function(obj, genhz = GHL(obj, required = TRUE), vars, non.matching.code='not-used', stand=TRUE, trace=FALSE, metric='euclidean') {
  if(!requireNamespace("MASS", quietly = TRUE))
    stop("package `MASS` is required", call.=FALSE)

  # hack to make R CMD check happy
  value <- summarize <- NULL

  # extract site / horizons as DF
  h <- as(obj, 'data.frame')

  # genhz may have its own levels set, but if not a factor, make one
  if (!is.factor(h[[genhz]]))
    h[[genhz]] <- factor(h[[genhz]])

  # make an index to complete data
  no.na.idx <- which(complete.cases(h[, vars, drop = FALSE]))

  ## TODO: all of vars should be numeric or convertable to numeric
  # numeric.test <- sapply(vars, function(i) is.numeric(h[[i]]))

  # test for duplicate data
  # unique IDs are based on a concatenation of variables used... digest would be safer
  h.to.test <- h[no.na.idx, c(idname(obj), vars)]
  h.to.test$id <- apply(h.to.test[, vars, drop = FALSE], 1, function(i) paste0(i, collapse = '|'))
  dupe.names <- names(which(table(h.to.test$id) > 1))
  dupe.rows <- h.to.test[which(h.to.test$id %in% dupe.names), ]
  dupe.ids <- paste0(unique(dupe.rows[[idname(obj)]]), collapse = ', ')
  if (dupe.ids != "")
    warning(paste0('duplicate data associated with pedons: ', dupe.ids), call. = FALSE)

  # compute pair-wise dissimilarities using our variables of interest
  d <- cluster::daisy(h[no.na.idx, vars, drop = FALSE], stand = stand, metric = metric)

  # fudge-factor in case of duplicate data (0s in the dissimilarity matrix)
  dupe.idx <- which(d < 1e-8)
  if (length(dupe.idx) > 0) {
    fudge <- min(d[which(d > 0)]) / 100
    d[dupe.idx] <- fudge
  }

  # perform non-metric MDS of dissimilarity matrix
  mds <- MASS::isoMDS(d, trace = trace)

  # compute silhouette widths after removing not-used genhz class
  sil.idx <-  which(complete.cases(h[, vars, drop = FALSE]) & h[[genhz]] != non.matching.code)
  d.sil <- cluster::daisy(h[sil.idx, vars, drop = FALSE], stand=stand)
  sil <- cluster::silhouette(as.numeric(h[[genhz]])[sil.idx], d.sil)

  # add new columns
  h$mds.1 <- NA
  h$mds.2 <- NA
  h$sil.width <- NA
  h$neighbor <- NA

  # copy values
  h$mds.1[no.na.idx] <- mds$points[, 1]
  h$mds.2[no.na.idx] <- mds$points[, 2]
  h$sil.width[sil.idx] <- sil[, 3]
  h$neighbor[sil.idx] <- levels(h[[genhz]])[sil[, 2]]

  ## TODO: enforce / check above
  ## important note: all 'vars' should be numeric


  # convert wide -> long form
  # using data.table::melt
  # suppressing warnings related to mixture of int / numeric
  m <- suppressWarnings(
    data.table::melt(
      data.table::as.data.table(h),
      id.vars = genhz,
      measure.vars = c(vars, 'sil.width')
    )
  )

  # leave as data.table for aggregation
  # compute group-wise summaries
  m.summary <- m[, list(mean = mean(value, na.rm = TRUE), sd = sd(value, na.rm = TRUE)), by = c((genhz), 'variable')]

  # format text
  m.summary$stats <- sprintf(
    "%s (%s)",
    round(m.summary$mean, 2),
    round(m.summary$sd, 2)
  )

  # using data.table::dcast
  fm <- paste0(genhz, ' ~ variable')
  genhz.stats <- dcast(data = m.summary, formula = fm, value.var = 'stats')

  # convert back to data.frame
  genhz.stats <- as.data.frame(genhz.stats)

  # composite into a list
  res <- list(horizons = h[, c('mds.1', 'mds.2', 'sil.width', 'neighbor')],
              stats = genhz.stats, dist = d)
  return(res)
}

Try the aqp package in your browser

Any scripts or data that you put into this service are public.

aqp documentation built on Sept. 8, 2023, 5:45 p.m.