Nothing
## TODO: check for rowSums != 1
## this will cause Shannon H to fail
## TODO: conversion from original <-> safe names is clunky
#' @title Determine ML Horizon Boundaries
#'
#' @description This function accepts input from [slab()] (a `data.frame`) along with a vector of
#' horizon names, and returns a `data.frame` of the most likely horizon
#' boundaries.
#'
#' This function expects that `x` is a `data.frame` generated by
#' [slab()]. If `x` was not generated by [slab()], then `o.names` is required.
#'
#' @param x `data.frame`, output from [slab()]
#'
#' @param o.names an optional character vector of horizon designations that will be used in the final table
#'
#' @return A `data.frame` with the following columns:
#' * `hz`: horizon names
#' * `top`: horizon top depth
#' * `bottom`: horizon bottom depth
#' * `confidence`: integrated probability over thickness of each ML horizon, rounded to the nearest integer
#' * `pseudo.brier`: A "pseudo"" Brier Score for a multi-class prediction, where the most-likely horizon label is treated as the "correct" outcome. Details on the calculation for traditional Brier Scores here: \url{https://en.wikipedia.org/wiki/Brier_score}. Lower values suggest better agreement between ML horizon label and class-wise probabilities.
#' * `mean.H`: mean Shannon entropy (bits), derived from probabilities within each most-likely horizon. Larger values suggest more confusion within each ML.
#'
#' @author D.E. Beaudette
#' @seealso [slab()]
#' @keywords manip
#'
#' @references
#' Beaudette, D. E., Roudier, P., & Skovlin, J. (2016). Probabilistic representation of genetic soil horizons. Digital soil morphometrics, 281-293.
#'
#' @export
#' @examples
#'
#' # init SPC
#' data(sp1)
#' depths(sp1) <- id ~ top + bottom
#'
#' # set horizon designation metadata
#' hzdesgnname(sp1) <- 'name'
#'
#' # generalize horizon designations from character vector
#' # result is an ordered factor
#' sp1$genhz <- generalizeHz(
#' sp1$name,
#' new = c('O','A','B','C'),
#' pat = c('O', '^A','^B','C'),
#' ordered = TRUE
#' )
#'
#' # compute slice-wise GHL probability
#' # so that it sums to contributing fraction
#' # from 0-150cm
#' a <- slab(sp1, fm = ~ genhz, cpm = 1, slab.structure = 0:150)
#'
#' # note original GHL names are set by slab()
#' attr(a, 'original.levels')
#'
#' # generate table of ML horizonation
#' get.ml.hz(a)
#'
get.ml.hz <- function(x, o.names = attr(x, which = 'original.levels')) {
# trick R CMD check
.SD = H = top = bottom = NULL
## TODO: cleanup use of this column name
# internally used, temporary column for storing ML hz name
name <- '.name'
# extract horizon levels, set by slab()
o.levels <- attr(x, which = 'original.levels')
# temporary conversion to data.table
x <- data.table::data.table(x)
# sanity check
if(missing(o.names) && is.null(o.levels)) {
stop('x not derived from slab() or o.names is missing')
} else if (!is.null(o.names)) {
o.levels <- o.names
}
## this should accommodate DF-safe names returned by slab()
# make DF-safe names for hz that violate DF constraints
safe.names <- make.names(o.levels)
# LUT for names
names.LUT <- data.frame(
original = o.levels,
safe = safe.names,
stringsAsFactors = FALSE
)
# get index to max probability,
# but only when there is at least one value > 0 and all are not NA
.f.ML.hz <- function(i) {
if(any(i > 0) & !all(is.na(i)))
which.max(i)
else
NA
}
# get most probable, original, horizon designation by slice
x[[name]] <- safe.names[apply(x[, .SD, .SDcols = safe.names], 1, .f.ML.hz)]
# extract ML hz sequences
x.rle <- rle(as.vector(na.omit(x[[name]])))
x.hz.bounds <- cumsum(x.rle$lengths)
# composite into a data.frame
# note: we always start from 0
x.ml <- data.table::data.table(
hz = x.rle$value,
top = c(0, x.hz.bounds[-length(x.hz.bounds)]),
bottom = x.hz.bounds,
stringsAsFactors = FALSE
)
# in cases where probability depth-functions cross more than once,
# it is necessary to account for overlaps
x.ml <- x.ml[, list(top = min(top), bottom = max(bottom)), by = "hz"]
# re-order using vector of original horizon names-- this will result in NAs if a named horizon was not the most likely
x.ml <- x.ml[match(safe.names, x.ml$hz), ]
x.ml <- na.omit(x.ml)
# integrate probability density function over ML bounds
x.ml$confidence <- NA
for(i in seq_along(x.ml$hz)) {
slice.seq <- seq(from = x.ml$top[i], to = x.ml$bottom[i])
x.i <- x[slice.seq, .SD, .SDcols = x.ml$hz[i]][[1]]
hz.int.prob.pct <- round( (sum(x.i) / length(slice.seq)) * 100)
x.ml$confidence[i] <- hz.int.prob.pct
}
# compute a pseudo-brier score using ML hz as the "true" outcome
# Brier's multi-class score : http://en.wikipedia.org/wiki/Brier_score#Original_definition_by_Brier
# filter NA: why would this happen?
idx <- which(!is.na(x[[name]]))
.internalName <- name
x.bs <- x[idx, ][, list(brierScore(.SD, classLabels = safe.names, actual = .internalName)),
by = c(name), .SDcols = c(name, safe.names)]
# Shannon entropy, (log base 2) bits
x$H <- apply(x[, .SD, .SDcols = safe.names], 1, shannonEntropy, b = 2)
x.H <- x[idx, ][, list(H = mean(H)), by = c(name)]
# fix names for joining
names(x.bs) <- c('hz', 'pseudo.brier')
names(x.H) <- c('hz', 'mean.H')
# join brier scores to ML hz table
x.ml <- merge(x.ml, x.bs, by = 'hz', sort = FALSE)
# join shannon H to ML hz table
x.ml <- merge(x.ml, x.H, by = 'hz', sort = FALSE)
# convert safe names -> original names
x.ml$hz <- names.LUT$original[match(x.ml$hz, names.LUT$safe)]
# back to data.frame
x.ml <- as.data.frame(x.ml)
return(x.ml)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.