# R/get.ml.hz.R In aqp: Algorithms for Quantitative Pedology

#### Documented in get.ml.hz

## TODO: check for rowSums != 1
##       this will cause Shannon H to fail

## TODO: conversion from original <-> safe names is clunky

#' @title Determine ML Horizon Boundaries
#'
#' @description This function accepts input from [slab()] (a `data.frame`) along with a vector of
#' horizon names, and returns a `data.frame` of the most likely horizon
#' boundaries.
#'
#' This function expects that `x` is a `data.frame` generated by
#' [slab()]. If `x` was not generated by [slab()], then `o.names` is required.
#'
#' @param x `data.frame`, output from [slab()]
#'
#' @param o.names an optional character vector of horizon designations that will be used in the final table
#'
#' @return A `data.frame` with the following columns:
#'   * `hz`: horizon names
#'   * `top`: horizon top depth
#'   * `bottom`: horizon bottom depth
#'   * `confidence`: integrated probability over thickness of each ML horizon, rounded to the nearest integer
#'   * `pseudo.brier`: A "pseudo"" Brier Score for a multi-class prediction, where the most-likely horizon label is treated as the "correct" outcome. Details on the calculation for traditional Brier Scores here: \url{https://en.wikipedia.org/wiki/Brier_score}. Lower values suggest better agreement between ML horizon label and class-wise probabilities.
#'   * `mean.H`: mean Shannon entropy (bits), derived from probabilities within each most-likely horizon. Larger values suggest more confusion within each ML.
#'
#' @author D.E. Beaudette
#' @seealso [slab()]
#' @keywords manip
#'
#' @references
#' Beaudette, D. E., Roudier, P., & Skovlin, J. (2016). Probabilistic representation of genetic soil horizons. Digital soil morphometrics, 281-293.
#'
#' @export
#' @examples
#'
#' # init SPC
#' data(sp1)
#' depths(sp1) <- id ~ top + bottom
#'
#' # set horizon designation metadata
#' hzdesgnname(sp1) <- 'name'
#'
#' # generalize horizon designations from character vector
#' # result is an ordered factor
#' sp1\$genhz <- generalizeHz(
#'   sp1\$name,
#'   new = c('O','A','B','C'),
#'   pat = c('O', '^A','^B','C'),
#'   ordered = TRUE
#' )
#'
#' # compute slice-wise GHL probability
#' # so that it sums to contributing fraction
#' # from 0-150cm
#' a <- slab(sp1, fm = ~ genhz, cpm = 1, slab.structure = 0:150)
#'
#' # note original GHL names are set by slab()
#' attr(a, 'original.levels')
#'
#' # generate table of ML horizonation
#' get.ml.hz(a)
#'
get.ml.hz <- function(x, o.names = attr(x, which = 'original.levels')) {

# trick R CMD check
.SD = H = top = bottom = NULL

## TODO: cleanup use of this column name
# internally used, temporary column for storing ML hz name
name <- '.name'

# extract horizon levels, set by slab()
o.levels <- attr(x, which = 'original.levels')

# temporary conversion to data.table
x <- data.table::data.table(x)

# sanity check
if(missing(o.names) && is.null(o.levels)) {
stop('x not derived from slab() or o.names is missing')
} else if (!is.null(o.names)) {
o.levels <- o.names
}

## this should accommodate DF-safe names returned by slab()
# make DF-safe names for hz that violate DF constraints
safe.names <- make.names(o.levels)

# LUT for names
names.LUT <- data.frame(
original = o.levels,
safe = safe.names,
stringsAsFactors = FALSE
)

# get index to max probability,
# but only when there is at least one value > 0 and all are not NA
.f.ML.hz <- function(i) {
if(any(i > 0) & !all(is.na(i)))
which.max(i)
else
NA
}

# get most probable, original, horizon designation by slice
x[[name]] <- safe.names[apply(x[, .SD, .SDcols = safe.names], 1, .f.ML.hz)]

# extract ML hz sequences
x.rle <- rle(as.vector(na.omit(x[[name]])))
x.hz.bounds <- cumsum(x.rle\$lengths)

# composite into a data.frame
# note: we always start from 0
x.ml <- data.table::data.table(
hz = x.rle\$value,
top = c(0, x.hz.bounds[-length(x.hz.bounds)]),
bottom = x.hz.bounds,
stringsAsFactors = FALSE
)

# in cases where probability depth-functions cross more than once,
# it is necessary to account for overlaps
x.ml <- x.ml[, list(top = min(top), bottom = max(bottom)), by = "hz"]

# re-order using vector of original horizon names-- this will result in NAs if a named horizon was not the most likely
x.ml <- x.ml[match(safe.names, x.ml\$hz), ]
x.ml <- na.omit(x.ml)

# integrate probability density function over ML bounds
x.ml\$confidence <- NA
for(i in seq_along(x.ml\$hz)) {
slice.seq <- seq(from = x.ml\$top[i], to = x.ml\$bottom[i])
x.i <- x[slice.seq, .SD, .SDcols = x.ml\$hz[i]][[1]]
hz.int.prob.pct <- round( (sum(x.i) / length(slice.seq)) * 100)
x.ml\$confidence[i] <- hz.int.prob.pct
}

# compute a pseudo-brier score using ML hz as the "true" outcome
# Brier's multi-class score : http://en.wikipedia.org/wiki/Brier_score#Original_definition_by_Brier
# filter NA: why would this happen?
idx <- which(!is.na(x[[name]]))

.internalName <- name
x.bs <- x[idx, ][, list(brierScore(.SD, classLabels = safe.names, actual = .internalName)),
by = c(name), .SDcols = c(name, safe.names)]

# Shannon entropy, (log base 2) bits
x\$H <- apply(x[, .SD, .SDcols = safe.names], 1, shannonEntropy, b = 2)
x.H <- x[idx, ][, list(H = mean(H)), by = c(name)]

# fix names for joining
names(x.bs) <- c('hz', 'pseudo.brier')
names(x.H) <- c('hz', 'mean.H')

# join brier scores to ML hz table
x.ml <- merge(x.ml, x.bs, by = 'hz', sort = FALSE)

# join shannon H to ML hz table
x.ml <- merge(x.ml, x.H, by = 'hz', sort = FALSE)

# convert safe names -> original names
x.ml\$hz <- names.LUT\$original[match(x.ml\$hz, names.LUT\$safe)]

# back to data.frame
x.ml <- as.data.frame(x.ml)

return(x.ml)
}

## Try the aqp package in your browser

Any scripts or data that you put into this service are public.

aqp documentation built on Sept. 11, 2024, 7:11 p.m.