R/get.ml.hz.R
In aqp: Algorithms for Quantitative Pedology

Documented in get.ml.hz

## TODO: check for rowSums != 1
##       this will cause Shannon H to fail

## TODO: conversion from original <-> safe names is clunky


#' @title Determine ML Horizon Boundaries
#'
#' @description This function accepts input from [slab()] (a `data.frame`) along with a vector of
#' horizon names, and returns a `data.frame` of the most likely horizon
#' boundaries.
#'
#' This function expects that `x` is a `data.frame` generated by
#' [slab()]. If `x` was not generated by [slab()], then `o.names` is required.
#'
#' @param x `data.frame`, output from [slab()]
#' 
#' @param o.names an optional character vector of horizon designations that will be used in the final table
#' 
#' @return A `data.frame` with the following columns: 
#'   * `hz`: horizon names
#'   * `top`: horizon top depth
#'   * `bottom`: horizon bottom depth
#'   * `confidence`: integrated probability over thickness of each ML horizon, rounded to the nearest integer
#'   * `pseudo.brier`: A "pseudo"" Brier Score for a multi-class prediction, where the most-likely horizon label is treated as the "correct" outcome. Details on the calculation for traditional Brier Scores here: \url{https://en.wikipedia.org/wiki/Brier_score}. Lower values suggest better agreement between ML horizon label and class-wise probabilities.
#'   * `mean.H`: mean Shannon entropy (bits), derived from probabilities within each most-likely horizon. Larger values suggest more confusion within each ML.
#' 
#' @author D.E. Beaudette
#' @seealso [slab()]
#' @keywords manip
#' 
#' @references 
#' Beaudette, D. E., Roudier, P., & Skovlin, J. (2016). Probabilistic representation of genetic soil horizons. Digital soil morphometrics, 281-293.
#' 
#' @export
#' @examples
#'
#' # init SPC
#' data(sp1)
#' depths(sp1) <- id ~ top + bottom
#' 
#' # set horizon designation metadata
#' hzdesgnname(sp1) <- 'name'
#' 
#' # generalize horizon designations from character vector
#' # result is an ordered factor
#' sp1$genhz <- generalizeHz(
#'   sp1$name,
#'   new = c('O','A','B','C'),
#'   pat = c('O', '^A','^B','C'),
#'   ordered = TRUE
#' )
#' 
#' # compute slice-wise GHL probability
#' # so that it sums to contributing fraction
#' # from 0-150cm
#' a <- slab(sp1, fm = ~ genhz, cpm = 1, slab.structure = 0:150)
#' 
#' # note original GHL names are set by slab()
#' attr(a, 'original.levels')
#' 
#' # generate table of ML horizonation
#' get.ml.hz(a)
#'
get.ml.hz <- function(x, o.names = attr(x, which = 'original.levels')) {

  # trick R CMD check
  .SD = H = top = bottom = NULL
  
  ## TODO: cleanup use of this column name
  # internally used, temporary column for storing ML hz name
  name <- '.name'
  
  # extract horizon levels, set by slab()
  o.levels <- attr(x, which = 'original.levels')
  
  # temporary conversion to data.table
  x <- data.table::data.table(x)
  
  # sanity check
  if(missing(o.names) && is.null(o.levels)) {
    stop('x not derived from slab() or o.names is missing')
  } else if (!is.null(o.names)) {
    o.levels <- o.names
  }
  
  ## this should accommodate DF-safe names returned by slab()
  # make DF-safe names for hz that violate DF constraints
  safe.names <- make.names(o.levels)

  # LUT for names
  names.LUT <- data.frame(
    original = o.levels,
    safe = safe.names,
    stringsAsFactors = FALSE
    )

	# get index to max probability,
	# but only when there is at least one value > 0 and all are not NA
	.f.ML.hz <- function(i) {
		if(any(i > 0) & !all(is.na(i)))
			which.max(i)
		else
			NA
	}


	# get most probable, original, horizon designation by slice
	x[[name]] <- safe.names[apply(x[, .SD, .SDcols = safe.names], 1, .f.ML.hz)]

	# extract ML hz sequences
	x.rle <- rle(as.vector(na.omit(x[[name]])))
	x.hz.bounds <- cumsum(x.rle$lengths)

	# composite into a data.frame
	# note: we always start from 0
	x.ml <- data.table::data.table(
	  hz = x.rle$value,
	  top = c(0, x.hz.bounds[-length(x.hz.bounds)]),
	  bottom = x.hz.bounds,
	  stringsAsFactors = FALSE
	)

	# in cases where probability depth-functions cross more than once,
	# it is necessary to account for overlaps
	x.ml <- x.ml[, list(top = min(top), bottom = max(bottom)), by = "hz"]

	# re-order using vector of original horizon names-- this will result in NAs if a named horizon was not the most likely
	x.ml <- x.ml[match(safe.names, x.ml$hz), ]
	x.ml <- na.omit(x.ml)

	# integrate probability density function over ML bounds
	x.ml$confidence <- NA
	for(i in seq_along(x.ml$hz)) {
		slice.seq <- seq(from = x.ml$top[i], to = x.ml$bottom[i])
		x.i <- x[slice.seq, .SD, .SDcols = x.ml$hz[i]][[1]]
		hz.int.prob.pct <- round( (sum(x.i) / length(slice.seq)) * 100)
		x.ml$confidence[i] <- hz.int.prob.pct
	}

	# compute a pseudo-brier score using ML hz as the "true" outcome
  # Brier's multi-class score : http://en.wikipedia.org/wiki/Brier_score#Original_definition_by_Brier
	# filter NA: why would this happen?
	idx <- which(!is.na(x[[name]]))
	
	.internalName <- name
	x.bs <- x[idx, ][, list(brierScore(.SD, classLabels = safe.names, actual = .internalName)), 
	                by = c(name), .SDcols = c(name, safe.names)]
	
	# Shannon entropy, (log base 2) bits
  x$H <- apply(x[, .SD, .SDcols = safe.names], 1, shannonEntropy, b = 2)
  x.H <- x[idx, ][, list(H = mean(H)), by = c(name)]

  # fix names for joining
  names(x.bs) <- c('hz', 'pseudo.brier')
  names(x.H) <- c('hz', 'mean.H')

  # join brier scores to ML hz table
  x.ml <- merge(x.ml, x.bs, by = 'hz', sort = FALSE)

  # join shannon H to ML hz table
  x.ml <- merge(x.ml, x.H, by = 'hz', sort = FALSE)

  # convert safe names -> original names
  x.ml$hz <- names.LUT$original[match(x.ml$hz, names.LUT$safe)]
  
  # back to data.frame
  x.ml <- as.data.frame(x.ml)

	return(x.ml)
}