soilDB: Soil Database Interface

Documented in simplifyFragmentData

## 2022 I propose we move .sieve to aqp and export it there, see updates --agb
## 2023 Added to aqp as aqp::sieve()
# internally-used function to test size classes
# diameter is in mm
# NA diameter results in NA class
.sieve <- function(diameter, flat = FALSE, para = FALSE, 
                   sieves = c(fine_gravel = 5, gravel = 76, cobbles = 250, stones = 600, boulders = 1e10),
                   new.names = NULL) {
  
  if (flat && missing(sieves)) {
    sieves <- c(channers = 150, flagstones = 380, stones = 600, boulders = 1e10)
  }
  
  if (!is.null(new.names)) {
    names(sieves) <- new.names
  }

  # test for NA, and filter-out
  res <- vector(mode = 'character', length = length(diameter))
  res[which(is.na(diameter))] <- NA
  no.na.idx <- which(!is.na(diameter))

  # only assign classes to non-NA diameters
  if (length(no.na.idx) > 0) {
    # pass diameters "through" sieves
    # 2020: latest part 618 uses '<' for all upper values of class range
    # 2022: adjusted gravel upper threshold to 76 mm 
    classes <- t(sapply(diameter[no.na.idx], function(i) i < sieves))

    # determine largest passing sieve name
    res[no.na.idx] <- names(sieves)[apply(classes, 1, which.max)]

    # change names if we are working with parafrags
    
    if (para == TRUE) {
      res[no.na.idx] <- paste0('para', res[no.na.idx])
    }
  }

  return(res)
}

# x: uncoded contents of the phfrags table
# vol.var: column name in x containing fragment volume (default (`"fragvol"`))
# prefix: prefix used in common with hardness and shape var (default `"frag"`)
# ...: additional arguments to .sieve
.rockFragmentSieve <- function(x, vol.var = "fragvol", prefix = "frag", ...) {

  hardvar <- paste0(prefix, "hard")
  shpvar <- paste0(prefix, "shp")
  sizevar <- paste0(paste0(prefix, "size"), c("_l","_r","_h"))
  
  # convert to lower case: NASIS metadata uses upper for labels, lower for values
  x[[hardvar]] <- tolower(x[[hardvar]])
  x[[shpvar]] <- tolower(x[[shpvar]])

  ## assumptions
  # missing hardness = rock fragment
  x[[hardvar]][which(is.na(x[[hardvar]]))] <- 'strongly cemented'
  # missing shape = Nonflat
  x[[shpvar]][which(is.na(x[[shpvar]]))] <- 'nonflat'

  ## the RV fragment size is likely the safest estimate,
  ## given the various upper bounds for GR (74mm, 75mm, 76mm)
  # calculate if missing
  x[[sizevar[2]]] <- ifelse(
    is.na(x[[sizevar[2]]]),
    (x[[sizevar[1]]] + x[[sizevar[3]]]) / 2,
    x[[sizevar[2]]]
  )

  ## split frags / parafrags
  # frags: >= strongly cemented
  # this should generalize across old / modern codes
  idx <- grep('strong|indurated', x[[hardvar]], ignore.case = TRUE)
  frags <- x[idx, ]

  idx <- grep('strong|indurated', x[[hardvar]], ignore.case = TRUE, invert = TRUE)
  parafrags <- x[idx, ]

  ## split flat / non-flat
  # frags
  idx <- which(frags[[shpvar]] == 'nonflat')
  frags.nonflat <- frags[idx, ]

  idx <- which(frags[[shpvar]] == 'flat')
  frags.flat <- frags[idx, ]

  # parafrags
  idx <- which(parafrags[[shpvar]] == 'nonflat')
  parafrags.nonflat <- parafrags[idx, ]

  idx <- which(parafrags[[shpvar]] == 'flat')
  parafrags.flat <- parafrags[idx, ]

  ## sieve
  # non-flat fragments
  frags.nonflat$class <- .sieve(frags.nonflat[[sizevar[2]]], flat = FALSE, ...)

  # non-flat parafragments
  parafrags.nonflat$class <- .sieve(parafrags.nonflat[[sizevar[2]]], flat = FALSE, para = TRUE, ...)

  # flat fragments
  frags.flat$class <- .sieve(frags.flat[[sizevar[2]]], flat = TRUE, ...)

  # flat parafragments
  parafrags.flat$class <- .sieve(parafrags.flat[[sizevar[2]]], flat = TRUE, para = TRUE, ...)

  # combine pieces, note may contain RF classes == NA
  res <- rbind(frags.nonflat, frags.flat, parafrags.nonflat, parafrags.flat)

  # what does an NA fragment class mean?
  #
  # typically, fragment size missing
  # or, worst-case, .sieve() rules are missing criteria
  #
  # keep track of these for QC in an 'unspecified' column
  # but only when there is a fragment volume specified
  idx <- which(is.na(res$class) & !is.na(res[[vol.var]]))
  if (length(idx) > 0) {
    res$class[idx] <- 'unspecified'
  }

  # done
  return(res)
}

#' Simplify Coarse Fraction Data
#'
#' Simplify multiple coarse fraction (>2mm) records by horizon.
#'
#' This function is mainly intended for processing of NASIS pedon/component
#' data which contains multiple coarse fragment descriptions per horizon.
#' `simplifyFragmentData` will "sieve out" coarse fragments into the USDA
#' classes, split into hard and para- fragments. Likewise, `simplifyArtifactData` will sieve out human artifacts, and split total volume into "cohesive", "penetrable", "innocuous", and "persistent".
#'
#' These functions can be applied to data sources other than NASIS by careful use of the `id.var` and `vol.var` arguments. 
#'  - \code{rf} must contain rock or other fragment volumes in the column "fragvol" (or be specified with `vol.var`), fragment size (mm) in columns "fragsize_l", "fragsize_r", "fragsize_h", fragment cementation class in "fraghard" and flat/non-flat in "fragshp".
#'  - \code{art} must contain artifact volumes in the column "huartvol" (or be specified with `vol.var`), fragment size (mm) in columns "huartsize_l", "huartsize_r", "huartsize_h", artifact cementation class in "huarthard" and flat/non-flat in "huartshp".
#'
#' Examples:
#'  - [KSSL data](http://ncss-tech.github.io/AQP/soilDB/KSSL-demo.html)
#'
#' @aliases simplifyFragmentData simplifyArtifactData
#' @param rf a \code{data.frame} object, typically returned from NASIS, see
#' details
#' @param id.var character vector with the name of the column containing an ID
#' that is unique among all horizons in \code{rf}
#' @param vol.var character vector with the name of the column containing the coarse fragment volume. Default `"fragvol"` or `"huartvol`".
#' @param prefix a character vector prefix for input
#' @param nullFragsAreZero should fragment volumes of NULL be interpreted as 0? (default: `TRUE`), see details
#' @param msg Identifier of data being summarized. Default is `"rock fragment volume"` but this routine is also used for `"surface fragment cover"`
#' @param ... Additional arguments passed to sieving function (e.g. `sieves` a named numeric containing sieve size thresholds with class name)
#' @author D.E. Beaudette, A.G Brown
#' @keywords manip
#' @export simplifyFragmentData
simplifyFragmentData <- function(rf, id.var, vol.var = "fragvol", prefix = "frag", 
                                 nullFragsAreZero = TRUE, msg = "rock fragment volume", ...) {

  fragvol <- NULL

  # fragment classes used in this function
  # note that we are adding a catch-all for those strange phfrags records missing fragment size
  frag.classes <- c('fine_gravel', 'gravel', 'cobbles', 'stones', 'boulders', 'channers', 'flagstones', 'parafine_gravel', 'paragravel', 'paracobbles', 'parastones', 'paraboulders', 'parachanners', 'paraflagstones', 'unspecified')

  result.columns <- c(id.var, frag.classes, "total_frags_pct", "total_frags_pct_nopf")

  # warn the user and remove the NA records
  # if all fragvol are NA then result is a data frame with all ID values NA
  if (nrow(rf[which(!is.na(rf[[vol.var]])),]) == 0) {
    message(sprintf('NOTE: all records are missing %s', msg))
    dat <- as.data.frame(t(rep(NA, length(result.columns))))[seq_len(length(rf[[id.var]])),]
    dat[[which(result.columns == id.var)]] <- rf[[id.var]]
    colnames(dat) <- result.columns
    rownames(dat) <- NULL
    return(dat)
  } else if (any(is.na(rf[[vol.var]]))) {
    rf <- rf[which(!is.na(rf[[vol.var]])), ]
    message(sprintf('NOTE: some records are missing %s', msg))
  }

  # extract classes
  # note: these will put any fragments without fragsize into an 'unspecified' class
  rf.classes <- .rockFragmentSieve(rf, vol.var = vol.var, prefix = prefix)

  ## NOTE: this is performed on the data, as-is: not over all possible classes as enforced by factor levels
  # sum volume by id and class
  # class cannot contain NA
  rf.sums <- aggregate(rf.classes[[vol.var]], 
                       by = list(rf.classes[[id.var]], rf.classes[['class']]), 
                       FUN = sum, na.rm = TRUE)
  # fix default names from aggregate()
  names(rf.sums) <- c(id.var, 'class', 'volume')

  ## NOTE: we set factor levels here because the reshaping (long->wide) needs to account for all possible classes
  ## NOTE: this must include all classes that related functions return
  # set levels of classes
  rf.sums$class <- factor(rf.sums$class, levels = frag.classes)
  
  # convert to wide format
  if (nrow(rf.sums) == 0) {
    rf.wide <- as.data.frame(rep(list(numeric(0)), length(frag.classes)))
    colnames(rf.wide) <- frag.classes
    iddf <- data.frame(character(0))
    colnames(iddf) <- id.var
    rf.wide <- cbind(iddf, rf.wide)
  } else {
    fm <- as.formula(paste0(id.var, ' ~ class'))
    rf.wide <- as.data.frame(data.table::dcast(data.table::data.table(rf.sums), fm, value.var = 'volume', drop = FALSE))
  }
  # must determine the index to the ID column in the wide format
  id.col.idx <- which(names(rf.wide) == id.var)

  ## optionally convert NULL frags -> 0
  if (nullFragsAreZero & ncol(rf.wide) > 1) {
    rf.wide <- as.data.frame(
      cbind(rf.wide[, id.col.idx, drop = FALSE],
            lapply(rf.wide[, -id.col.idx], function(i) ifelse(is.na(i), 0, i))
      ), stringsAsFactors = FALSE)
  }

  # are there any fractions or the total >= 100%
  gt.100 <- lapply(rf.wide[, -id.col.idx, drop = FALSE], FUN = function(i) i >= 100)

  # check each size fraction and report id.var if there are any
  gt.100.matches <- sapply(gt.100, any, na.rm = TRUE)
  if (any(gt.100.matches)) {
    # search within each fraction
    class.idx <- which(gt.100.matches)
    idx <- unique(unlist(lapply(gt.100[class.idx], which)))
    flagged.ids <- rf.wide[[id.var]][idx]

    message(sprintf("NOTE: %s >= 100%% in %s: %s", msg, id.var, paste(flagged.ids, collapse = ",")))
  }

  # compute total fragments
  if (ncol(rf.wide) > 1) {
    # calculate another column for total RF, ignoring "parafractions"
    # index of columns to ignore, para*
    idx.pf <- grep(names(rf.wide), pattern = "para")
    
    # also remove ID column
    idx <- c(id.col.idx, idx.pf)
    # this could result in an error if all fragments are para*
    rf.wide$total_frags_pct_nopf <- rowSums(rf.wide[, -idx], na.rm = TRUE)

    # calculate total fragments (including para)
    # excluding ID and last columns
    idx <- c(id.col.idx, length(names(rf.wide)))
    rf.wide$total_frags_pct <- rowSums(rf.wide[, -idx], na.rm = TRUE)
  }

  # 1. fine gravel is a subset of gravel, therefore: gravel = gravel + fine_gravel
  rf.wide$gravel <- rowSums(cbind(rf.wide$gravel, rf.wide$fine_gravel), na.rm = TRUE)
  rf.wide$paragravel <- rowSums(cbind(rf.wide$paragravel, rf.wide$parafine_gravel), na.rm = TRUE)

  # done
  return(rf.wide)
}