aqp: Algorithms for Quantitative Pedology

Documented in aggregateColor

## TODO: consider using Munsell notation for `col`, this would save an entire round-trip through the conversion process


#' @title Summarize Soil Colors
#' 
#' @description Summarize soil color data, weighted by occurrence and horizon thickness.
#'
#' @param x a `SoilProfileCollection` object
#' @param groups the name of a horizon or site attribute used to group horizons, see examples
#' @param col the name of a horizon-level attribute with soil color specified in hexadecimal (i.e. "#rrggbb")
#' @param colorSpace (now deprecated, removed in aqp 2.1) 'CIE2000' used for all cases
#' @param k single integer specifying the number of colors discretized via PAM ([cluster::pam()]), see details
#' @param profile_wt the name of a site-level attribute used to modify weighting, e.g. area
#' 
#' @param mixingMethod method used to estimate "aggregate" soil colors, see [mixMunsell()]
#'
#' @return A list with the following components:
#' 
#' \item{scaled.data}{a `list` of colors and associated weights, one item for each generalized horizon label with at least one color specified in the source data}
#' \item{aggregate.data}{a `data.frame` of weighted-mean colors, one row for each generalized horizon label with at least one color specified in the source data}
#' 
#' @export
#' 
#' @details Weights are computed by:
#' `w_i = sqrt(sum(thickness_i)) * n_i`
#' where `w_i` is the weight associated with color `i`, `thickness_i` is the total thickness of all horizons associated with the color `i`, and `n_i` is the number of horizons associated with color `i`. Weights are computed within groups specified by `groups`.
#' 
#' @author D.E. Beaudette
#' 
#' @seealso [generalize.hz()]
#' 
#' 
#' @examples
#' 
#' # keep examples from using more than 2 cores
#' data.table::setDTthreads(Sys.getenv("OMP_THREAD_LIMIT", unset = 2))
#' 
#' # load some example data
#' data(sp1, package='aqp')
#' 
#' # upgrade to SoilProfileCollection and convert Munsell colors
#' sp1$soil_color <- with(sp1, munsell2rgb(hue, value, chroma))
#' depths(sp1) <- id ~ top + bottom
#' site(sp1) <- ~ group
#' 
#' # generalize horizon names
#' n <- c('O', 'A', 'B', 'C')
#' p <- c('O', 'A', 'B', 'C')
#' sp1$genhz <- generalize.hz(sp1$name, n, p)
#' 
#' # aggregate colors over horizon-level attribute: 'genhz'
#' a <- aggregateColor(sp1, groups = 'genhz', col = 'soil_color')
#' 
#' # check results
#' str(a)
#' 
#' \dontrun{
#' # aggregate colors over site-level attribute: 'group'
#' a <- aggregateColor(sp1, groups = 'group', col = 'soil_color')
#' 
#' # aggregate colors over site-level attribute: 'group'
#' # discretize colors to 4 per group
#' a <- aggregateColor(sp1, groups = 'group', col = 'soil_color', k = 4)
#' 
#' # aggregate colors over depth-slices
#' s <- dice(sp1, c(5, 10, 15, 25, 50, 100, 150) ~ soil_color)
#' s$slice <- paste0(s$top, ' cm')
#' s$slice <- factor(s$slice, levels=guessGenHzLevels(s, 'slice')$levels)
#' a <- aggregateColor(s, groups = 'slice', col = 'soil_color')
#' 
#'   # optionally plot with helper function
#'   if(require(sharpshootR))
#'     aggregateColorPlot(a)
#' 
#' # a more interesting example
#'   data(loafercreek, package = 'soilDB')
#'   
#'   # generalize horizon names using REGEX rules
#'   n <- c('Oi', 'A', 'BA','Bt1','Bt2','Bt3','Cr','R')
#'   p <- c('O', '^A$|Ad|Ap|AB','BA$|Bw', 
#'          'Bt1$|^B$','^Bt$|^Bt2$','^Bt3|^Bt4|CBt$|BCt$|2Bt|2CB$|^C$','Cr','R')
#'   loafercreek$genhz <- generalize.hz(loafercreek$hzname, n, p)
#'   
#'   # remove non-matching generalized horizon names
#'   loafercreek$genhz[loafercreek$genhz == 'not-used'] <- NA
#'   loafercreek$genhz <- factor(loafercreek$genhz)
#'   
#'   a <- aggregateColor(loafercreek, 'genhz')
#'   
#'   # plot results with helper function
#'   par(mar=c(1,4,4,1))
#'   aggregateColorPlot(a, print.n.hz = TRUE)
#'   
#'   # inspect aggregate data
#'   a$aggregate.data
#' }
#' 
aggregateColor <- function(x, groups = 'genhz', col = 'soil_color', colorSpace = 'CIE2000', k = NULL, profile_wt = NULL, mixingMethod = c('estimate', 'exact')) {
  
  # colorSpace argument is now deprecated
  if(!missing(colorSpace)) {
    message('colorSpace argument is now deprecated, CIE2000 distance is always used.')
  }
  
  # sanity check
  mixingMethod <- match.arg(mixingMethod)
  
  # sanity check
  if(!is.null(k)) {
    k <- round(k)
    
    # sanity check, need this for color distance eval
    if (!requireNamespace('farver', quietly = TRUE))
      stop('please install the `farver` package.', call.=FALSE)
    
    if(is.na(k)) {
      stop('k must be a single integer > 0')
    }
  }
  
  # sanity check: profile weight must be a valid site-level attribute
  if(!is.null(profile_wt)) {
    if(! profile_wt %in% siteNames(x)) {
      stop('`profile_wt` must be a site-level attribute of `x`', call. = FALSE)
    }
  }
  
  # sanity check: groups must be a horizon-level attribute
  if(! groups %in% names(x))
    stop('`groups` must be a site or horizon attribute of `x`', call. = FALSE)
  
  # sanity check: col must be a horizon-level attribute
  if(! col %in% horizonNames(x))
    stop('`col` must be a horizon-level attribute of `x`', call. = FALSE)
  
  ## hack to make R CMD check --as-cran happy
  top <- bottom <- thick <- NULL
  
  # extract pieces
  h <- as(x, 'data.frame')
  
  # keep track of just those variables we are using
  # profile_wt is NULL by default
  vars <- c(groups, horizonDepths(x), col, profile_wt)
  
  # remove missing data
  # note: missing color data will result in 'white' when converting to sRGB, why?
  h.no.na <- na.omit(h[, vars])
  
  # re-name depths
  names(h.no.na)[2:3] <- c('top', 'bottom')
  
  # safely compute thickness
  # abs() used in rare cases where horizon logic is wrong: e.g. old-style O horizons
  h.no.na$thick <- abs(h.no.na$bottom - h.no.na$top)
  
  # 0-thickness will result in NA weights
  # replace with a 1-unit slice
  idx <- which(h.no.na$thick == 0)
  if(length(idx) > 0) {
    h.no.na$thick[idx] <- 1
  }
  
  # apply site-level weighting here, if present
  if(!is.null(profile_wt)) {
    # multiply thickness by site-level weight
    h.no.na$thick <- h.no.na$thick * h.no.na[[profile_wt]]
  }
  
  # drop empty group levels
  h.no.na[[groups]] <- factor(h.no.na[[groups]])
  
  # split by genhz
  ss <- split(h.no.na, h.no.na[[groups]])
  
  # iterate over groups
  # note that some genhz will have 0 records
  s <- lapply(ss, function(i) {
    
    # current group label
    this.group <- i[[groups]][1]
    
    # optionally reduce to `k` colors via PAM, by group
    # this will only work if there are >= 1 unique colors
    n.cols <- length(unique(i[[col]]))
    
    if(is.numeric(k) & n.cols > 1) {
      
      # condition number of classes on available data
      k.adj <- pmin(k, n.cols - 1)
      
      # work with a unique subset, there is no need to compute distances / cluster all colors
      lut <- data.frame(col=unique(i[[col]]), stringsAsFactors = FALSE)
      
      # same order as LUT
      # convert to sRGB
      # NA will create bogus colors, filtered above
      v <- t(col2rgb(lut$col) / 255)
      
      # convert to LAB
      v <- convertColor(
        v, 
        from = 'sRGB', 
        to = 'Lab', 
        from.ref.white = 'D65', 
        to.ref.white = 'D65', 
        clip = FALSE
      )
      
      # compute perceptually based distance matrix on unique colors
      dE00 <- farver::compare_colour(v, v, from_space = 'lab', method = 'CIE2000', white_from = 'D65')
      dE00 <- as.dist(dE00)
      
      # clustering from distance matrix
      # TODO: save clustering results for later
      v.pam <- cluster::pam(dE00, k = k.adj, diss = TRUE, pamonce = 5)
      
      # put clustering vector into LUT
      lut$cluster <- v.pam$clustering
      
      # get medoid colors
      med.col <- lut$col[v.pam$medoids]
      
      # expand original colors to medoid colors based on LUT
      idx <- base::match(i[[col]], lut$col)
      
      # replace original colors with discretized colors
      i[[col]] <- med.col[lut$cluster[idx]]
    }
    
    # aggregate depth by unique soil color
    # this assumes that thickness > 0, otherwise NaN is returned
    # convert to data.table for summary
    i <- data.table::as.data.table(i)
    
    # not sure about most readable style
    res <- i[, 
             list(
               weight = sqrt(sum(thick)) * length(thick),
               n.hz = length(thick)
             ), 
             by = col
    ]
    
    # convert back to data.frame
    res <- as.data.frame(res)
    
    # sort by thickness-- this is our metric for relevance
    res <- res[order(res$weight, decreasing=TRUE), ]
    
    ## TODO: this is wasteful, as we likely "knew" the munsell notation before-hand
    # back-calculate the closest Munsell color
    # m <- rgb2munsell(t(col2rgb(res[[col]])) / 255, colorSpace = colorSpace)
    m <- col2Munsell(res[[col]])
    
    
    # format as text
    res$munsell <- paste0(m[, 1], ' ', m[, 2], '/', m[, 3])
    
    # add group ID
    res[['.id']] <- this.group
    
    return(res)
  })
  
  
  
  # rescale using the sum of the weights within the current horizon
  s.scaled <- lapply(s, function(i) {
    i$weight <- i$weight / sum(i$weight)
    return(i)
  })
  
  
  # iteration over groups, estimation of:
  # aggregate colors via mixing
  # number of associated colors
  # Shannon H, base 2
  s.agg <- lapply(s.scaled, function(i) {
    
    # estimation of mixture via wt. mean in CIELAB coordinates
    mix <- mixMunsell(x = i$munsell, w = i$weight, mixingMethod = mixingMethod)
    
    # split into pieces for backwards-compatibility
    m.pieces <- parseMunsell(mix$munsell, convertColors = FALSE)
    col <- parseMunsell(mix$munsell)
    
    # consolidate and return
    res <- data.frame(
      .id = i[['.id']][1],
      m.pieces,
      munsell = mix$munsell,
      distance = mix$distance,
      col = col, 
      n = nrow(i),
      H = shannonEntropy(i$weight)
    )
    
    return(res)
  })
  
  s.agg <- do.call('rbind', s.agg)
  names(s.agg)[1] <- groups
  row.names(s.agg) <- NULL
  
  # return scaled color data
  return(list(scaled.data = s.scaled, aggregate.data = s.agg))
}