# inst/reports/region2/mlra-comparison-dynamic/custom.R

## Mapunit summary utility functions

## Note: warnings will corrupt md markup that is created by this function

makeCategoricalOutput <- function(dat, do.spatial.summary=TRUE) {
  
  # take the first variable name only; this function is intended to be called via lapply()
  # with a list of data.frames, one data.frame per variable
  variable.name <- unique(dat$variable)[1]
  pat <- variableNameToPattern(variable.name)
  
  metadat <- lvls <- lbls <- list()
  
  # if no specific metadata for this variable, use the variable name as used in config file as header
  if(is.null(pat)) {
    metadat$header <- variable.name 
    metadat$description <- ""
    metadat$levels <- unique(dat$value)
    # does nothing special... one value = one label as character
    metadat$labels <- metadat$levels
    
    # local copies, used later when subsetting legend labels and plot colors
    lvls <- metadat$levels
    lbls <- metadat$labels
    
    ## TODO: better colors for varying number of possible categories
    # default colors, not very nice
    # note: use a color ramp function for an arbitrary number of colors
    # note: brewer.pal() requires n >= 3, so clamp the request to the 3-9 color range
    metadat$colors <- colorRampPalette(brewer.pal(pmax(3, pmin(length(metadat$levels), 9)), 'Set1'))(length(metadat$levels))
    
    metadat$decimals <- 2
    dat$value <- factor(dat$value, levels=metadat$levels, labels=metadat$labels)
    
  } else {
    # metadata is defined in categorical_definitions.R
    ## note: this object exists outside of the scope of this function
    metadat <- categorical.defs[[pat]] 
    flag <- FALSE
    
    # keep.all.classes is optional metadata and does not have to be specified
    if(!is.null(metadat$keep.all.classes)) {
      if(metadat$keep.all.classes) 
        flag <- TRUE
    }
    
    # default behavior is to remove extraneous classes for table readability
    if(!flag) {
      lvls <- metadat$levels[metadat$levels %in% dat$value]
      lbls <- metadat$labels[metadat$levels %in% lvls]
    } else {
      # keep.all.classes = TRUE: retain all classes defined in the metadata,
      # even those not observed in the sampled data
      lvls <- metadat$levels
      lbls <- metadat$labels
    }
    
    dat$value <- factor(dat$value, levels=lvls, labels=lbls)
  }
  
  cat(paste0("  \n### ", metadat$header,"  \n"))
  if(!is.null(metadat$description))  {
    cat(paste0("  \n",metadat$description,"  \n"))
  }
  if(!is.null(metadat$usage))  {
    cat("  \n  **Suggested usage:**  \n")
    cat(paste0("  \n",metadat$usage,"  \n  \n"))
  }
  
  
  ## sanity check: if the wrong raster is specified in the config file, levels will not match codes
  ##               and all values will be NA
  if(all(is.na(dat$value))){
    stop(sprintf('all samples for "%s" are NA, perhaps the wrong raster file was specified?', variable.name))
  }
  
  # convert counts into proportions
  x <- sweepProportions(dat)
  
  # tidy legend by removing "near-zero" proportions
  idx <- which(apply(x, 2, function(i) any(i > 0.001)))
  
  # drop the levels/labels for any class removed due to near-zero proportions
  # this ensures that the correct color is paired with each remaining class
  keep.flag <- (!lbls %in% colnames(x[, -idx]))
  lvls <- lvls[keep.flag]
  lbls <- lbls[keep.flag]
  
  # remove classes with near-zero proportions
  # result is a table
  x <- x[, idx, drop=FALSE]
  
  # convert table -> data.frame, factor levels are lost
  # implicitly converted to long format
  x.long <- as.data.frame.table(x, stringsAsFactors = FALSE)
  
  # re-add factor levels for MUSYM and raster labels
  x.long$.id <- factor(x.long$.id, levels = levels(dat$.id))
  x.long$value <- factor(x.long$value, levels = lbls)
  
  # re-label long-format for plotting
  names(x.long)[2] <- "label"
  names(x.long)[3] <- "value"
  
  
  # create a signature of the most frequent classes that sum to 75% or more
  x.sig <- apply(x, 1, classSignature)
  x.sig <- as.data.frame(x.sig)
  names(x.sig) <- 'mapunit composition "signature" (most frequent classes that sum to 75% or more)'
  
  # get a signature for each polygon
  spatial.summary <- ddply(dat, c( '.id', 'pID'), .fun=sweepProportions, drop.unused.levels=FALSE, single.id=TRUE)
  
  ## most likely class
  most.likely.class.idx <- 1
  
  # prevent problems when only a single class is observed in the samples (MU has low variance w.r.t. the raster)
  if(!is.null(ncol(spatial.summary[, -c(1,2)])))
    most.likely.class.idx <- apply(spatial.summary[, -c(1:2)], 1, which.max)
  
  spatial.summary[[paste0('ml_', variable.name)]] <- levels(dat$value)[most.likely.class.idx]
  
  ## implemented via aqp::shannonEntropy()
  # shannon entropy, log base 2 (bits) by polygon
  # shannon entropy is zero when mapunit is "pure"
  if(!is.null(ncol(spatial.summary[, -c(1,2,length(names(spatial.summary)))]))) # handle the same single-class edge case as above
    spatial.summary[[paste0('shannon_h_', variable.name)]] <- apply(spatial.summary[, -c(1,2,length(names(spatial.summary)))], 1, aqp::shannonEntropy, b=2)
  else
    spatial.summary[[paste0('shannon_h_', variable.name)]] <- rep(0, length(spatial.summary[, -c(1,2,length(names(spatial.summary)))])) 
  
  
  # setup plot styling
  colors <-  metadat$colors[metadat$levels %in% lvls]
  tps <- list(superpose.polygon=list(col=colors, lwd=2, lend=2))
  trellis.par.set(tps)
  
  # setup legend configuration
  sK.levels <- levels(x.long$label)
  if(length(sK.levels) < 3) {
    sK.columns <- length(sK.levels)
  } else {
    sK.columns <- 3 # hard-coded for now, TODO: dynamically set for space-saving and readability (#66)
  }
  
  # compose legend
  sK <- simpleKey(space='top', columns=sK.columns, text=sK.levels, rectangles = TRUE, points=FALSE)
  
  if(length(unique(x.long$.id)) > 1 & do.spatial.summary) {
    # cluster proportions
    # note: daisy will issue warnings when there is only a single class with non-0 proportions
    #       warnings emitted at this point will corrupt MD markup
    x.d <- as.hclust(diana(suppressWarnings(daisy(x))))
    # re-order MU labels levels based on clustering
    x.long$.id <- factor(x.long$.id, levels=unique(x.long$.id)[x.d$order])
    # musym are re-ordered according to clustering
    cat('  \n  \n')
    print(barchart(.id ~ value, groups=x.long$label, data=x.long, horiz=TRUE, stack=TRUE, xlab='Proportion of Samples', scales=list(cex=1.5), key=sK, legend=list(right=list(fun=dendrogramGrob, args=list(x = as.dendrogram(x.d), side="right", size=10)))))
    cat('  \n  \n')
  } else {
    # re-order MUSYM labels according to original ordering, specified in mu.set
    x.long$.id <- factor(x.long$.id, levels=levels(dat$.id))
    
    trellis.par.set(tps)
    cat('  \n  \n')
    print(barchart(.id ~ value, groups=x.long$label, data=x.long, horiz=TRUE, stack=TRUE, xlab='Proportion of Samples', scales=list(cex=1.5), key=sK))
    cat('  \n  \n')
  }  
  print(kable(x, digits = metadat$decimals))
  cat('  \n  \n')
  if(do.spatial.summary) {
    print(kable(x.sig))
    cat('  \n  \n')
    # append the per-polygon summaries to the `mu` spatial object defined in the report environment;
    # assign with `<<-` so that the merge persists after this function returns
    mu <<- merge(mu, spatial.summary, by='pID', all.x=TRUE)
  }
  return(TRUE)
}
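
## example (not run): a minimal sketch of the intended calling pattern, assuming `d.cat`
## holds long-format categorical raster samples with 'variable', 'value', '.id', and 'pID'
## columns (see subsetByName() below); typically invoked from an R Markdown chunk with
## results='asis' so that the cat() / kable() output is rendered as markdown
# d.cat.list <- split(d.cat, d.cat$variable)
# res <- lapply(d.cat.list, makeCategoricalOutput, do.spatial.summary = TRUE)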


classSignature <- function(i) {
  # order the proportions of each row
  o <- order(i, decreasing = TRUE)
  
  # determine a cut point for cumulative proportion >= threshold value
  thresh.n <- which(cumsum(i[o]) >= 0.75)[1]
  
  # sanity check: if all classes are 0 then results are NA and we stop now
  if(is.na(thresh.n))
    return("NA")
  
  # get the top classes, including the class that pushes the cumulative proportion over the 75% threshold
  top.classes <- i[o][1:thresh.n]
  
  # format for adding to a table
  paste(names(top.classes), collapse = '/')
}
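
## example (not run): worked example using a named vector of class proportions,
## like a single row from sweepProportions() output
# p <- c(forest = 0.55, shrub = 0.25, grass = 0.15, barren = 0.05)
# classSignature(p)
# # "forest/shrub" -- the most frequent classes that cumulatively sum to >= 75%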


# remove NA from $value
# compute density for $value, using 1.5x "default" bandwidth
# re-scale to [0, 1]
# truncate to original range of $value
# return x,y values
scaled.density <- function(d, constantScaling=TRUE) {
  # basic density estimate
  v <- na.omit(d$value)
  res <- stats::density(v, kernel='gaussian', adjust=1.5)
  
  # optionally re-scale to [0, 1]
  if(constantScaling)
    res$y <- scales::rescale(res$y)
  
  # constrain to original range
  r <- range(v)
  idx.low <- which(res$x < r[1])
  idx.high <- which(res$x > r[2])
  # replace with NA
  res$x[c(idx.low, idx.high)] <- NA
  res$y[c(idx.low, idx.high)] <- NA
  
  return(data.frame(x=res$x, y=res$y))
}
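
## example (not run): hypothetical data.frame with a 'value' column, as produced by raster
## sampling; the result supplies x/y coordinates for adding a density curve to a panel
# d <- data.frame(value = rnorm(100, mean = 10, sd = 2))
# dens <- scaled.density(d, constantScaling = TRUE)
# # lines(dens$x, dens$y) within an existing base / lattice panel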


# TODO: this could be useful in soilReports? not really sharpshootR-worthy since it has nothing to do with soil... might be best just left here
# abstracted for use as the default symbology for "undefined" categoricals; previously used to define mapunit colors for legends
makeNiceColors <- function(n) {
  # 7 or fewer classes: use high-contrast colors
  if(n <= 7) {
    cols <- brewer.pal(9, 'Set1') 
    # remove the light colors (yellow and pink)
    cols <- cols[c(1:5,7,9)]
  } else {
    # otherwise, use the 12 paired colors
    # note: brewer.pal() provides at most 12 colors, so n > 12 will yield NA values
    cols <- brewer.pal(12, 'Paired')
  }
  return(cols[1:n])
}
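
## example (not run): the two symbology branches
# makeNiceColors(5)   # 5 high-contrast colors selected from Set1
# makeNiceColors(10)  # first 10 of the 12 'Paired' colors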

abbreviateNames <- function(spdf) {
  sapply(names(spdf)[-1], function(i) {
    # keep only alpha and underscore characters in field names
    i <- gsub('[^[:alpha:]_]', '', i)
    # abbreviate after filtering other bad chars
    abbr <- abbreviate(i, minlength = 10)
    return(abbr)
  })
}

# return DF with proportions outside range for each polygon (by pID)
flagPolygons <- function(i) {
  
  # convert values -> empirical quantiles
  e.i <- ecdf(i$value)
  q.i <- e.i(i$value)
  # locate samples outside of the 5th-95th percentile range
  out.idx <- which(q.i < 0.05 | q.i > 0.95)
  
  ## TODO: may need to protect against no matching rows?
  tab <- sort(prop.table(table(i$pID[out.idx])), decreasing = TRUE)
  df <- data.frame(pID=names(tab), prop.outside.range=round(as.vector(tab), 2))
  
  # all proportions outside of the range are now reported in the QC shapefile; no threshold is applied
  # previously: keep only polygons with > 15% of samples outside of the range
  # df <- df[which(df$prop.outside.range > p.crit), ]
  return(df)
}
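
## example (not run): a sketch of the intended use, assuming `d.continuous` (hypothetical name)
## holds long-format continuous raster samples with '.id', 'variable', 'value', and 'pID' columns
# poly.flags <- ddply(d.continuous, c('.id', 'variable'), flagPolygons)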


# http://stackoverflow.com/questions/16225530/contours-of-percentiles-on-level-plot
kdeContours <- function(i, prob, cols, m, ...) {
  
  if(nrow(i) < 2) {
    return(NULL)
  }
  
  this.id <- unique(i$.id)
  this.col <- cols[match(this.id, m)]
  dens <- kde2d(i$x, i$y, n=200) # estimate the bivariate density surface via MASS::kde2d
  
  dx <- diff(dens$x[1:2])
  dy <- diff(dens$y[1:2])
  sz <- sort(dens$z)
  c1 <- cumsum(sz) * dx * dy
  levels <- sapply(prob, function(x) {
    approx(c1, sz, xout = 1 - x)$y
  })
  
  # add contours if possible: approx() returns NA when a requested probability
  # falls outside of the estimated density range
  levels <- levels[!is.na(levels)]
  if(length(levels) > 0)
    contour(dens, levels=levels, drawlabels=FALSE, add=TRUE, col=this.col, ...)
  
  # # add bivariate medians
  # points(median(i$x), median(i$y), pch=3, lwd=2, col=this.col)
}


# masking function applied to a "wide" data.frame of sampled raster data
# function is applied column-wise
# note: using <= and >= for cases with very narrow distributions
mask.fun <- function(i) {
  res <- i >= quantile(i, prob=0.05, na.rm=TRUE) & i <= quantile(i, prob=0.95, na.rm=TRUE)
  return(res)
}
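
## example (not run): column-wise application to a "wide" data.frame of sampled raster values
## (hypothetical column names); rows within the central 90% of every variable are retained
# keep <- sapply(d.wide[, c('elev', 'slope', 'twi')], mask.fun)
# d.masked <- d.wide[which(apply(keep, 1, all)), ]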


# cut down to reasonable size: using cLHS
f.subset <- function(i, n, non.id.vars) {
  # if there are more than n records, then sub-sample
  if(nrow(i) > n) {
    # columns with IDs have been pre-filtered
    idx <- clhs(i[, non.id.vars], size=n, progress=FALSE, simple=TRUE, iter=1000)
    i.sub <- i[idx, ]
  }
  #	otherwise use what we have
  else
    i.sub <- i
  
  return(i.sub)
}
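
## example (not run): sub-sample each map unit to at most 1000 rows via cLHS, assuming `d.wide`
## (hypothetical name) is a wide data.frame of sampled values with ID columns in positions 1-2
# d.sub <- ddply(d.wide, '.id', f.subset, n = 1000, non.id.vars = names(d.wide)[-c(1:2)])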


# set multi-row figure based on number of groups and fixed number of columns
dynamicPar <- function(n, max.cols=3) {
  # simplest case, fewer than max number of allowed columns
  if(n <= max.cols) {
    n.rows <- 1
    n.cols <- n
  } else {
    
    # n is an exact multiple of max.cols: the grid is completely filled
    if(n %% max.cols == 0) {
      n.rows <- n / max.cols
      n.cols <- max.cols
    } else {
      # ragged: round up so that there are enough rows for all panels
      n.rows <- ceiling(n / max.cols)
      n.cols <- max.cols
    }
  }
  
  par(mar=c(0,0,0,0), mfrow=c(n.rows, n.cols))
  # invisibly return geometry
  invisible(c(n.rows, n.cols))
}
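
## example (not run): grid geometry for 5 groups with the default 3 columns
# geom <- dynamicPar(5)
# # geom is c(2, 3); subsequent base-graphics plots fill the 2 x 3 grid row-wise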

# stat summary function
f.summary <- function(i, p) {
  
  # remove NA
  v <- na.omit(i$value)
  
  # compute quantiles
  q <- quantile(v, probs=p)
  res <- data.frame(t(q))
  
  ## TODO: implement better MADM processing and explanation  
  if(nrow(res) > 0) {
    #     # MADM: MAD / median
    #     # take the natural log of absolute values of MADM
    #     res$log_abs_madm <- log(abs(mad(v) / median(v)))
    #     # 0's become -Inf: convert to 0
    #     res$log_abs_madm[which(is.infinite(res$log_abs_madm))] <- 0
    
    # assign reasonable names (quantiles)
    names(res) <- c(paste0('Q', p * 100))
    
    return(res)
  }
  else
    return(NULL)
}
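
## example (not run): per map unit / variable quantile summaries, assuming `d.continuous`
## (hypothetical name) holds long-format samples with '.id', 'variable', and 'value' columns
# q.summary <- ddply(d.continuous, c('.id', 'variable'), f.summary, p = c(0.05, 0.25, 0.5, 0.75, 0.95))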

# custom stats for box-whisker plot: 5th-25th-50th-75th-95th percentiles
# NOTE: we are re-purposing the coef argument!
# x: vector of values to summarize
# coef: Moran's I associated with the current raster
custom.bwplot <- function(x, coef=NA, do.out=FALSE) {
  # custom quantiles for bwplot
  stats <- quantile(x, p=c(0.05, 0.25, 0.5, 0.75, 0.95), na.rm = TRUE)
  # number of samples
  n <- length(na.omit(x))
  
  # compute effective sample size
  rho <- coef
  n_eff <- ESS_by_Moran_I(n, rho)
  
  # confidence "notch" is based on ESS
  iqr <- stats[4] - stats[2]
  conf <- stats[3] + c(-1.58, 1.58) * iqr/sqrt(n_eff)
  
  out.low <- x[which(x < stats[1])]
  out.high <- x[which(x > stats[5])]
  
  return(list(stats=stats, n=n, conf=conf, out=c(out.low, out.high)))
}
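
## example (not run): a sketch of use with lattice::bwplot(), where `moran.i` (hypothetical name)
## is the Moran's I estimate for the current raster, passed through the re-purposed `coef` argument
# bwplot(.id ~ value, data = d.continuous, stats = custom.bwplot, coef = moran.i)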


#####
## Functions for manipulating categorical variable set
# TODO: these are connotative but somewhat confusingly named... refactor eventually. all basically internal stuff so not a huge issue.
# Functions for use in dealing with arbitrary categoricals, categorical symbology and toggling of sections when new categoricals are added
variableNameFromPattern <-  function (pattern) {
  unique(d.cat$variable)[grep(pattern, unique(d.cat$variable), ignore.case = TRUE)]
}

variableNameToPattern <- function(name) {
  patterns = names(categorical.defs)
  for(p in patterns)
    if(grepl(p, name, ignore.case = TRUE)) return(p)
  return(NULL)
}
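
## example (not run): pattern names in `categorical.defs` are regular expressions matched
## (case-insensitively) against raster variable names; 'geomorphon' is a hypothetical pattern
# variableNameToPattern('Geomorphons_30m')   # returns 'geomorphon' if such a pattern is defined
# variableNameFromPattern('geomorphon')      # inverse lookup against unique(d.cat$variable)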

categoricalVariableHasDefinition <- function(variable.name) {
  # note: this checks for an exact match of the variable name against the pattern names in categorical.defs
  return(any(names(categorical.defs) %in% variable.name))
}

subsetByName <- function(name) {
  return(subset(d.cat, subset=(variable == name)))
}

subsetByPattern <- function(pattern) {
  subsetByName(variableNameFromPattern(pattern))
}

# i: data.frame with '.id' and 'value' columns
# drop.unused.levels: passed to xtabs(); when TRUE, unused factor levels are dropped from the result
# single.id: FALSE when summarizing all MUSYM, TRUE when specific to single MUSYM
sweepProportions <- function(i, drop.unused.levels=FALSE, single.id=FALSE) {
  # must drop missing .id factor levels when used with a single .id e.g. for spatial summaries
  if(single.id)
    i$.id <- factor(i$.id)
  
  # tabulate and convert to proportions, retain all levels of ID
  foo <- xtabs(~ .id + value, data=i, drop.unused.levels=drop.unused.levels)
  res <- sweep(foo, MARGIN = 1, STATS = rowSums(foo), FUN = '/')
  
  # 2017-12-11: 0-samples result in NA, convert those back to 0
  if(any(is.na(res)))
    res[is.na(res)] <- 0
  
  return(res)
}
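
## example (not run): worked example of converting tabulated samples to row-wise proportions
# d <- data.frame(
#   .id = rep(c('MU1', 'MU2'), each = 4),
#   value = factor(c('a', 'a', 'b', 'c', 'b', 'b', 'b', 'c'), levels = c('a', 'b', 'c'))
# )
# sweepProportions(d)
# # MU1: a = 0.50, b = 0.25, c = 0.25
# # MU2: a = 0.00, b = 0.75, c = 0.25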