.report.name <- 'fs-mu-comparison'
.report.version <- '1.0.0'
.report.description <- 'compare stack of raster data, sampled from polygons associated with 1-8 map units'
library(knitr, quietly=TRUE)

# need to do this in order to access general-purpose functions
library(soilReports, quietly=TRUE)

# package options
opts_knit$set(message=FALSE, warning=FALSE, verbose=FALSE, progress=FALSE)

# chunk options
# R session options
options(width=100, stringsAsFactors=FALSE)

### -----
### SANITY CHECK #1 - make sure user has run reportSetup recently :)
### TODO: verify they are not running an old report version that might rely on outdated packages?

### check if needed CRAN packages are installed
packz <- c("MASS", "rgdal", "rgeos", "raster", "plyr", "latticeExtra", "cluster", "clhs",
           "randomForest", "spdep", "reshape2", "aqp", "sharpshootR")
newpackz <- packz[!(packz %in% installed.packages()[, "Package"])]

# the last two (aqp and sharpshootR) are (in general) installed from GitHub
loaded <- lapply(packz, FUN=require, character.only=TRUE, quietly=TRUE)

if(sum(as.numeric(loaded)) != length(packz)) {
  stop("Failed to load one or more required packages! Be sure you have the latest version of the soilReports package from GitHub. Then run `soilReports::reportSetup('southwest/mu-comparison')` to install all required packages. Use `reportUpdate()` to ensure you have the latest version of the report.", call. = FALSE)
}

### -----
## load report-specific functions
source('custom.R')

## load local configuration
# TODO: allow entry of simple fields interactively if they are not specified?
# TODO: allow for batching from a basic report.Rmd
source('config.R')

### -----
### Sanity check #2 - make sure the shapefile to be summarized exists, can be read, and has the specified column

## load map unit polygons from OGR data source
mu <- try(readOGR(dsn=mu.dsn, layer=mu.layer, stringsAsFactors = FALSE))

if(class(mu) == 'try-error')
  stop(paste0('Cannot read map unit polygon/feature file: "', mu.dsn, ' / ', mu.layer, '"'), call. = FALSE)

if(!(mu.col %in% names(mu)))
  stop(paste0('Cannot find map unit column (', mu.col, ') in attribute table of: "', mu.dsn, ' / ', mu.layer, '"'), call. = FALSE)

### -----
### Sanity check #3 - make sure all raster/input file paths exist and are readable
### NB: this might not give correct results for parallel computation. Actual file read calls will also be wrapped in try()

# file.access() returns 0 for success and -1 for failure
accessible.inputs <- file.access(as.character(unlist(raster.list)), mode = 4) + 1

if(any(accessible.inputs == 0)) {
  unreadable.files <- names(which(accessible.inputs == 0))
  stop(paste0("The following input files either do not exist or are unreadable: \n", paste0(unreadable.files, collapse=", ")))
}

### -----
# just in case, coerce mu.col to character
mu[[mu.col]] <- as.character(mu[[mu.col]])

# TODO: this type of if(exists(obj)) needs to wrap all references to report parameters
if(exists('mu.set')) {
  # coerce mu.set to character in case integers were specified
  mu.set <- as.character(mu.set)

  # check to see if the mu.set specifies musyms that are not in the spatial layer
  mu.set.bak <- mu.set
  mu_nosp <- !(mu.set %in% mu[[mu.col]])

  # if any of the predefined musyms are absent from the spatial layer, remove them
  if(any(mu_nosp))
    mu.set <- mu.set[-which(mu_nosp)]
  # if we removed everything (e.g. no musyms match due to wrong config file?), fail gracefully
  if(length(mu.set) == 0)
    stop(paste0("Cannot find map unit polygons with symbol: ", paste0(mu.set.bak, collapse=", ")))

  # if mu.set defined in config.R, only keep the features with musym matching set
  mu <- mu[which(mu[[mu.col]] %in% mu.set), ]
} else {
  # if mu.set is not predefined, ordering is determined by the order of musyms in the source file
  mu.set <- sort(unique(mu[[mu.col]]))
}

### -----
### Sanity check #4 - make sure output paths are valid (no illegal characters)
###

# make an output directory if it doesn't exist
if(!dir.exists('output')) dir.create('./output')

# shapefile output names; NB: writeOGR uses 'output' as the data source name, so just the base file name
# note: concatenation of mu.set could result in illegal (too long) file names ~ 130 characters on windows
# https://github.com/ncss-tech/soilReports/issues/93
if(!exists('shp.unsampled.fname')) shp.unsampled.fname <- paste0('un-sampled-', paste(mu.set, collapse='_'))
if(!exists('shp.stats.fname')) shp.stats.fname <- paste0('polygons-with-stats-', paste(mu.set, collapse='_'))
if(!exists('shp.qc.fname')) shp.qc.fname <- paste0('poly-qc-', paste(mu.set, collapse='_'))
if(!exists('csv.qc.fname')) csv.qc.fname <- paste0('poly-qc-', paste(mu.set, collapse='_'), '.csv')
if(!exists('csv.stats.fname')) csv.stats.fname <- paste0('poly-stats-', paste(mu.set, collapse='_'), '.csv')

outputfiles <- c(paste0(c(shp.unsampled.fname, shp.stats.fname, shp.qc.fname), ".shp"), csv.qc.fname, csv.stats.fname)

if(any(grepl(basename(outputfiles), pattern='([/\\|<>:\\*?\"])'))) {
  stop("Map unit set or output file name contains invalid characters for filename. Either override default output file names in config.R or remove [/\\|<>:\\*?\"] characters from your map unit symbols/file names.", call. = FALSE)
}

if(any(nchar(basename(outputfiles)) > 100)) {
  stop("Auto-generated output file names are over 100 characters long. Please set output file names in config.R.", call. = FALSE)
}

### -----
# in order to avoid column name collisions due to DBF limitations, keep only the mu.col column
mu <- mu[, mu.col, drop=FALSE]

# add a unique polygon ID
mu$pID <- seq(from=1, to=length(mu))

if(cache.samples & file.exists('cached-samples.Rda')) {
  message('Using cached raster samples...')
  .sampling.time <- 'using cached samples'
  load('cached-samples.Rda')
} else {
  # suppressing warnings: these have to do with conversion of CRS of points
  # iterate over map units and sample rasters
  # result is a list
  .timer.start <- Sys.time()
  sampling.res <- suppressWarnings(sampleRasterStackByMU(mu, mu.set, mu.col, raster.list, pts.per.acre, estimateEffectiveSampleSize = correct.sample.size))
  .timer.stop <- Sys.time()
  .sampling.time <- format(difftime(.timer.stop, .timer.start, units='mins'), digits=2)

  print(paste0("Completed sampling of raster stack for poly symbols (", paste(mu.set, collapse=","), ") in ", .sampling.time, "."))

  sampling.res$raster.samples$.id <- factor(sampling.res$raster.samples$.id, levels=mu.set)

  # Nobody (other than AGB) cares about having this output :) -- sorry for filling up your folders with timestamped sample Rdata files.
  # if(archive_samples) {
  #   rda.sample.fname <- paste0("output/samples_", paste(mu.set, collapse="_"), "-", strftime(Sys.time(), format="%Y%m%d_%H%M%S"), ".Rda")
  #   save(sampling.res, file=rda.sample.fname)
  # }

  print("DONE!")

  if(cache.samples)
    save(mu, sampling.res, file = 'cached-samples.Rda')
}

## duplicate data, but assign new ID (for "ALL GROUPS")
d.all <- sampling.res$raster.samples
d.all$.id <- "All Groups"
sampling.res$raster.samples <- rbind(d.all, sampling.res$raster.samples)

## subset raster samples by data type: continuous, categorical, circular
d.continuous <- subset(sampling.res$raster.samples, variable.type == 'continuous')
d.continuous$variable <- factor(d.continuous$variable, levels=names(raster.list$continuous))

d.circ <- subset(sampling.res$raster.samples, variable.type == 'circular')

# subset and enable aspect summary
if(nrow(d.circ) > 0) {
  ## TODO: could there ever be more than one type of circular variable?
  do.aspect <- TRUE
} else do.aspect <- FALSE

### SUBSET CATEGORICAL VARIABLES
d.cat <- subset(sampling.res$raster.samples, variable.type == 'categorical')
d.cat$.id <- factor(d.cat$.id, levels=c('All Groups', mu.set))
d.cat.sub <- list()
categorical.defs <- list()

# define the aliases and label:value:color sets for displaying categorical variables
# load the structures that define value:label:color for categorical plots
# updates the "section_toggle" variable defined at top of report, which will knit the appropriate plots for the specified categoricals from config.R
source("categorical_definitions.R")

### FORMATTING
## figure out reasonable figure heights for bwplots and density plots

# baseline height for figure margins, axes, and legend
min.height <- 2
# height required for each panel
panel.height <- 2 * length(levels(d.continuous$variable))
# extra height for each ID
id.height <- 0.25 * length(levels(d.continuous$.id))
dynamic.fig.height <- min.height + panel.height + id.height

# nice colors
cols <- makeNiceColors(length(mu.set) + 1)

# TODO: Null out raster samples to save memory TOGGLE
## http://adv-r.had.co.nz/memory.html#memory
# sampling.res$raster.samples <- NULL

# TODO / DEBUGGING: report on invalid geometries that could be interfering with sampling
# * https://github.com/ncss-tech/sharpshootR/commit/9ada343fea46d18f7c2e10c12a7dc1cbfb71679f
#
# results in:
# sampling.res$mu.validity.check
Map units (`r mu.col`): `r paste(mu.set, collapse = ", ")`

report version `r .report.version`

`r format(Sys.time(), "%Y-%m-%d")`
This report is designed to provide statistical summaries of the environmental properties for one or more map units. Summaries are based on raster data extracted from fixed-density sampling of map unit polygons.
Percentiles are used as robust metrics of distribution central tendency and spread.
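As a quick illustration (not part of the report code), `quantile()` shows why percentiles resist extreme values better than the mean:

```r
# illustrative only: a sample with one extreme value
x <- c(rnorm(100, mean = 15, sd = 2), 300)

mean(x)                                               # pulled upward by the single outlier
quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))   # barely affected
```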
Please see the document titled *R-Based Map Unit Summary Report Introduction and Description* for background and setup.
fd <- data.frame(`MU Polygons`=mu.dsn, `File or Feature`=mu.layer)
kable(fd, row.names = FALSE)
kable(sampling.res$raster.summary, row.names = FALSE, digits = 3)
Target sampling density: `r pts.per.acre` points/ac. (defined in `config.R`). Consider increasing it if there are unsampled polygons or if the number of samples is less than about 200. Note that the mean sampling density (per polygon) will always be slightly lower than the target sampling density, depending on polygon shape.
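For a rough sense of what this setting implies (a sketch only; the report's own sampling is done by `sampleRasterStackByMU()`), the expected number of samples in a polygon is its acreage multiplied by the target density:

```r
# assumes `mu` (projected CRS, in meters) and `pts.per.acre` exist as defined above
library(sp)
library(raster)

poly.acres <- area(mu[1, ]) / 4046.86            # polygon area: m^2 -> acres
n.expected <- round(poly.acres * pts.per.acre)   # expected sample count for this polygon

# fixed-density sampling of a single polygon, similar in spirit to the report
if (n.expected > 0) {
  s <- spsample(mu[1, ], n = n.expected, type = "regular")
}
```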
kable(sampling.res$area.stats, caption='Map Unit Acreage by Polygon', align = 'r', col.names=c(mu.col, names(sampling.res$area.stats)[-1]))
Whiskers extend from the 5th to 95th percentiles, the body represents the 25th through 75th percentiles, and the dot is the 50th percentile. Box width is proportional to the number of samples per map unit (i.e., total map unit area).
Notches (if enabled) represent an approximate confidence interval around the median, adjusted for spatial autocorrelation. Overlapping notches suggest that median values are not significantly different. This feature can be enabled by setting `correct.sample.size=TRUE` in `config.R`.
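The exact adjustment is handled inside the sampling and plotting helpers; as a hedged illustration (an assumed formula, not necessarily the one used here), a common first-order correction shrinks the nominal sample size using a spatial autocorrelation estimate such as Moran's I:

```r
# illustrative values only
n   <- 1000        # nominal number of raster samples within one map unit
rho <- 0.6         # Moran's I for the raster variable

# effective (roughly independent) sample size under positive autocorrelation
n.eff <- n * (1 - rho) / (1 + rho)
n.eff  # ~250, so notch width reflects far fewer "independent" samples
```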
Suggested usage:
tps <- list(box.rectangle=list(col='black'), box.umbrella=list(col='black', lty=1), box.dot=list(cex=0.75), plot.symbol=list(col=rgb(0.1, 0.1, 0.1, alpha = 0.25, maxColorValue = 1), cex=0.25))

# NOTE: notches rely on effective sampling size
bwplot(.id ~ value | variable, data=d.continuous,
       scales=list(y=list(alternating=1), x=list(relation='free', tick.number=10)),
       as.table=TRUE, col='black', strip=strip.custom(bg=grey(0.85)), xlab='',
       par.settings=tps, subscripts=TRUE,
       layout=c(1, length(levels(d.continuous$variable))), varwidth=TRUE,
       panel=function(x, subscripts=subscripts, ...) {
         # extract the current raster name
         this.raster <- as.character(unique(d.continuous$variable[subscripts]))
         # get associated Moran's I
         idx <- which(sampling.res$Moran_I$Variable == this.raster)
         this.Moran.I <- sampling.res$Moran_I$Moran.I[idx]
         # make a grid
         panel.grid(h=0, v=-1, col='grey', lty=3)
         panel.abline(h=1:length(unique(d.continuous$.id)), col='grey', lty=3)
         # boxplots with custom sampling size:
         # coef: Moran's I associated with this raster
         panel.bwplot(x, stats=custom.bwplot, notch=correct.sample.size, coef=this.Moran.I, ...)
       })
These plots are a smooth alternative (density estimation) to the classic "binned" (histogram) approach to visualizing distributions. Peaks correspond to values that are most frequent within a data set. Each data set (ID / variable) is rescaled to {0,1} so that the y-axis can be interpreted as the "relative proportion of samples".
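The rescaling is performed by the report's `scaled.density()` helper (defined in `custom.R`); a minimal sketch of the idea, which may differ from the actual implementation:

```r
# rescale a kernel density estimate so its peak equals 1
scaled_density_sketch <- function(x, constantScaling = TRUE) {
  d <- stats::density(x, na.rm = TRUE)
  if (constantScaling)
    d$y <- d$y / max(d$y)
  data.frame(x = d$x, y = d$y)
}

# applied per .id/variable group, as the report does with ddply()
```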
Suggested usage:
## TODO: consider variable line width proportional to area
## https://github.com/ncss-tech/soilReports/issues/81
# var.lwd <- scales::rescale(log(sampling.res$area.stats$`Total Area`), to=c(0.5, 2.5))

tps <- list(superpose.line=list(col=cols, lwd=2, lend=2))

# dynamic setting of columns in legend
n.cols <- ifelse(length(mu.set) <= 4, length(mu.set), 5)

# scaling of density curves
# just in case this isn't in the config file, set a default value here
if( ! exists('scaleDensityCurves'))
  scaleDensityCurves <- TRUE

# compute densities and optionally re-scale to {0,1}
density.plot.data <- ddply(d.continuous, c('.id', 'variable'), scaled.density, constantScaling=scaleDensityCurves)

xyplot(y ~ x | variable, groups=.id, data=density.plot.data,
       xlab='', ylab='Relative Proportion',
       scales=list(relation='free', x=list(tick.number=10), y=list(at=NULL)),
       plot.points=FALSE, strip=strip.custom(bg=grey(0.85)), as.table=TRUE,
       layout=c(1, length(levels(d.continuous$variable))),
       auto.key=list(lines=TRUE, points=FALSE, columns=n.cols, cex=0.8),
       par.settings=tps, type=c('l','g'))

rm(density.plot.data)
Table of select percentiles, by variable. In these tables, headings like "Q5" can be interpreted as the "5th percentile"; 5% of the data are less than this value. The 50th percentile ("Q50") is the median.
# summarize raster data for tabular output
mu.stats <- ddply(d.continuous, c('variable', '.id'), f.summary, p=p.quantiles)

# print medians
dg <- c(0, rep(2, times=length(unique(mu.stats$variable))))
mu.stats.wide <- dcast(mu.stats, .id ~ variable, value.var = 'Q50')
kable(mu.stats.wide, row.names=FALSE, caption = 'Median Values', align = 'r', digits=dg, col.names=c(mu.col, names(mu.stats.wide)[-1]))
# iterate over variables and print smaller tables
# note: https://github.com/yihui/knitr/issues/886
l_ply(split(mu.stats, mu.stats$variable), function(i) {
  # remove variable column
  var.name <- unique(i$variable)
  i$variable <- NULL
  dg <- c(0, rep(2, times=length(p.quantiles)), 3)
  print(kable(i, caption = var.name, row.names=FALSE, align = 'r', digits=dg, col.names=c(mu.col, names(i)[-1])))
})
A graphical summary of slope aspect values using density and percentile estimation methods adapted to circular data. Spread and central tendency are depicted with a combination of (circular) kernel density estimate (dashed blue lines) and arrows.
The 50th percentile value is shown with a red arrow and the 10th and 90th percentile values are shown with gray arrows. Arrow length is proportional to the strength of directionality. Use the figures and table below to determine "clockwise" / "counter clockwise" values for NASIS component records.
Suggested usage:
## circular stats, by map unit
d.circ.list <- split(d.circ, d.circ$.id)

# this has to be called 2x, as we are adjusting the device settings on the fly
fig.geom <- dynamicPar(length(d.circ.list))

# update default device output size
opts_chunk$set(fig.height=fig.geom[1] * 5) # rows
opts_chunk$set(fig.width=fig.geom[2] * 5)  # cols
# reset multi-figure plotting parameters
dynamicPar(length(d.circ.list))

res <- ldply(d.circ.list, function(i) {
  mu <- unique(i$.id)
  circ.stats <- aspect.plot(i$value, q=c(0.1, 0.5, 0.9), plot.title=mu, pch=NA, bg='RoyalBlue', col='black', arrow.col=c('grey', 'red', 'grey'), stack=FALSE, p.bw=90)
  return(round(circ.stats))
})

# tabular summary
kable(res, align = 'r', col.names=c(mu.col, names(res)[-1]))
# make categorical summaries for all categorical variables
d.cat.list <- split(d.cat, f=d.cat$variable)
l <- lapply(d.cat.list, FUN=makeCategoricalOutput)
## TODO:
# 1. combine median of continuous, geomorphons proportions, and NLCD proportions for dendrogram

# cast to wide format
d.mu.wide <- dcast(d.continuous, sid + pID + .id ~ variable, value.var = 'value')

# drop rows with NA
d.mu.wide <- na.omit(d.mu.wide)

# locate "non-id" vars that have non-zero SD
# these are safe for cLHS and distance calc
# solution for https://github.com/ncss-tech/soilReports/issues/87
d.mu.wide.vars <- findSafeVars(d.mu.wide, id = c('.id', 'sid', 'pID'))

# must have > 1 variables to perform multivariate summary
if(length(d.mu.wide.vars) < 2) {
  multivariate.summary <- FALSE
} else {
  multivariate.summary <- TRUE

  ## DEB 2019-05-15: the following check for low SD variables is too strict
  ## keeping it around just in case, slated for removal
  # https://github.com/ncss-tech/soilReports/issues/87
  #
  # # check SD of each column, by group
  # sd.by.id <- ddply(d.mu.wide, '.id', function(i) {sapply(i[, d.mu.wide.vars, drop=FALSE], sd, na.rm=TRUE)})
  # sd.by.id$res <- apply(sd.by.id[, -1], 1, function(i) any(i < 1e-5))
  #
  # # if the SD is low in any column from all MU then stop
  # if(all(sd.by.id$res)) {
  #   multivariate.summary <- FALSE
  # } else {
  #   # OK to run MV summaries
  #   multivariate.summary <- TRUE
  #
  #   ## TODO: this may be too strict, remove variables first
  #   ## https://github.com/ncss-tech/soilReports/issues/87
  #
  #   # filter out low-variance MU
  #   if(any(sd.by.id$res)) {
  #     ids.to.keep <- sd.by.id$.id[which(!sd.by.id$res)]
  #     d.mu.wide <- d.mu.wide[which(d.mu.wide$.id %in% ids.to.keep), ]
  #
  #     # reset mu.set accordingly
  #     idx.to.keep <- which(! mu.set %in% setdiff(mu.set, ids.to.keep))
  #     mu.set <- mu.set[idx.to.keep]
  #   }

  ## TODO: what is a reasonable sample size?
  # only sub-sample if there are "a lot" of samples
  if(nrow(d.mu.wide) > 1000) {
    # sub-sample via LHS: this takes time
    # first three columns are IDs
    # n: this is the number of sub-samples / map unit
    # non.id.vars: this is an index to non-ID columns
    d.sub <- ddply(d.mu.wide, '.id', cLHS_subset, n=50, non.id.vars=d.mu.wide.vars)
  } else {
    d.sub <- d.mu.wide
  }

  ## NOTE: data with very low variability will cause warnings
  # eval numerical distance, removing 'sid' and '.id' columns
  d.dist <- daisy(d.sub[, d.mu.wide.vars], stand=TRUE)

  ## map distance matrix to 2D space via principal coordinates
  d.betadisper <- vegan::betadisper(d.dist, group=d.sub$.id, bias.adjust = TRUE, sqrt.dist = TRUE, type='median')
  d.scores <- vegan::scores(d.betadisper)

  # contour density estimates
  # add contours for fixed pct of data density using KDE
  # other ideas: https://stat.ethz.ch/pipermail/r-help/2012-March/305425.html
  s <- data.frame(x=d.scores$sites[, 1], y=d.scores$sites[, 2], .id=d.sub$.id)
  s <- split(s, s$.id)

  # plot
  par(mar=c(1,1,3,1))
  plot(d.scores$sites, type='n', axes=FALSE)
  abline(h=0, v=0, lty=2, col='grey')

  # NOTE: lines are not added if data are too densely spaced for evaluation of requested prob. level
  # add contours of prob density
  res <- lapply(s, kdeContours, id='.id', prob=c(0.75), cols=cols, m=levels(d.sub$.id), lwd=1, lty=3)
  res <- lapply(s, kdeContours, id='.id', prob=c(0.5), cols=cols, m=levels(d.sub$.id), lwd=1, lty=2)
  res <- lapply(s, kdeContours, id='.id', prob=c(0.25), cols=cols, m=levels(d.sub$.id), lwd=2, lty=1)

  points(d.scores$sites, cex=0.45, col=cols[as.numeric(d.sub$.id)], pch=16)

  # note special indexing for cases when low-var MU have been removed
  vegan::ordilabel(d.betadisper, display='centroids', col=cols[match(c('All Groups', mu.set), levels(d.sub$.id))])
  title('Ordination of Raster Samples (cLHS Subset) with 25%, 50%, 75% Density Contours')
  box()
}
# if(multivariate.summary == FALSE)
#   print('Cannot create ordination plot: >1 rasters required or not enough variance within each map unit.')
The following figure highlights shared information among raster data sources based on Spearman's rank correlation coefficient.
Branch height is associated with the degree of shared information between raster data sources.
Suggested usage:
par(mar=c(2,5,2,2))

## note that we don't load the Hmisc package as it causes many NAMESPACE conflicts
## this requires 3 or more variables
if(length(d.mu.wide.vars) > 3) {
  try(plot(Hmisc::varclus(as.matrix(d.sub[, d.mu.wide.vars]))), silent=TRUE)
} else print('This plot requires three or more raster variables, apart from aspect, curvature class, and geomorphons.')
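For reference, the pairwise relationships driving the clustering can also be inspected directly as a Spearman rank correlation matrix (a sketch reusing the `d.sub` and `d.mu.wide.vars` objects created earlier; `Hmisc::varclus()` applies its own similarity measure, so treat this only as a companion view):

```r
# Spearman rank correlations among the sampled raster variables
rho <- cor(d.sub[, d.mu.wide.vars], method = "spearman", use = "pairwise.complete.obs")
round(rho, 2)
```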
The following figure ranks raster data sources in terms of how accurately each can be used to discriminate between map unit concepts.
Suggested usage:
# this will only work with >= 2 map units and >= 2 variables
# reset factor levels so that empty classes are not passed on to randomForest() (will cause error if empty/low-variance MUs are present)
d.sub$.id <- factor(d.sub$.id)

if(length(levels(d.sub$.id)) >= 2) {
  # use supervised classification to empirically determine the relative importance of each raster layer
  # TODO: include geomorphons and curvature classes
  # TODO: consider using party::cforest() for conditional variable importance -- varimp
  m <- randomForest(x=d.sub[, d.mu.wide.vars], y=d.sub$.id, importance = TRUE)

  # variable importance
  # TODO: how to interpret raw output from importance:
  # http://stats.stackexchange.com/questions/164569/interpreting-output-of-importance-of-a-random-forest-object-in-r/164585#164585
  varImpPlot(m, scale=TRUE, type=1, main='Mean Decrease in Accuracy')

  # kable(importance(m, scale=FALSE, type=2), digits = 3)

  # ## this adds several seconds to processing time
  # # predict using samples from each polygon, to get proportions of each MU
  # d.mu.wide <- na.omit(d.mu.wide)
  # d.mu.wide$.predMU <- as.character(predict(m, d.mu.wide))
  #
  # ## TODO: add number of samples / sid
  # # compute proportion of each class by sid
  # pred.by.sid <- ddply(d.mu.wide, 'sid', .fun=function(i) {
  #   # explicit setting of levels results in exact output each iteration
  #   prop.table(table(factor(i$.predMU, levels=mu.set)))
  # })

  ## TODO
  # compute Shannon Entropy at each sid and fix names (useful?)
  ## TODO: join with output SHP via sid

} else {
  # print message about not enough map units
  print('This plot requires two or more map units.')
}

## re-make example tables below
# kable(head(x[, c(1,2, 14:22)], 4), row.names = FALSE, digits = 2)
# kable(head(x[, c(1,2, 23:34)], 4), row.names = FALSE, digits = 2)
# kable(head(x[, c(1,2, 35)], 3), row.names = FALSE, digits = 2)
A shapefile is generated each time a report is run ("polygons-with-stats-XXX", where "XXX" is the set of map unit symbols listed in `config.R`) that contains several useful summaries, computed by polygon. Polygons are uniquely identified by the `pID` column. Median raster values are given, with data source names abbreviated to conform to the limitations of DBF files:
| pID|MUSYM | EstmtMASTC| AnnlBmRdMJ| EffctvPrcp| Elevationm| FrostFrDys| GrwngDgrDC| MnAnnlArTC| MnAnnlPrcp| SlopeGrdnt|
|---:|:-----|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|
| 1|7011 | 17.75| 67118.27| -413.72| 139.0| 319| 2656| 16.58| 443| 5|
| 2|7011 | 16.66| 67126.19| -231.08| 298.0| 293| 2583| 16.20| 612| 2|
| 3|7089 | 15.46| 56270.39| -84.02| 321.5| 297| 2557| 16.17| 747| 34|
| 4|7011 | 17.02| 66833.37| -270.86| 242.0| 306| 2629| 16.41| 588| 2|
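Name shortening is done by the report's `abbreviateNames()` helper; for illustration only, base R offers similar behavior (the column names below are hypothetical):

```r
nm <- c("Elevation_m", "MeanAnnualAirTemp_C", "SlopeGradient")
abbreviate(nm, minlength = 10)   # heuristic shortening toward the 10-character DBF field name limit
substr(nm, 1, 10)                # hard truncation to 10 characters, for comparison
```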
There are several columns containing the proportions of each landform element (geomorphons algorithm), the most likely ("ml_landfrm") landform element, and the Shannon entropy associated with landform proportions. The Shannon entropy value can be used to judge the relative "landform purity" of a delineation: smaller values are associated with more homogeneous delineations. Equal proportions of all landform elements (within a polygon) would result in a Shannon entropy value of 1.
| pID|MUSYM | flat| summit| ridge| shoulder| spur| slope| hollow| footslope| valley| depression|ml_landfrm | shannon_h|
|---:|:-----|----:|------:|-----:|--------:|----:|-----:|------:|---------:|------:|----------:|:----------|---------:|
| 1|7011 | 0.00| 0.01| 0.03| 0.00| 0.05| 0.12| 0.19| 0.04| 0.52| 0.03|valley | 0.44|
| 2|7011 | 0.34| 0.00| 0.02| 0.02| 0.05| 0.15| 0.04| 0.20| 0.17| 0.00|flat | 0.62|
| 3|7089 | 0.00| 0.04| 0.08| 0.00| 0.32| 0.45| 0.08| 0.00| 0.02| 0.02|slope | 0.54|
| 4|7011 | 0.00| 0.00| 0.00| 0.00| 0.00| 0.00| 0.08| 0.08| 0.85| 0.00|valley | 0.17|
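A hedged sketch of the entropy calculation described above (the report's implementation lives in its helper functions and may differ in detail); using a logarithm base equal to the number of landform classes makes equal proportions come out to exactly 1:

```r
shannon_h <- function(p, b = length(p)) {
  force(b)            # keep the original number of classes before dropping zeros
  p <- p[p > 0]       # treat 0 * log(0) as 0
  -sum(p * log(p, base = b))
}

shannon_h(rep(1/10, 10))                      # equal proportions over 10 classes -> 1
shannon_h(c(0.85, 0.08, 0.07, rep(0, 7)))     # one dominant landform -> much closer to 0
```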
In the case of un-sampled polygons (very small delineations or too low sampling density), an additional shapefile will be saved in the output folder with a prefix of "un-sampled-". This file contains those polygons that were not allocated any sampling points and thus not included in the report summaries.
A shapefile is generated each time a report is run ("poly-qc-XXX", where "XXX" is the set of map unit symbols listed in `config.R`) that contains the proportion of samples outside the 5-95% percentile range. In the attribute table there is one column per raster data source and one row per map unit delineation.

The 5-95% percentile range for each map unit is derived from the samples across all polygons with the corresponding map unit symbol. Proportions of samples outside the range within individual polygons are given for each (continuous) raster data source. Data source names are abbreviated to conform to the limitations of DBF files. Polygons are uniquely identified by the `pID` column.
Assuming one has sufficient polygons and samples to characterize the data distribution, and that the data are roughly normally distributed, one would expect that 10% of samples across the extent of a particular map unit will fall outside the 5-95% percentile range. Individual delineations that have more than 10-15% of samples outside the range for one or more raster data sources may need to be investigated to see that they fit the map unit concept. It is expected that some delineations will occur at the margins of the map unit extent and therefore may have higher proportions outside the range. Expert judgement is required to determine whether action should be taken to resolve any potentially problematic delineations.
| pID|MUSYM | EffctvPrcp| Elevationm| FrostFrDys| GrwngDgrDC| MnAnnlArTC| MnAnnlPrcp| SlopeGrdnt|
|---:|:-----|----------:|----------:|----------:|----------:|----------:|----------:|----------:|
| 1|7089 | 0.00| 0| 0.05| 0.03| 0.05| 0.00| 0.00|
| 2|7089 | 0.00| 0| 0.01| 0.00| 0.00| 0.00| 0.00|
| 3|7011 | 0.05| 0| 0.05| 0.01| 0.01| 0.04| 0.03|
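A hedged sketch of this metric (the report's `flagPolygons()` helper in `custom.R` performs the actual calculation, which may differ): the map unit-wide 5th and 95th percentiles are computed once, and each polygon is then scored by the fraction of its samples falling outside that interval.

```r
# fraction of samples outside a [5th, 95th] percentile interval
prop_outside_range <- function(x, r) {
  mean(x < r[1] | x > r[2], na.rm = TRUE)
}

# illustrative use for a single map unit / variable combination, assuming a
# data frame `z` with columns `value` and `pID` (names are hypothetical):
# r <- quantile(z$value, probs = c(0.05, 0.95), na.rm = TRUE)
# tapply(z$value, z$pID, prop_outside_range, r = r)
```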
# apply across raster values, to all polygons
# calculates prop outside range for _all_ polys
polygons.to.check <- ddply(d.continuous, c('.id', 'variable'), flagPolygons) #, p.crit = 0) # this line retains original p.crit behavior for the tabular output in the report

## display.idx <- which(polygons.to.check$prop.outside.range > 0.15)

poly.check.wide <- dcast(polygons.to.check, pID ~ variable, value.var = 'prop.outside.range')

# replace NAs with zero (no samples outside 5-95% percentile range)
poly.check.wide[is.na(poly.check.wide)] <- 0

mu.check <- merge(mu, poly.check.wide, by='pID', all.x=TRUE)
names(mu.check)[-1] <- abbreviateNames(mu.check)

# fix names for printing
names(polygons.to.check)[1] <- mu.col

# print table (removed from report now that shapefile with proportions outside range is generated)
# kable(polygons.to.check[display.idx,], row.names = FALSE) # only shows polys with p.crit > 0.15 in report tabular output

# save a SHP file with prop.outside.range for each polygon and raster data source combination
if(nrow(polygons.to.check) > 0) {
  try(writeOGR(mu.check, dsn='output', layer=shp.qc.fname, driver='ESRI Shapefile', overwrite_layer=TRUE))
  try(write.csv(mu.check, paste0('output/', csv.qc.fname)))
}

## generates sample prop.outside.range table for documentation purposes
# kable(head(mu.check[,c(1,2,14:20)],3), row.names = FALSE, digits = 2)
# save SHP with any un-sampled polygons
if(length(sampling.res$unsampled.ids) > 0) {
  try(writeOGR(mu[sampling.res$unsampled.ids, ], dsn='output', layer=shp.unsampled.fname, driver='ESRI Shapefile', overwrite_layer=TRUE))
}

# compute summaries
poly.stats <- ddply(d.continuous, c('pID', 'variable'), f.summary, p=p.quantiles)

# convert to wide format, keeping median value
poly.stats.wide <- dcast(poly.stats, pID ~ variable, value.var = 'Q50')

# # convert to wide format, keeping log_abs_madm
# poly.stats.wide.2 <- dcast(poly.stats, pID ~ variable, value.var = 'log_abs_madm')

# add a suffix to variable names so that we can combine
# names(poly.stats.wide.1)[-1] <- paste0(names(poly.stats.wide.1)[-1], '_med')
# names(poly.stats.wide.2)[-1] <- paste0(names(poly.stats.wide.2)[-1], '_var')

## TODO: pending further review
# join median + MADM stats for each polygon
# poly.stats.wide <- join(poly.stats.wide.1, poly.stats.wide.2, by='pID')
# poly.stats.wide <- poly.stats.wide.1

# save to CSV
# sanity tests at head of report should prevent output issues
try(write.csv(poly.stats.wide, file=paste0('output/', csv.stats.fname), row.names=FALSE))

## join stats to map unit polygon attribute table
mu <- merge(mu, poly.stats.wide, by='pID', all.x=TRUE)
names(mu)[-1] <- abbreviateNames(mu)

# remove internally-used MU ID
mu$.id <- NULL

# save to SHP
# sanity tests at head of report should prevent output issues
try(writeOGR(mu, dsn='output', layer=shp.stats.fname, driver='ESRI Shapefile', overwrite_layer=TRUE))

## TODO: how do you trap warnings within a .Rmd knitting session?
# save warnings to log file
# cat(warnings(), file = 'output/warning-log.txt')
This document is based on `sharpshootR` version `r utils::packageDescription("sharpshootR", field="Version")`.
Report configuration and source code are hosted on GitHub.
Sampling time: `r .sampling.time`